Python script recovers the genbank ids for all the nucleotide entries linked to a taxon id. | all4bioinformatics
Breaking News
Loading...

Thursday, 30 May 2013

Python script recovers the genbank ids for all the nucleotide entries linked to a taxon id.


This Python script retrieves the GenBank IDs of all the nucleotide entries linked to a taxon ID. The number of requests is minimized by paging through the results with the retmax and retstart parameters provided by the Entrez Utilities.
taxid_2_gbids.py
Python

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465

#!/usr/bin/env python


 


import xml.etree.ElementTree as ET


import sys, urllib, urllib2


 


# Base URL that every E-utilities request in this script is built on.
# NCBI serves the E-utilities over HTTPS, so the secure scheme is used here.
eutils_base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"


 


def get_ids(taxid):
    """Return the captions of all nucleotide records linked to a taxon id.

    The nucleotide database is searched with the term
    ``txid<taxid>[Organism:exp]`` one page of 10000 UIDs at a time
    (``retstart``/``retmax`` paging).  Each page of UIDs is then sent to
    ESummary and the text of every ``Caption`` item of every ``DocSum``
    is collected.

    Args:
        taxid: Taxon id interpolated into the search term with ``%s``.

    Returns:
        list: The collected ``Caption`` strings, in the order received.

    Notes:
        * Paging stops as soon as a page yields no UIDs (missing *or*
          empty ``IdList``), or when an ESearch response cannot be parsed.
        * A failure while fetching/parsing the summaries of one page is
          printed and that page is skipped (best effort); paging goes on.
    """
    accession_numbers = []
    retstart = 0
    iteration_step = 10000
    term = "txid%s[Organism:exp]" % taxid

    while True:
        page = esearch(db="nucleotide", term=term,
                       retstart=retstart, retmax=iteration_step)
        try:
            id_list = ET.fromstring(page).find('IdList')
        except ET.ParseError as e:
            # Without a parsed page there is no way to know whether more
            # results remain; stop instead of requesting pages forever.
            print(e)
            break

        if id_list is None:
            ids = []
        else:
            ids = [node.text for node in id_list.findall('Id')]

        # An empty page means we have walked past the last result.
        if not ids:
            break

        try:
            summary = ET.fromstring(
                esummary(db="nucleotide", ids=ids, retmax=iteration_step))
        except Exception as e:
            # Deliberate best effort: report the problem and skip this page.
            print(e)
        else:
            for docsum in summary.findall('DocSum'):
                for item in docsum.findall("Item[@Name='Caption']"):
                    accession_numbers.append(item.text)

        retstart += iteration_step

    return accession_numbers


 


def esearch(db, term, retstart=0, retmax=20):
    """Run an ESearch request and return the raw response body.

    Args:
        db: Name of the Entrez database to search.
        term: Search term; it is URL-encoded before being sent.
        retstart: Index of the first result to return (formatted with ``%i``).
        retmax: Maximum number of results to return (formatted with ``%i``).

    Returns:
        str: The body of the response from ``esearch.fcgi``.
    """
    # urlencode escapes characters such as spaces, brackets and colons that
    # are common in Entrez terms and are not safe in a raw query string.
    query = urllib.urlencode([
        ('db', db),
        ('term', term),
        ('retstart', "%i" % retstart),
        ('retmax', "%i" % retmax),
    ])
    response = urllib.urlopen("%sesearch.fcgi?%s" % (eutils_base_url, query))
    try:
        return str(response.read())
    finally:
        # Release the connection even when reading fails.
        response.close()


 


def esummary(db, ids, retstart=0, retmax=20):
    """POST a list of UIDs to ESummary and return the raw response body.

    Args:
        db: Name of the Entrez database the UIDs belong to.
        ids: Sequence of UID strings; sent as one comma-separated ``id`` field.
        retstart: Accepted for interface compatibility; not sent to the server.
        retmax: Accepted for interface compatibility; not sent to the server.

    Returns:
        str: The body of the response from ``esummary.fcgi``.
    """
    payload = urllib.urlencode({
        'db': db,
        'id': ','.join(ids),
    })
    # Passing a data argument makes urllib2 issue a POST, which keeps long
    # id lists out of the URL.
    req = urllib2.Request("%sesummary.fcgi" % eutils_base_url, payload)
    response = urllib2.urlopen(req)
    try:
        return str(response.read())
    finally:
        # Release the connection even when reading fails.
        response.close()


 


if __name__ == '__main__':
    # Command-line entry point: expects "-id <taxid>" among the arguments.
    taxid = None

    if "-id" in sys.argv:
        value_pos = sys.argv.index("-id") + 1
        # "-id" given as the very last argument carries no value; leave
        # taxid unset so the usage text is shown instead of an IndexError.
        if value_pos < len(sys.argv):
            taxid = sys.argv[value_pos]

    if not taxid:
        print("Usage: taxid_2_gbids.py -id taxid")
        print("Example: taxid_2_gbids.py -id 4754")
        sys.exit(-1)

    ids = get_ids(taxid)
    print(ids)
    print("%i ids found..." % len(ids))

google+

linkedin

About Author
  • Donec sed odio dui. Duis mollis, est non commodo luctus, nisi erat porttitor ligula, eget lacinia odio sem nec elit. Sed posuere consecteturDonec sed odio dui. Duis mollis, est non commodo luctus, nisi erat porttitor ligula, eget lacinia odio sem nec elit. Read More

    0 comments:

    POST A COMMENT

     

    Gallery

    About

    About Us