#! /usr/bin/python
# Interactive command-line client that searches NCBI PubMed through the
# Entrez E-utilities (esearch + efetch) and saves the matching records as
# XML in a UTF-8 encoded file.
#
# The code is written to run unchanged on Python 2.6+ and on Python 3.
import codecs
import re
import sys
import urllib
from time import sleep

try:  # Python 3 location of the URL helpers
    from urllib.parse import urlencode
    from urllib.request import urlopen
except ImportError:  # Python 2 keeps both directly in urllib
    from urllib import urlencode, urlopen

try:  # Python 2: raw_input() returns the typed line without evaluating it
    _read_line = raw_input
except NameError:  # Python 3: input() has that behaviour
    _read_line = input

# Names of the functions that should print diagnostics,
# e.g. ['analyzeSearchResult'].
_verbose = ['']

# Horizontal rule used between the interactive prompts.
_SEPARATOR = 68 * "-"

# Number of records requested from efetch per HTTP request.
_BATCH_SIZE = 200

# Number of leading lines of the esearch answer that are scanned for the
# hit count, the query key and the WebEnv string.
_HEADER_LINES = 10

# Elements of the esearch XML answer that are needed to fetch the records.
_COUNT_RE = re.compile(r"<Count>(?P<nr>[^<>]*)</Count>")
_QUERY_KEY_RE = re.compile(r"<QueryKey>(?P<qk>.*)</QueryKey>")
_WEB_ENV_RE = re.compile(r"<WebEnv>(?P<we>.*)</WebEnv>")

_usage = """
pubmed.py outfile

Fetch articles in xml format from ncbi pubmed and saves them.
Respect ncbi resources !
Uses utf-8 encoding

$Id: pubmed.py,v 1.1.1.1 2003/03/03 15:46:33 wresch Exp $
"""

_termPrompt = """
--------------------------------------------------------------------
ENTER SEARCH TERMS IN NCBI PUBMED SYNTAX.
Summary of some fields (incomplete):
Affiliation [AD]              All Fields [ALL]
Author [AU]                   Issue [IP]
Journal Title [TA]            Language [LA]
MeSH Major Topic [MAJR]       MeSH Subheadings [SH]
MeSH Terms [MH]               Pagination [PG]
Publication Date [DP]         YYYY/MM/DD [dp]; month and day are optional
Publication Type [PT]         Substance Name [NM]
Text Words [TW]               Title [TI]
Title/Abstract [TIAB]         Unique Identifiers [UID]
Volume [VI]
Whitespace is ignored; upper/lower case are unimportant
AND, OR, NOT need to be specified
--------------------------------------------------------------------
terms=\t"""

_datePrompt = """
Date of earliest and latest publication to retrieve in YYYY/MM/DD
format (month and day optional)
examples: 2001
          1999/01/01
Leave either date empty to ignore
--------------------------------------------------------------------
mindate=\t"""


def _strip_all_whitespace(text):
    """Return text with every whitespace character removed."""
    return "".join(text.split())


def _to_text(line):
    """Return line as text, decoding it as UTF-8 when it is a byte string.

    On Python 3 the HTTP response yields bytes; on Python 2 it already
    yields str, which is returned unchanged.
    """
    if isinstance(line, str):
        return line
    return line.decode("utf-8")


# read user input and return a dict that can be used by urlencode to start
# the query ==================================================================
def makeSearchDataDict():
    """Ask the user for the search and return the esearch parameters.

    Prompts for the search terms and, optionally, for the earliest and
    latest publication date. Whitespace inside the dates is removed and an
    empty date is left out of the result.

    Returns:
        dict with the keys 'term', 'retmax', 'rettype', 'retmode' and
        'usehistory', plus 'mindate' / 'maxdate' when they were entered.

    Exits the program with "No terms defined" when no terms are entered.
    """
    data = {}

    # read search string
    rawTerms = _read_line(_termPrompt)
    if rawTerms == '':
        sys.exit("No terms defined")
    data['term'] = rawTerms

    answer = _read_line(
        _SEPARATOR + "\nImpose publication date limits (y/n)? ")
    if answer == 'y':
        # The original called .split().join(""), i.e. join() on a list,
        # which raises AttributeError; the separator has to do the joining.
        rawMindate = _strip_all_whitespace(_read_line(_datePrompt))
        if rawMindate != '':
            data["mindate"] = rawMindate
        rawMaxdate = _strip_all_whitespace(_read_line("maxdate=\t"))
        if rawMaxdate != '':
            data["maxdate"] = rawMaxdate
    else:
        print(_SEPARATOR)

    data['retmax'] = 50
    data['rettype'] = 'text'
    data['retmode'] = 'xml'
    # Keep the result set on the server so it can be fetched in batches
    # by WebEnv / query key afterwards.
    data['usehistory'] = 'y'
    return data


# analyze search result - only read first 10 lines for webenv and query key ==
def analyzeSearchResult(ids):
    """Extract WebEnv, query key and hit count from an esearch answer.

    Only the first _HEADER_LINES lines of the stream are read; the stream
    is always closed afterwards.

    Args:
        ids: open file-like object with readline() and close() that yields
            the esearch XML, either as text or as UTF-8 bytes.

    Returns:
        (webEnv, queryKey, count) with count as int, or None when one of
        the <Count>, <QueryKey> or <WebEnv> elements is not found in the
        lines that were read.
    """
    try:
        header = "".join(
            [_to_text(ids.readline()) for _ in range(_HEADER_LINES)])
    finally:
        ids.close()

    count = _COUNT_RE.search(header)
    queryKey = _QUERY_KEY_RE.search(header)
    webEnv = _WEB_ENV_RE.search(header)
    if count is None or queryKey is None or webEnv is None:
        return None

    count = int(count.group('nr'))
    queryKey = queryKey.group('qk')
    webEnv = webEnv.group('we')

    if 'analyzeSearchResult' in _verbose:
        print("FOUND %i ARTICLES" % count)
        print("QUERY KEY: %s" % queryKey)
        print("WebEnv: %s" % webEnv)
    return (webEnv, queryKey, count)


def _askFetchCount(available):
    """Ask how many of the available articles to retrieve.

    An empty answer selects all of them; anything that is not an integer
    is rejected and the question is repeated. The answer is capped at the
    number of available articles.
    """
    while True:
        answer = _read_line(
            "Retrieve first n of %s articles; defaults to all ? " % available)
        if answer == '':
            return available
        try:
            return min(int(answer), available)
        except ValueError:
            print("Enter number !")


def _fetchRecords(url, data, fetchNr, outfile):
    """Download fetchNr records in batches and write them to outfile.

    data is the efetch parameter dict; its 'retstart' and 'retmax' entries
    are set for every batch. Exits the program when a request fails.
    """
    utf8_reader = codecs.getreader('utf-8')
    start = 0
    while start < fetchNr:
        data['retstart'] = start
        data['retmax'] = min(_BATCH_SIZE, fetchNr - start)
        query = urlencode(data)
        print(query)
        try:
            # urlencode() output is pure ASCII; Python 3 wants bytes here.
            records = utf8_reader(urlopen(url, query.encode('ascii')))
        except IOError as msg:
            sys.exit(msg)
        try:
            # readline() returns an empty string at end of stream.
            for line in iter(records.readline, ""):
                outfile.write(line)
        finally:
            records.close()
        start += _BATCH_SIZE


# main ========================================================================
def main():
    """Run the interactive search and save the records to sys.argv[1].

    Exits with the usage text when no output file is given, with the error
    when a request fails, and with a message when the search answer cannot
    be parsed or no articles are to be retrieved.
    """
    if len(sys.argv) < 2:
        sys.exit(_usage)

    # Base URL as given in the NCBI E-utilities documentation; it replaces
    # the older http://www.ncbi.nlm.nih.gov/entrez/eutils/ address.
    baseurl = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    searchurl = "esearch.fcgi?db=pubmed"
    fetchurl = "efetch.fcgi?db=pubmed"

    data = makeSearchDataDict()

    # get the results
    try:
        ids = urlopen(baseurl + searchurl, urlencode(data).encode('ascii'))
    except IOError as msg:
        sys.exit(msg)

    # analyze header of results
    webenv_key_count = analyzeSearchResult(ids)
    if webenv_key_count is None:
        sys.exit("Could not find Count, QueryKey and WebEnv in search result")
    webEnv, queryKey, count = webenv_key_count

    fetchNr = _askFetchCount(count)
    # The original compared the integer against the string '0', which is
    # never equal, so this message could not be reached.
    if fetchNr <= 0:
        sys.exit("No articles retrieved")

    data = {'webenv': webEnv,
            'query_key': queryKey,
            'rettype': 'text',
            'retmode': 'xml',
            'tool': 'pubmed.py',
            'email': 'wresch@niaid.nih.gov'}
    outfile = codecs.open(sys.argv[1], "w", "utf-8")
    try:
        _fetchRecords(baseurl + fetchurl, data, fetchNr, outfile)
    finally:
        outfile.close()


if __name__ == "__main__":
    main()