#! /usr/bin/python
import sys, urllib, re, codecs
from time import sleep
# Names of functions that should print debugging output; analyzeSearchResult
# prints what it found when its own name is contained in this list.
_verbose=['']
# Usage message; main() hands it to sys.exit when no outfile argument is given.
_usage="""
pubmed.py outfile
Fetch articles in xml format from ncbi pubmed and saves
them. Respect ncbi resources !
Uses utf-8 encoding
$Id: pubmed.py,v 1.1.1.1 2003/03/03 15:46:33 wresch Exp $
"""
# Prompt displayed by makeSearchDataDict when it asks for the search terms.
_termPrompt = """
--------------------------------------------------------------------
ENTER SEARCH TERMS IN NCBI PUBMED SYNTAX.
Summary of some fields (incomplete):
Affiliation [AD]
All Fields [ALL]
Author [AU]
Issue [IP]
Journal Title [TA]
Language [LA]
MeSH Major Topic [MAJR]
MeSH Subheadings [SH]
MeSH Terms [MH]
Pagination [PG]
Publication Date [DP] YYYY/MM/DD [dp]; month and day are optional
Publication Type [PT]
Substance Name [NM]
Text Words [TW]
Title [TI]
Title/Abstract [TIAB]
Unique Identifiers [UID]
Volume [VI]
Whitespace is ignored; upper/lower case are unimportant
AND, OR, NOT need to be specified
--------------------------------------------------------------------
terms=\t"""
# Prompt displayed by makeSearchDataDict when it asks for the earliest
# publication date (mindate); the maxdate prompt is a separate short string.
_datePrompt="""
Date of earliest and latest publication to retrieve in
YYYY/MM/DD format (month and day optional)
examples:
2001
1999/01/01
Leave either date empty to ignore
--------------------------------------------------------------------
mindate=\t"""
#read user input and return a dict that can be used by urlencode to start query=========================
def makeSearchDataDict():
    """Ask the user for search terms and optional date limits.

    Input is read interactively with raw_input. The program is terminated
    through sys.exit("No terms defined") when no search terms are entered.

    Returns a dict that can be passed to urllib.urlencode to start the
    query. It always contains 'term', 'retmax', 'rettype', 'retmode' and
    'usehistory'; 'mindate' and 'maxdate' are only present when the user
    asked for date limits and entered a non-empty value for them.
    """
    rawTerms = raw_input(_termPrompt)
    # A whitespace-only answer carries no search terms either.
    if rawTerms.strip() == '':
        sys.exit("No terms defined")
    data = {'term': rawTerms}

    temp = raw_input(68 * "-" + "\nImpose publication date limits (y/n)? ")
    if temp.strip().lower() in ('y', 'yes'):
        # Remove every whitespace character from the dates. The separator
        # string has to do the joining: lists have no join() method.
        rawMindate = "".join(raw_input(_datePrompt).split())
        if rawMindate != '':
            data["mindate"] = rawMindate
        rawMaxdate = "".join(raw_input("maxdate=\t").split())
        if rawMaxdate != '':
            data["maxdate"] = rawMaxdate
    else:
        # No date prompt was shown, so print the closing separator here.
        print(68 * "-")

    data['retmax'] = 50
    data['rettype'] = 'text'
    data['retmode'] = 'xml'
    data['usehistory'] = 'y'
    return data
#analyze search result - only read first 10 lines for webenv and query key===============================
def analyzeSearchResult(ids):
    """Extract WebEnv, query key and hit count from a search reply.

    Only the first 10 lines of the reply are read, then `ids` is closed
    (also when reading fails).

    ids -- open file-like object offering readline() and close(), e.g. the
           object returned by urllib.urlopen for the search request.

    Returns the tuple (webEnv, queryKey, count), where count is an int and
    the other two are strings, or None when any of the three elements is
    missing from the lines that were read.
    """
    try:
        header = "".join([ids.readline() for _ in range(10)])
    finally:
        ids.close()

    # The values are looked up as the text content of the Count, QueryKey
    # and WebEnv elements. The first Count element is the one that is used.
    count = re.search(r"<Count>\s*(?P<nr>\d+)\s*</Count>", header)
    queryKey = re.search(r"<QueryKey>(?P<qk>[^<>]*)</QueryKey>", header)
    webEnv = re.search(r"<WebEnv>(?P<we>[^<>]*)</WebEnv>", header)
    if count is None or queryKey is None or webEnv is None:
        return None

    count = int(count.group('nr'))
    queryKey = queryKey.group('qk')
    webEnv = webEnv.group('we')

    if 'analyzeSearchResult' in _verbose:
        print("FOUND %i ARTICLES" % count)
        print("QUERY KEY: %s" % queryKey)
        print("WebEnv: %s" % webEnv)
    return (webEnv, queryKey, count)
#main ==================================================================================
def main():
    """Run one interactive search and save the retrieved records.

    Expects the output file name as the first command line argument and
    exits with the usage message when it is missing. The search parameters
    come from makeSearchDataDict, the search reply is parsed by
    analyzeSearchResult, and the records are then fetched in batches of at
    most 200 and written to the output file using utf-8 encoding.

    The program is terminated through sys.exit when a request fails with
    IOError, when the search reply cannot be parsed, or when there is
    nothing to retrieve.
    """
    if len(sys.argv) < 2:
        sys.exit(_usage)

    # NOTE(review): NCBI documentation nowadays lists an https base URL on
    # eutils.ncbi.nlm.nih.gov -- confirm that this address still answers.
    baseurl = 'http://www.ncbi.nlm.nih.gov/entrez/eutils/'
    searchurl = "esearch.fcgi?db=pubmed"
    fetchurl = "efetch.fcgi?db=pubmed"
    batchSize = 200

    data = makeSearchDataDict()
    # get the results
    try:
        ids = urllib.urlopen(baseurl + searchurl, urllib.urlencode(data))
    except IOError as msg:
        sys.exit(msg)

    # analyze header of results; None means the reply could not be parsed
    webenv_key_count = analyzeSearchResult(ids)
    if webenv_key_count is None:
        sys.exit("Could not find Count, QueryKey and WebEnv in the search result")
    (webEnv, queryKey, count) = webenv_key_count

    while True:
        answer = raw_input("Retrieve first n of %s articles; defaults to all ? " % count)
        if answer.strip() == '':
            fetchNr = count
            break
        try:
            fetchNr = int(answer)
            break
        except ValueError:
            print("Enter number !")

    # Never ask for more records than the search found.
    fetchNr = min(fetchNr, count)
    if fetchNr <= 0:
        sys.exit("No articles retrieved")

    utf8_reader = codecs.getreader('utf-8')
    data = {'webenv': webEnv, 'query_key': queryKey, 'rettype': 'text',
            'retmode': 'xml', 'tool': 'pubmed.py', 'email': 'wresch@niaid.nih.gov'}
    outfile = codecs.open(sys.argv[1], "w", "utf-8")
    try:
        start = 0
        while start < fetchNr:
            data['retstart'] = start
            data['retmax'] = min(batchSize, fetchNr - start)
            print(urllib.urlencode(data))
            try:
                records = utf8_reader(urllib.urlopen(baseurl + fetchurl, urllib.urlencode(data)))
            except IOError as msg:
                sys.exit(msg)
            # reading/writing will be threaded
            try:
                line = records.readline()
                while line != "":
                    outfile.write(line)
                    line = records.readline()
            finally:
                records.close()
            start += batchSize
    finally:
        # Also runs when sys.exit is called above, so the file is always closed.
        outfile.close()
# Start the interactive session only when the file is executed as a script,
# so that importing the module does not prompt for input.
if __name__ == "__main__":
    main()