#! /usr/bin/python
# Wolfgang Resch, LVD/NIAID/NIH
# 030221: wr implementation of converter script from medxml to bibtex
# 030224: wr finished basic implementation; works for most records
import sys, re
from codecs import open
from math import log10, pow, ceil
# Names of the functions that should print diagnostics; a function looks
# for its own name in this list before reporting a parse failure.
verbose = ['makeArtikleFromXml']

# Usage text; main() hands it to sys.exit() when too few arguments are given.
# The text states that accented characters ARE converted, because every entry
# is passed through utf8_to_latex() before it is written.
_usage = """
medxml2bib.py xmlfile outfile
Read xmlfile generated by ncbi interface of pubmed (summary xml file) and write
the data as a bibtex format file. This program will fix the truncated notation
of page ranges and convert a limited set of accented characters (and '%') to
their latex equivalents. All other characters are written unchanged.
Output format is UTF-8.
$Id: medxml2bib.py,v 1.1.1.1 2003/03/03 15:46:33 wresch Exp $
"""
# Replacement table used by utf8_to_latex(): accented Latin-1 letters (and
# '%') mapped to their LaTeX sequences. Not exhaustive.
# Backslashes are doubled instead of using raw unicode literals so that the
# table is valid in both Python 2 and Python 3.
_u_to_latex_dict = {
    # upper-case A
    u'\u00c0': u'{\\`A}', u'\u00c1': u"{\\'A}", u'\u00c2': u'{\\^A}',
    u'\u00c3': u'{\\~A}', u'\u00c4': u'{\\"A}', u'\u00c5': u'{\\AA}',
    # upper-case E
    u'\u00c8': u'{\\`E}', u'\u00c9': u"{\\'E}", u'\u00ca': u'{\\^E}',
    u'\u00cb': u'{\\"E}',
    # upper-case I
    u'\u00cc': u'{\\`I}', u'\u00cd': u"{\\'I}", u'\u00ce': u'{\\^I}',
    u'\u00cf': u'{\\"I}',
    # upper-case N
    u'\u00d1': u'{\\~N}',
    # upper-case O
    u'\u00d2': u'{\\`O}', u'\u00d3': u"{\\'O}", u'\u00d4': u'{\\^O}',
    u'\u00d5': u'{\\~O}', u'\u00d6': u'{\\"O}',
    # upper-case U (U+00DC is U-umlaut, not O-umlaut)
    u'\u00d9': u'{\\`U}', u'\u00da': u"{\\'U}", u'\u00db': u'{\\^U}',
    u'\u00dc': u'{\\"U}',
    # sharp s and n-tilde
    u'\u00df': u'{\\ss}', u'\u00f1': u'{\\~n}',
    # lower-case a (U+00E4 is a-umlaut, U+00E5 a-ring, U+00E6 the ae ligature)
    u'\u00e0': u'{\\`a}', u'\u00e1': u"{\\'a}", u'\u00e2': u'{\\^a}',
    u'\u00e3': u'{\\~a}', u'\u00e4': u'{\\"a}', u'\u00e5': u'{\\aa}',
    u'\u00e6': u'{\\ae}',
    # lower-case e
    u'\u00e8': u'{\\`e}', u'\u00e9': u"{\\'e}", u'\u00ea': u'{\\^e}',
    u'\u00eb': u'{\\"e}',
    # lower-case i
    u'\u00ec': u'{\\`i}', u'\u00ed': u"{\\'i}", u'\u00ee': u'{\\^i}',
    u'\u00ef': u'{\\"i}',
    # lower-case o
    u'\u00f2': u'{\\`o}', u'\u00f3': u"{\\'o}", u'\u00f4': u'{\\^o}',
    u'\u00f5': u'{\\~o}', u'\u00f6': u'{\\"o}',
    # lower-case u
    u'\u00f9': u'{\\`u}', u'\u00fa': u"{\\'u}", u'\u00fb': u'{\\^u}',
    u'\u00fc': u'{\\"u}',
    # '%' starts a comment in LaTeX/BibTeX and must be escaped
    u'%': u'\\%',
}
#article class=====================================================================
class article:
    """A single journal article that can be written out as a BibTeX entry."""

    def __init__(self, journal, pages, year, title, authors, volume='', abstract='', type='', id='', issue=-1):
        """Store the bibliographic fields of one article.

        journal, title, abstract, type, id: strings
        year: integer
        volume, issue: integers or strings (issue == -1 means "no issue")
        pages: 2-tuple (first, last); for a range, both need to be full
               numbers. A bare string or a 1-tuple is accepted for a
               single page.
        authors: 2D list (last name, initials)

        When id is empty, a citation key is built from the year, the volume
        and the first page.
        """
        # Normalise pages to a tuple so that indexing yields whole page
        # numbers rather than single characters of a string.
        if isinstance(pages, (tuple, list)):
            pages = tuple(pages)
        else:
            pages = (pages,)
        if len(pages) > 0:
            firstPage = pages[0]
        else:
            firstPage = ''
        if id == '':
            # %s instead of %i: volume and page numbers normally arrive as
            # strings, for which %i raises TypeError.
            id = "%4i_%s_%s" % (year, volume, firstPage)
        self.__journal = journal
        self.__volume = volume
        self.__pages = pages
        self.__year = year
        self.__title = title
        self.__abstract = abstract
        self.__authors = authors
        self.__type = type
        self.__id = id
        self.__issue = issue

    def bibtex(self):
        """Return the article as a BibTeX @article entry (unicode string).

        No LaTeX escaping is applied here; show() takes care of that.
        """
        # Each author is rendered as "initials last-name", joined by " and ".
        authorText = u' and '.join([a[1] + u' ' + a[0] for a in self.__authors])
        if self.__issue != -1:
            # %s so that both integer and string issues can be rendered
            number = u'{ %s }' % self.__issue
        else:
            number = u'{ }'
        pages = self.__pages
        if len(pages) >= 2 and pages[1] not in ('', None, pages[0]):
            pageText = u'%s--%s' % (pages[0], pages[1])
        elif len(pages) >= 1:
            # single page: no range notation
            pageText = u'%s' % pages[0]
        else:
            pageText = u''
        fields = [
            u'@article{' + self.__id + u',\n',
            u'\ttitle =\t\t{ ' + self.__title + u' },\n',
            u'\tauthor=\t\t{ ' + authorText + u' },\n',
            u'\tjournal =\t{ ' + self.__journal + u' },\n',
            u'\tvolume =\t{ %s },\n' % self.__volume,
            u'\tnumber =\t' + number + u',\n',
            u'\tyear =\t\t{ %i },\n' % self.__year,
            u'\tpages =\t\t{ ' + pageText + u' },\n',
            u'\tabstract =\t{ ' + self.__abstract + u' }\n}\n',
        ]
        return u''.join(fields)

    def show(self, fileObj):
        """Write the BibTeX entry to fileObj (anything with a write method),
        after replacing special characters through utf8_to_latex()."""
        fileObj.write(utf8_to_latex(self.bibtex()))
# remove offending characters and replace with latex sequences=================================
def utf8_to_latex(uString):
    """Return uString with each character listed in _u_to_latex_dict
    replaced by its LaTeX sequence. The table is not exhaustive, so
    characters it does not list pass through unchanged."""
    converted = uString
    for special, latex in _u_to_latex_dict.items():
        converted = converted.replace(special, latex)
    return converted
# parse xml record of article============================================================
def _tagText(record, tagNames):
    """Return the text of the first element in record whose name is one of
    tagNames (tried in order), or None when none of them is present.
    Only elements without nested markup match; attributes are tolerated."""
    for name in tagNames:
        found = re.search(r"<%s(?:\s[^<>]*)?>(?P<g>[^<>]*)</%s>" % (name, name), record)
        if found is not None:
            return found.group("g")
    return None


def _expandPageRange(pageText):
    """Turn page text such as '123-30' into the tuple ('123', '130').
    A single page 'p' becomes ('p', 'p'); None is returned when there is
    no first page at all."""
    parts = [p.strip() for p in pageText.split("-")]
    first = parts[0]
    if first == "":
        return None
    if len(parts) == 1 or parts[1] == "":
        return (first, first)
    last = parts[1]
    # Only a SHORTER last page is an abbreviation ('123-30'); prefixing a
    # longer one ('99-101') would corrupt it.
    if len(last) < len(first):
        last = first[:len(first) - len(last)] + last
    return (first, last)


def _reportFailure(message, record):
    """Print message and the offending record when this parser is listed
    in the module-level verbose list."""
    if 'makeArtikleFromXml' in verbose:
        print(message + "=======================================")
        print(record.encode('utf-8'))


def makeArtikleFromXml( record ):
    """take xml from ncbi pubmed article summary minus the top node and return
    article object

    record: unicode string holding the inside of one article element.
    Element names follow the PubMed XML DTD (MedlineTA, MedlinePgn, PubDate,
    ArticleTitle, AuthorList, ...).
    Journal, pages, year, title and the author list are required: when one
    of them cannot be parsed, None is returned (and the record is printed if
    verbose). Volume, issue, id and abstract are optional."""
    #Journal name - required
    journal = _tagText(record, ("MedlineTA", "ISOAbbreviation", "Title"))
    if journal is None:
        _reportFailure("Could not parse Journal", record)
        return None
    journal = journal.strip()
    #pages - required - fix any possible page range abbreviation problem
    pageText = _tagText(record, ("MedlinePgn",))
    if pageText is None:
        _reportFailure("Could not parse page numbers", record)
        return None
    pages = _expandPageRange(pageText)
    if pages is None:
        _reportFailure("No page numbers decoded", record)
        return None
    #year - required; anchored to PubDate because a record holds other dates
    found = re.search(r"<PubDate>\s*<Year>\s*(?P<g>\d+)\s*</Year>", record)
    if found is None:
        _reportFailure("Could not parse Year", record)
        return None
    year = int(found.group("g"))
    #title - required
    title = _tagText(record, ("ArticleTitle",))
    if title is None:
        _reportFailure("Could not parse Title", record)
        return None
    title = title.strip().replace(".", "")
    #authors - required
    found = re.search(r"<AuthorList.*</AuthorList>", record, re.S)
    if found is None:
        _reportFailure("Could not parse authors", record)
        return None
    authorREO = re.compile(
        r"<LastName>(?P<ln>[^<>]*)</LastName>.*<Initials>(?P<in>[^<>]*)</Initials>",
        re.S)
    formatedAuthList = []
    # one chunk per author; chunks without last name + initials are skipped
    for chunk in found.group().split("</Author>"):
        parsed = authorREO.search(chunk)
        if parsed is not None:
            # "JA" -> "J. A."
            letters = [c + u"." for c in parsed.group("in") if not c.isspace()]
            formatedAuthList.append([parsed.group("ln").strip(), u" ".join(letters)])
    #Volume - optional
    volume = _tagText(record, ("Volume",))
    if volume is None:
        volume = ''
    else:
        volume = volume.strip()
    #issue - optional, -1 means "no issue"
    issue = _tagText(record, ("Issue",))
    if issue is None:
        issue = -1
    else:
        issue = issue.strip()
    #medlineID - optional; an empty id lets article build its own key
    id = _tagText(record, ("MedlineID", "PMID"))
    if id is None:
        id = ''
    else:
        id = id.strip()
    #abstract - optional
    abstract = _tagText(record, ("AbstractText",))
    if abstract is None:
        abstract = ''
    else:
        abstract = abstract.strip()
    #make and return article
    return article(journal, pages, year, title, authors=formatedAuthList,
                   volume=volume, issue=issue, id=id, abstract=abstract)
#main ===============================================================
def main():
    """Convert the PubMed XML file named by sys.argv[1] into the BibTeX file
    named by sys.argv[2].

    The input is read line by line; the lines between a <PubmedArticle> line
    and the matching </PubmedArticle> line form one record, which is handed
    to makeArtikleFromXml(). Records that cannot be parsed are dropped.
    Progress and final counts go to stderr. Exits with the usage text when
    fewer than two arguments are given."""
    if len(sys.argv) < 3:
        sys.exit(_usage)
    articles = []
    foundArticles = 0
    recordLines = []
    #read unicode file and get records, one by one
    #(open is codecs.open, hence the encoding argument)
    xmlfile = open(sys.argv[1], "r", "utf-8")
    try:
        for line in xmlfile:
            if line.find(u"<PubmedArticle>") != -1:
                # start of a new record: drop whatever was collected so far
                recordLines = []
                foundArticles += 1
                if foundArticles % 50 == 0:
                    sys.stderr.write("Parsing article %i\n" % foundArticles)
            elif line.find(u"</PubmedArticle>") != -1:
                parsed = makeArtikleFromXml(u"".join(recordLines))
                if parsed is not None:
                    articles.append(parsed)
            else:
                #accumulate record
                recordLines.append(line)
    finally:
        xmlfile.close()
    outfile = open(sys.argv[2], 'w', 'utf-8')
    try:
        for a in articles:
            a.show(outfile)
    finally:
        # close explicitly so that all entries are flushed to disk
        outfile.close()
    sys.stderr.write("\nArticles found in file:\t%4i\nArticles converted:\t%4i\n" % (foundArticles, len(articles)))
# Run the conversion only when executed as a script, so that importing this
# module (e.g. to reuse the article class) has no side effects.
if __name__ == '__main__':
    main()