#! /usr/bin/python
# Wolfgang Resch, LVD/NIAID/NIH
# 030221: wr implementation of converter script from medxml to bibtex
# 030224: wr finished basic implementation; works for most records
"""Convert a PubMed XML export into a BibTeX file.

Records are cut out of the XML line by line, the individual fields are
extracted with regular expressions (no XML parser is used), and each record
that yields all required fields is written as one ``@article`` entry.

The code is written to run unchanged on Python 2.6+ and Python 3.3+.
"""

import re
import sys
from codecs import open
from math import log10, pow, ceil

# Names of the functions that are allowed to print diagnostics to stdout.
verbose = ['makeArtikleFromXml']

# The original text claimed that special characters are NOT converted, which
# contradicts article.show(); the sentence was corrected.
_usage = """
medxml2bib.py xmlfile outfile

Read xmlfile generated by ncbi interface of pubmed (summary xml file) and
write the data as a bibtex format file.  This program will fix the truncated
notation of page ranges and will convert a (non-exhaustive) set of accented
characters to their latex equivalents.  Output format is UTF-8.

$Id: medxml2bib.py,v 1.1.1.1 2003/03/03 15:46:33 wresch Exp $
"""

# Unicode character -> LaTeX sequence.  Fixed entries compared to the
# original table: U+00DC (was {\"O}), U+00E4 (was {\aa}), U+00E5 (was {\ae}).
_u_to_latex_dict = {
    u'\u00c0': u'{\\`A}', u'\u00c1': u"{\\'A}", u'\u00c2': u'{\\^A}',
    u'\u00c3': u'{\\~A}', u'\u00c4': u'{\\"A}', u'\u00c5': u'{\\AA}',
    u'\u00c8': u'{\\`E}', u'\u00c9': u"{\\'E}", u'\u00ca': u'{\\^E}',
    u'\u00cb': u'{\\"E}',
    u'\u00cc': u'{\\`I}', u'\u00cd': u"{\\'I}", u'\u00ce': u'{\\^I}',
    u'\u00cf': u'{\\"I}',
    u'\u00d1': u'{\\~N}',
    u'\u00d2': u'{\\`O}', u'\u00d3': u"{\\'O}", u'\u00d4': u'{\\^O}',
    u'\u00d5': u'{\\~O}', u'\u00d6': u'{\\"O}',
    u'\u00d9': u'{\\`U}', u'\u00da': u"{\\'U}", u'\u00db': u'{\\^U}',
    u'\u00dc': u'{\\"U}',
    u'\u00df': u'{\\ss}', u'\u00f1': u'{\\~n}',
    u'\u00e0': u'{\\`a}', u'\u00e1': u"{\\'a}", u'\u00e2': u'{\\^a}',
    u'\u00e3': u'{\\~a}', u'\u00e4': u'{\\"a}', u'\u00e5': u'{\\aa}',
    u'\u00e8': u'{\\`e}', u'\u00e9': u"{\\'e}", u'\u00ea': u'{\\^e}',
    u'\u00eb': u'{\\"e}',
    u'\u00ec': u'{\\`i}', u'\u00ed': u"{\\'i}", u'\u00ee': u'{\\^i}',
    u'\u00ef': u'{\\"i}',
    u'\u00f2': u'{\\`o}', u'\u00f3': u"{\\'o}", u'\u00f4': u'{\\^o}',
    u'\u00f5': u'{\\~o}', u'\u00f6': u'{\\"o}',
    u'\u00f9': u'{\\`u}', u'\u00fa': u"{\\'u}", u'\u00fb': u'{\\^u}',
    u'\u00fc': u'{\\"u}',
    u'%': u'\\%',
}

# NOTE(review): in the copy of this file that was reviewed, every "<...>"
# token had been stripped out of the string literals (the patterns read e.g.
# r"(?P[^<>]*)").  The group names are fixed by the .group() calls in the
# code; the XML element names below were reconstructed from the section
# comments and the MEDLINE/PubMed DTD and MUST be confirmed against the
# original script or a sample input file.
_VOLUME_RE = re.compile(r"<Volume>(?P<g>[^<>]*)</Volume>")
_JOURNAL_RE = re.compile(r"<MedlineTA>(?P<g>[^<>]*)</MedlineTA>")
_PAGES_RE = re.compile(r"<MedlinePgn>(?P<g>[^<>]*)</MedlinePgn>")
# \d+ (not \d*) so an empty element can never reach int().
_YEAR_RE = re.compile(r"<PubDate>\s*<Year>\s*(?P<g>\d+)\s*</Year>")
_TITLE_RE = re.compile(r"<ArticleTitle>(?P<g>[^<>]*)</ArticleTitle>")
_AUTHOR_LIST_RE = re.compile(r"<AuthorList[^>]*>.*?</AuthorList>", re.S)
_AUTHOR_RE = re.compile(
    r"<LastName>(?P<last>[^<>]*)</LastName>"
    r".*<Initials>(?P<initials>[^<>]*)</Initials>", re.S)
_AUTHOR_END_TAG = u"</Author>"
_ISSUE_RE = re.compile(r"<Issue>(?P<g>[^<>]*)</Issue>")
_ID_RE = re.compile(r"<MedlineID>(?P<g>[^<>]*)</MedlineID>")
_ABSTRACT_RE = re.compile(r"<AbstractText>(?P<g>[^<>]*)</AbstractText>")
_RECORD_START_TAG = u"<PubmedArticle>"
_RECORD_END_TAG = u"</PubmedArticle>"


#article class=====================================================================
class article(object):
    """One bibliography entry that can be written as a BibTeX @article."""

    def __init__(self, journal, pages, year, title, authors, volume='',
                 abstract='', type='', id='', issue=-1):
        """Store the fields of the entry.

        journal, title, abstract, type, id: strings
        year: integer
        volume, issue: strings or integers (issue == -1 means "no issue")
        pages: 2-tuple (first, last); both must be full page numbers.  A
            single value is accepted and treated as a one-page range.
        authors: list of [last name, initials] pairs

        When id is empty, an id of the form year_volume_firstpage is built.
        """
        # A bare page value would otherwise be indexed character by character.
        if not isinstance(pages, (tuple, list)):
            pages = (pages, pages)
        self.__journal = journal
        self.__volume = volume
        self.__pages = pages
        self.__year = year
        self.__title = title
        if id == '':
            # %s instead of %i: volume and pages arrive as strings from the
            # parser, and %i raises TypeError for them.
            id = "%4s_%s_%s" % (year, volume, pages[0])
        self.__abstract = abstract
        self.__authors = authors
        self.__type = type
        self.__id = id
        self.__issue = issue

    def show(self, fileObj):
        """Write the entry in BibTeX syntax to fileObj (needs .write())."""
        # Each author is stored as [last name, initials]; BibTeX wants
        # "initials last" joined by " and ".
        author_text = u' and '.join(
            [entry[1] + u' ' + entry[0] for entry in self.__authors])
        if self.__issue != -1:
            number_text = u'\tnumber =\t{ %s },\n' % self.__issue
        else:
            number_text = u'\tnumber =\t{ },\n'
        if self.__pages[0] == self.__pages[1]:
            page_text = u'%s' % self.__pages[0]
        else:
            page_text = u'%s--%s' % (self.__pages[0], self.__pages[1])

        bib = u''.join([
            u'@article{', self.__id, u',\n',
            u'\ttitle =\t\t{ ', self.__title, u' },\n',
            u'\tauthor=\t\t{ ', author_text, u' },\n',
            u'\tjournal =\t{ ', self.__journal, u' },\n',
            u'\tvolume =\t{ %s },\n' % self.__volume,
            number_text,
            u'\tyear =\t\t{ %s },\n' % self.__year,
            u'\tpages =\t\t{ ', page_text, u' },\n',
            u'\tabstract =\t{ ', self.__abstract, u' }\n}\n',
        ])
        fileObj.write(utf8_to_latex(bib))


# remove offending characters and replace with latex sequences=================================
def utf8_to_latex(uString):
    """Return uString with the characters of _u_to_latex_dict replaced by
    their LaTeX sequences - not exhaustive."""
    # Order is irrelevant: no replacement text contains another key.
    for char in _u_to_latex_dict:
        uString = uString.replace(char, _u_to_latex_dict[char])
    return uString


def _report_failure(message, record):
    """Print message and the offending record to stdout if diagnostics for
    makeArtikleFromXml are enabled in `verbose`."""
    if 'makeArtikleFromXml' not in verbose:
        return
    sys.stdout.write(message + "\n")
    if not isinstance(record, str):
        # Python 2 unicode object: encode before writing to a byte stream.
        record = record.encode('utf-8')
    sys.stdout.write(record + "\n")


def _first_group(pattern, record):
    """Return the text captured as group "g" by pattern, or None."""
    match = pattern.search(record)
    if match is None:
        return None
    return match.group("g")


def _expand_page_range(text):
    """Turn "1234-9" into ("1234", "1239") and "77" into ("77", "77").

    Returns None when no first page number is present.
    """
    parts = [part.strip() for part in text.split("-")]
    first = parts[0]
    if not first:
        return None
    if len(parts) == 1 or not parts[1]:
        return (first, first)
    last = parts[1]
    # Only a SHORTER last page is an abbreviation; "99-101" is already full.
    if len(last) < len(first):
        last = first[:len(first) - len(last)] + last
    return (first, last)


def _parse_authors(author_list_xml):
    """Return [[last name, "A. B."], ...] for every author element found."""
    authors = []
    for chunk in author_list_xml.split(_AUTHOR_END_TAG):
        match = _AUTHOR_RE.search(chunk)
        if match is None:
            continue
        initials = u" ".join(
            [c + u"." for c in match.group("initials") if not c.isspace()])
        authors.append([match.group("last").strip(), initials])
    return authors


# parse xml record of article============================================================
def makeArtikleFromXml(record):
    """Build an article from the text of one PubMed XML record.

    record is the XML of one article without its enclosing record element.
    Journal, pages, year, title and author list are required; if one of
    them cannot be extracted, a diagnostic is printed (see `verbose`) and
    None is returned.  Volume, issue, id and abstract are optional.
    """
    #Volume
    volume = _first_group(_VOLUME_RE, record)
    if volume is None:
        volume = ''

    #Journal name - required
    journal = _first_group(_JOURNAL_RE, record)
    if journal is None:
        _report_failure(
            "Could not parse Journal=======================================",
            record)
        return None
    journal = journal.strip()

    #pages - required - fix any possible page range abbreviation problem
    page_text = _first_group(_PAGES_RE, record)
    if page_text is None:
        _report_failure(
            "Could not parse page numbers=======================================",
            record)
        return None
    pages = _expand_page_range(page_text)
    if pages is None:
        _report_failure(
            "No page numbers decoded=======================================",
            record)
        return None

    #year - required
    year_text = _first_group(_YEAR_RE, record)
    if year_text is None:
        _report_failure(
            "Could not parse Year=======================================",
            record)
        return None
    year = int(year_text)

    #title - required
    title = _first_group(_TITLE_RE, record)
    if title is None:
        _report_failure(
            "Could not parse Title=======================================",
            record)
        return None
    # NOTE(review): as in the original, this removes EVERY period, not only
    # the trailing one ("E. coli" becomes "E coli").
    title = title.strip().replace(".", "")

    #authors - required
    author_list = _AUTHOR_LIST_RE.search(record)
    if author_list is None:
        _report_failure(
            "Could not parse authors=======================================",
            record)
        return None
    formatedAuthList = _parse_authors(author_list.group())

    #issue
    issue = _first_group(_ISSUE_RE, record)
    if issue is None:
        issue = -1

    #medlineID
    id = _first_group(_ID_RE, record)
    if id is None:
        id = ''
    else:
        id = id.strip()

    #abstract
    abstract = _first_group(_ABSTRACT_RE, record)
    if abstract is None:
        abstract = ''
    else:
        abstract = abstract.strip()

    #make and return article
    return article(journal, pages, year, title, authors=formatedAuthList,
                   volume=volume, issue=issue, id=id, abstract=abstract)


#main ===============================================================
def main():
    """Command line entry point: medxml2bib.py xmlfile outfile.

    Exits with the usage text when fewer than two arguments are given.
    Progress and a final summary are written to stderr.
    """
    if len(sys.argv) < 3:
        sys.exit(_usage)

    articles = []
    foundArticles = 0
    record = u""
    #read unicode file and get records, one by one
    with open(sys.argv[1], "r", "utf-8") as xmlfile:
        for line in xmlfile:
            if _RECORD_START_TAG in line:
                record = u""
                foundArticles += 1
                if foundArticles % 50 == 0:
                    sys.stderr.write("Parsing article %i\n" % foundArticles)
            elif _RECORD_END_TAG in line:
                parsed = makeArtikleFromXml(record)
                if parsed is not None:
                    articles.append(parsed)
            else:
                #accumulate record
                record = record + line

    # The context manager guarantees the output is flushed and closed; the
    # original never closed the output file.
    with open(sys.argv[2], 'w', 'utf-8') as outfile:
        for entry in articles:
            entry.show(outfile)

    sys.stderr.write(
        "\nArticles found in file:\t%4i\nArticles converted:\t%4i\n"
        % (foundArticles, len(articles)))


# Guarded so that importing the module does not run the converter.
if __name__ == "__main__":
    main()