#! /usr/bin/env python _usage=""" USAGE: quickChange.py fastaFile mutationFile given a fastafile with one or more sequences with unique names and a list containing a desired mutation per line in the following format: sequenceName pos wt mut [pos wt mut] For example: a20r 120 A C a20r 240 C G 320 A C The mutations in each line have to be able to be mutated with one primer, otherwise the mutation if omitted from analysis The following guidlines for primer design are applied: 1) >=1 G or C on both ends of the target 2) 25-50 bases 3) Tm = 81.5 + 0.41*(%GC) - 675/N - %mismatch >= 78¡C 4) mutation should be in the middle of the primer 5) %GC >= 40% These rules rules are applied in a strict sense (SCORE) and in a more fuzzy sense (fSCORE; place more emphasis on melting temp; allow deviations; linear scoring models) """ import sys, re from Seq.SeqIO import readFasta from string import lower, upper from Seq import Seq from copy import copy from time import localtime from math import ceil, floor _debug=[] _version='v0.5 wolfgang resch 030306' _minLen=25 _maxLen=50 _minMeltTemp=78.0 _minFlankingNucl=10 _minGCClamp=1 _maxSpan=15 _minGC=40.0 _outputPrimers=5 #================================================================== class mutation: def __init__(self): self.seqName="" self.mutDict={} def __str__(self): temp="Sequence: %-20s" % self.seqName for pos,change in self.mutDict.items(): temp += "\n\tPos: %4i wt:%2s mut:%2s" % (pos, change[0],change[1]) return temp #================================================================== class primer: pass def __str__(self): output=80*'='+'\n' output += str(self.mutation) output += "\n"+60*'-' + '\n' output += "%s wt\n" % self.wtSeq mutations = list(self.length*".") mutSeq_pos = list(self.wtSeq) for i in self.mutation.mutDict.keys(): mutations[i-self.startPos]=self.mutation.mutDict[i][1] mutSeq_pos[i-self.startPos]=self.mutation.mutDict[i][1] mutSeq_pos = ''.join(mutSeq_pos) output += ''.join(mutations) + ' mut' + '\n\n' output += "Target start position: %4i\n" % self.startPos output += "Target end position: %4i\n" % self.endPos output += "Length: %3i\n" % self.length output += "Distance of first mutation from primer end: %i\n" % (min(self.mutation.mutDict.keys())-self.startPos) output += "Distance of last mutation from primer end: %i\n" % (self.endPos-max(self.mutation.mutDict.keys())) output += "%%GC: %3.1f\n" % self.gc_percentage output += "GC clamp upstream = %i\n" % self.gcClampUp output += "GC clamp downstream = %i\n" % self.gcClampDw output += "Melting temperature: %3.1f\n" % self.meltingTemp output += "\n SCORE: %i / 5\n" % self.score output += "fSCORE: %f\n" % self.fscore output += +60*'-' + '\nORDERING INFORMATION\n' output += "5' - " i=0 while i <= self.length: output += mutSeq_pos[i:i+3] + " " i += 3 output += " -3' , %i nts\n5' - " % self.length mutSeqRevComp = reverseComplement53(mutSeq_pos) i=0 while i <= self.length: output += mutSeqRevComp[i:i+3] + " " i += 3 output += " -3' , %i nts\n" % self.length output += 80*'=' + "\n" return output #================================================================== def reverseComplement53(seqString): """returns reverse complement, but in 5'-3' orientation""" dict={"C":"G","G":"C","A":"T","T":"A"} revComp=[] i=-1 while 1: try: revComp.append(dict[seqString[i]]) i -= 1 except IndexError: break return ''.join(revComp) #================================================================== def readMutations(filename): try: mutationFile=open(sys.argv[2], "r") except IOError, msg: sys.exit(msg) mutationList=[] recordList=mutationFile.readlines() for record in recordList: if record == '\n': continue temp=record.split() tempMut=mutation() tempMut.seqName=temp.pop(0).lower() #sequence names are lowercase ! temp = map(upper, temp) while not(len(temp)<3): tempMut.mutDict[int(temp[0])]=(temp[1],temp[2]) temp[0:3]=[] mutationList.append(tempMut) if 'readMutations' in _debug: print tempMut mutationFile.close() return mutationList #================================================================== def meltingTemp(length, mismatch_percentage, gc_percentage): return 81.5+0.41*gc_percentage-675/length-mismatch_percentage def abs(a): if a >= 0: return a else: return -a def writeConstraints(outfile): temp='/'.join(map(str,localtime()[0:3])) + ' ' + ':'.join(map(str,localtime()[3:6]))+' '+_version+'\n' temp+=40*'='+'\nCONSTAINTS \n' + 40*'=' + '\n' temp += "Max Length:\t\t%4i\nMin Length:\t\t%4i\n" % (_maxLen, _minLen) temp += "MinMeltTemp:\t\t%4.1fC\n" % _minMeltTemp temp += "MinFlankingNucl:\t%4i\n" % _minFlankingNucl temp += "MinGCClamp:\t\t%4i\n" % _minGCClamp temp += "MinGCContent:\t\t%4.1f\n\n\n" % _minGC outfile.write(temp) def writeHistogram(title, scores, outfile): temp=title + '\n' minScores=floor(min(scores)) maxScores=ceil(max(scores)) binNr=10 span=maxScores-minScores counts=binNr*[0] for i in scores: counts[int((i-minScores)/span*binNr)] += 1 for i in range(len(counts)): start = i/float(binNr)*span+minScores stop = (i+1)/float(binNr)*span+minScores temp += "[% 3.2f - % 3.2f[ " % (start, stop)+ counts[i]*'*' + '\n' temp += '\n' outfile.write(temp) #================================================================== def primerData(seqString, mismatchNr): temp=primer() temp.wtSeq=seqString temp.length=len(seqString) temp.gc_percentage = round(len(filter(lambda s: (s=="G") or (s=="C"), seqString.upper()))/float(temp.length)*100) temp.mismatch_percentage = round(mismatchNr/float(temp.length)*100) #test for gc clamp on both ends: temp.gcClampUp=0 temp.gcClampDw=0 i=0 while 1: if seqString[i] == 'G' or seqString[i] == 'C': temp.gcClampUp += 1 i += 1 else: break i=-1 while 1: if seqString[i] == 'G' or seqString[i] == 'C': temp.gcClampDw += 1 i -= 1 else: break temp.meltingTemp=meltingTemp( temp.length, temp.mismatch_percentage, temp.gc_percentage ) temp.score=0 temp.score += (temp.length >= _minLen) and (temp.length <= _maxLen) temp.score += temp.gc_percentage >= 40 temp.score += temp.gcClampUp >= _minGCClamp temp.score += temp.gcClampDw>= _minGCClamp temp.score += (temp.meltingTemp >= _minMeltTemp) temp.fscore=0 #use linear models to punish deviation from desired values if (temp.length >= _minLen) and (temp.length <= _maxLen): temp.fscore += 1.0 elif temp.length < _minLen: temp.fscore += 1.0 - abs(temp.length-_minLen)*0.2 else: temp.fscore += 1.0 - abs(_maxLen-temp.length)*0.2 temp.fscore += min(temp.gc_percentage*0.2-7.0, 1.0) temp.fscore += min((temp.gcClampUp + temp.gcClampDw)/10.0, 0.5) temp.fscore += min(-temp.meltingTemp*0.1+9.8, temp.meltingTemp*0.45-33.1) return temp #================================================================== def main(): if len(sys.argv) != 3: print _usage sequenceDict=readFasta(filename=sys.argv[1], degap=0, output='dict') mutationList = readMutations(filename=sys.argv[2]) for mutation in mutationList: if not sequenceDict.has_key(mutation.seqName): print "Sequence %s not found in fasta file" % mutation.seqName else: seq=sequenceDict[mutation.seqName] positions=mutation.mutDict.keys() positions.sort() mutNr=len(positions) span=max(positions)-min(positions) #make sure positions are within 15 nucleotides of each other if span > _maxSpan: print "The following mutagenesis definition contained mutations spaced too widely:" print mutation continue ####### if 'main' in _debug: print 85*'-' for pos in positions: print "Seq: %20s Pos: %4i nucl. in sequence: %2s wt given: %2s mutant: %2s" % (mutation.seqName,\ pos, seq[pos-1], mutation.mutDict[pos][0], \ mutation.mutDict[pos][1]) print 85*'-' ####### startMax=min(positions)-_minFlankingNucl startList = range(startMax-20, startMax) endMin=max(positions)+_minFlankingNucl endList=range(endMin, endMin+20) #test primers with starting and ending positions that make somewhat symmetrical primers primerList=[] posListLength = len(startList) for pos in xrange(posListLength): for end in endList[max(0,posListLength-(pos+4)):min(posListLength,posListLength-(pos-4))]: mutPrimer = primerData(seqString=seq.getSeq1(startList[pos], end), mismatchNr= mutNr) mutPrimer.startPos=startList[pos] mutPrimer.endPos=end mutPrimer.mutation=copy(mutation) primerList.append(copy(mutPrimer)) bestfPrimers=[] fscores=[] scores=[] while primerList !=[]: p=primerList.pop(0) bestfPrimers = filter(lambda a: a.fscore > p.fscore, bestfPrimers) if len(bestfPrimers) < _outputPrimers: bestfPrimers.append(p) fscores.append(p.fscore) scores.append(p.score) temp=min(mutation.mutDict.keys()) outfile = open("%s_%i%s%s.quickchange" % (mutation.seqName, temp, mutation.mutDict[temp][0],mutation.mutDict[temp][1]),'w') writeConstraints(outfile) writeHistogram("fScore distribution", fscores, outfile) for p in bestfPrimers: outfile.write(str(p)) outfile.close() #================================================================== if __name__ == '__main__': main()