#! /usr/bin/python
"""Scan FASTA sequences for candidate siRNA target motifs.

Run as a script: ``siRNA.py fastafile``.  See ``_usage`` for details.
"""

# NOTE: every print below is written as a call with exactly ONE argument.
# That form produces the same output under the Python 2 print statement and
# the Python 3 print function, so no ``__future__`` import is needed.

import random
import re
import sys

import Seq
import SeqIO

_usage = """
USAGE: siRNA.py fastafile

Reads in a fasta format sequence, identifies all AAN(19)TT and AAN(21) motifs,
and outputs a table with the motif positions and GC-content.

$Id: siRNA.py,v 1.1.1.1 2003/03/03 15:46:33 wresch Exp $"""

# Number of leading nucleotides of each motif (AA + N19) that are used for
# the GC content and for the complementary strand shown in the table.
_CORE_LENGTH = 21

# Watson-Crick complements for the unambiguous DNA alphabet.
_COMPLEMENT = {"G": "C", "C": "G", "A": "T", "T": "A"}


def main():
    """Read the FASTA file named on the command line and print motif tables.

    For every sequence returned by ``SeqIO.readFasta`` two tables are
    printed: one for the strict AAN(19)TT motif and one for the relaxed
    AAN(21) motif.

    Exits with status 1 after printing the usage text when the script is not
    called with exactly one argument.
    """
    if len(sys.argv) != 2:
        print(_usage)
        # Without this exit the code below would index sys.argv[1] anyway
        # (IndexError with no arguments, silently ignored extras otherwise).
        sys.exit(1)

    # read sequences in fasta format
    seqs = SeqIO.readFasta(filename=sys.argv[1], degap=1, output='list')

    # strict rule: AA, 19 arbitrary nucleotides, TT
    strictDEF = r"AA.{19,19}TT"
    strictREO = re.compile(strictDEF, re.I)

    # more relaxed: AA followed by any 21 nucleotides
    relaxedDEF = r"AA.{19,19}.."
    relaxedREO = re.compile(relaxedDEF, re.I)

    # find targets for all sequences
    for seq in seqs:
        outputMatches(strictREO, "STRICT MATCHES", seq.getSeq())
        outputMatches(relaxedREO, "RELAXED MATCHES", seq.getSeq())


def _iterOverlapping(reo, seq):
    """Yield one match of ``reo`` per start position in ``seq``.

    ``finditer`` only reports non-overlapping matches, which would drop any
    motif that begins inside a previously reported one.  Restarting the
    search one character after each match start reports them all.
    """
    pos = 0
    while True:
        match = reo.search(seq, pos)
        if match is None:
            return
        yield match
        pos = match.start() + 1


def outputMatches(reo, comment, seq):
    """Print a table of all matches of ``reo`` in ``seq``.

    Parameters:
        reo     -- compiled regular expression describing the motif
        comment -- title printed above the table
        seq     -- nucleotide sequence as a string; it is upper-cased
                   before searching

    Each match produces two lines: the running number, the 0-based start
    and end offsets (as reported by the match object), the motif without its
    first two nucleotides, and the GC percentage of the first 21
    nucleotides; below it the complement of those 21 nucleotides.
    Overlapping matches are all reported.  Symbols other than A, C, G and T
    are complemented to "N" and do not count towards the GC content.
    """
    rule = 70 * "-"
    # The trailing blanks after the rule and the title reproduce what the
    # original comma-separated Python 2 print statement emitted.
    print("%s " % rule)
    print("%s " % comment)
    print("%4s %10s %10s %27s %11s"
          % ("#", "from", "to", "structure", "%GC[1-21]"))
    print(rule)

    for index, target in enumerate(_iterOverlapping(reo, seq.upper())):
        motif = target.group()
        core = motif[:_CORE_LENGTH]

        # "." in the motif also matches ambiguity codes such as N; map them
        # to N instead of failing with a KeyError.
        complement = "".join([_COMPLEMENT.get(nt, "N") for nt in core])
        gc = (core.count("G") + core.count("C")) / float(_CORE_LENGTH) * 100.0

        print("%4i %10i %10i 5'P--%-24s %8.1f"
              % (index + 1, target.start(), target.end(), motif[2:], gc))
        print("%4s %10s %10s %24s--P-5'" % ("", "", "", complement))


if __name__ == "__main__":
    main()