#! /usr/bin/awk -f
#
# Counts the frequency of each of the 20 standard amino acids in every
# protein of a FASTA file and prints one table row per protein:
#
#   <name, left-justified, 20 cols> <20 counts, 4 cols each> <net charge, 6 cols>
#
# The net charge is computed as (K + R) - (D + E).
# The protein name is taken from the third '|'-separated field of the first
# whitespace-delimited token on the header line (e.g. ">db|ACC|NAME ..."); if
# that field is missing, the token without its leading '>' is used instead.

BEGIN {
    nrProt = 0
    protLength = 0
    # Fixed column order. Iterating with "for (k in array)" is unordered in
    # awk, so every loop over amino acids walks this list to keep the counts
    # aligned with the column header.
    nAA = split("A C D E F G H I K L M N P Q R S T V W Y", aaOrder, " ")
}

# Header line: finish the previous protein's row (if any) and start a new one.
/>/ {
    split($1, temp, /\|/)
    nrProt++
    protLength = 0

    if (nrProt == 1)
        printHeader()          # column titles go out once, before the first row
    else
        printCounts(aaArray)   # counts and charge of the protein just completed

    protName = temp[3]
    if (protName == "") {
        # Header is not in ">x|y|name" form; fall back to the bare ID token.
        protName = $1
        sub(/^>/, "", protName)
    }
    printf("%-20s", protName)

    # Reset (and create) every counter so a protein without sequence lines
    # still produces a full row of zeros.
    for (i = 1; i <= nAA; i++)
        aaArray[aaOrder[i]] = 0
}

# Sequence line: accumulate residue counts and the running protein length.
# protLength is tracked but not printed by this script.
!/>/ {
    countAA($0, aaArray)
    protLength += length($0)
}

# At end of input the last protein's row is still pending.
END {
    if (nrProt > 0)
        printCounts(aaArray)
}

# printHeader()
#   Prints the column-title line: 20 blank columns for the name, the 20 amino
#   acid letters in aaOrder, then "charge".
function printHeader(    i) {
    printf("%20s", " ")
    for (i = 1; i <= nAA; i++)
        printf("%4s", aaOrder[i])
    printf("%6s\n", "charge")
}

# printCounts(a)
#   a: array indexed by one-letter amino acid code holding the counts.
#   Prints the 20 counts in aaOrder, the net charge (K + R - D - E) and a
#   newline, completing the row whose name was printed earlier.
function printCounts(a,    i) {
    for (i = 1; i <= nAA; i++)
        printf("%4i", a[aaOrder[i]])
    printf("%6i\n", a["K"] + a["R"] - a["D"] - a["E"])
}

# countAA(s, a)
#   s: one line of sequence (received by value; the caller's string is untouched).
#   a: array indexed by one-letter amino acid code; each entry is incremented
#      by the number of occurrences of that letter in s.
#   Only the 20 upper-case letters listed in aaOrder are counted.
function countAA(s, a,    i, aa) {
    for (i = 1; i <= nAA; i++) {
        aa = aaOrder[i]
        # gsub returns the number of substitutions made; replacing each
        # letter with itself turns it into a counter.
        a[aa] += gsub(aa, aa, s)
    }
}