import sys defcalculateGC(sequence:str)->Tuple[int,int]: """Calculate the GC content of a DNA sequence""" gc = 0 allnumber = 0 for i in sequence: if i != 'N'and i != 'n': allnumber += 1 if i == 'G'or i == 'C'or i == 'g'or i == 'c': gc += 1 return gc, allnumber
defmain(file:str): """Main function""" gcNum = 0 allNum = 0 withopen(file,'r') as f: for line in f: line=line.strip() if line.startswith('>'): continue else: gc, allnumber = calculateGC(line) gcNum += gc allNum += allnumber print(f'GC content is: {gcNum/allNum:.3f}')
main(sys.argv[1])
然后运行
codon build --release -o calGC calGC.py
最后速度为:
time ./calGC ../../Project/DataBase/hg38.fa GC content is: 0.410 ./calGC ../../Project/DataBase/hg38.fa 22.30s user 2.75s system 111% cpu 22.421 total
function lineGC(seq::String) GCnumber=count(x->(x=='G'||x=='C'||x=='g'||x=='c'),seq) lineNum=count(x->(x!='N' && x!='n'),seq) (GCnumber,lineNum) end
function calGC(fs) GCnumber=zero(Int) lineNum=zero(Int) open(fs,"r") do IOstream for line in eachline(IOstream) if startswith(line,">") continue else GC,all=lineGC(line) GCnumber+=GC lineNum+=all end end end round(GCnumber/lineNum;digits=3) end
println("GC content: ",calGC(ARGS[1]))
python
import sys
deflineGC(seq): tmp=[base for base in seq if base =="G"or base =="g"or base == "C"or base == "c"] gcNumber=len(tmp) tmp2=[base for base in seq if base !="N"and base !="n"] allNumber=len(tmp2) return (gcNumber,allNumber)
withopen(sys.argv[1],'r') as f: gcNum=0 allNum=0 for line in f: if line.startswith(">"): continue else: gc,alln=lineGC(line.strip("\n")) gcNum=gcNum+gc allNum=allNum+alln
withopen(sys.argv[1],'r') as f: gcNum=0 allNum=0 for line in f: if line.startswith(">"): continue else: gc,alln=lineGC(line.strip("\n")) gcNum=gcNum+gc allNum=allNum+alln
print("GC content: {:.3f}".format(gcNum/allNum))
这样计算下来,大概需要6分20秒,提速了一半
另外,我们也可以使用NumPy的向量化运算来提速
import sys from pyfaidx import Fasta import numpy as np
function lineGC(seq) GCnumber=count(x->(x==DNA_G||x==DNA_C),seq) lineNum=length(seq)-count(isambiguous,seq) GCnumber,lineNum end
function calGC(fs) GCnumber=zero(Int) lineNum=zero(Int) reader=open(FASTA.Reader,fs) for record in reader GC,all=lineGC(FASTA.sequence(record)) GCnumber+=GC lineNum+=all end close(reader) round(GCnumber/lineNum;digits=3) end