import re
def complement(seq): return seq.translate(str.maketrans('ACGT', 'TGCA'))
def revcomp(seq): return complement(seq)[::-1]
def stop_test_for_Z(n, cds_lst, trap): new_cds_lst = [cds for cds in cds_lst if trap in cds] score_lst = [] if len(new_cds_lst) > 0: for haha in new_cds_lst: m = haha.find(trap) + 1 + 10 score = (m - 1 + n - 1) score_lst.append(score) if len([new_score for new_score in score_lst if score % 3 == 0]) > 0: return 1 else: return 0 else: return 0
def stop_test_for_F(n, cds_lst, trap): trap = revcomp(trap) new_cds_lst = [cds for cds in cds_lst if trap in cds] score_lst = [] if len(new_cds_lst) > 0: for haha in new_cds_lst: m = haha.find(trap) + 1 + 10 + 23 score = (m - n) score_lst.append(score) if len([new_score for new_score in score_lst if score % 3 == 0]) > 0: return 1 else: return 0 else: return 0
def read_cds(fs): fasta = {} with open(fs, 'r') as f: for line in f: if line.startswith(">"): seq_lst = [] name = line.strip("\n").replace(">", "") else: seq_lst.append(line.strip("\n")) fasta[name] = "".join(seq_lst) new_fasta = {key: value for key, value in fasta.items() if "unavailable" not in value} return new_fasta
def read_seq(fs): fasta = {} with open(fs, 'r') as f: for line in f: if line.startswith(">"): seq_lst = [] name = line.strip("\n").replace(">", "") else: seq_lst.append(line.strip("\n")) fasta[name] = seq_lst return fasta
def read_ref(fs): ref = {} with open(fs, 'r') as f: for line in f: if line.startswith(">"): name = line.strip("\n").replace(">", "") else: ref[name] = line.strip("\n")[118:155] return ref
def read_duiying(fs): duiying = {} with open(fs, 'r') as f: for line in f: lable, gene = line.split("\t") duiying[lable.strip("\n")] = gene.strip("\n") return duiying
def motif_Z(trap, site): if trap[10:30][site: site + 3] in ["CAG", "CGA", "CAA"]: return 1 else: return 0
def motif_F(trap, site): if "CCA" in trap[10:30][site-2: site + 3]: return 1 else: return 0
lable_map_gene = read_duiying("../duiying.txt") all_cds = read_cds("../mart_export.txt") all_ref = read_ref("ref.fa") all_trapseq = read_seq("new508.filter.fa") with open("pass_lst_f.txt", 'a') as wocao: wocao.write( "TRAP ID\tGene\tgRNA sequence\tstrand\tC>T position\tC>T efficiency\n") with open("pass_lst_z.txt", 'a') as wocao: wocao.write( "TRAP ID\tGene\tgRNA sequence\tstrand\tC>T position\tC>T efficiency\n") for name, traps in all_ref.items(): gene = lable_map_gene[name] cds_lst = [vals for keys, vals in all_cds.items() if gene in keys] pass_lst = [] for m in re.finditer("C", traps[10:30]): if motif_Z(traps, m.start()): if stop_test_for_Z((m.start() + 1), cds_lst, traps) and name in list(all_trapseq.keys()): i = 0 for gg in all_trapseq[name]: if gg[10:30][m.start()] == "T": i = i + 1 eff = i/len(all_trapseq[name]) pass_lst.append( [name, traps[10:33], "+", (m.start() + 1), eff]) if len(pass_lst) > 0: with open("pass_lst_z.txt", 'a') as wocao: for nima in pass_lst: [trap_id, trap_seq, strand, ctposition, efftion] = nima wocao.write("{}\t{}\t{}\t{}\t{}\t{}\n".format( trap_id, gene, trap_seq, strand, ctposition, efftion))
for name, traps in all_ref.items(): gene = lable_map_gene[name] cds_lst = [vals for keys, vals in all_cds.items() if gene in keys] pass_lst = [] for m in re.finditer("C", traps[10:30]): if motif_F(traps, m.start()): if stop_test_for_F(((traps[10:30][m.start()-2: m.start() + 3]).find("CCA")+1+2), cds_lst, traps) and name in list(all_trapseq.keys()): j = 0 for gg in all_trapseq[name]: if gg[10:30][m.start()] == "T": j = j + 1 eff = j/len(all_trapseq[name]) pass_lst.append( [name, traps[10:33], "-", (m.start() + 1), eff]) if len(pass_lst) > 0: with open("pass_lst_f.txt", 'a') as wocao: for nima in pass_lst: [trap_id, trap_seq, strand, ctposition, efftion] = nima wocao.write("{}\t{}\t{}\t{}\t{}\t{}\n".format( trap_id, gene, trap_seq, strand, ctposition, efftion))
|