1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
| import re
def complement(seq): return seq.translate(str.maketrans('ACGT', 'TGCA'))
def revcomp(seq): return complement(seq)[::-1]
def stop_test_for_Z(n, cds_lst, trap): new_cds_lst = [cds for cds in cds_lst if trap in cds] score_lst = [] if len(new_cds_lst) > 0: for haha in new_cds_lst: m = haha.find(trap) + 1 + 10 score = (m - 1 + n - 1) score_lst.append(score) if len([new_score for new_score in score_lst if score % 3 == 0]) > 0: return 1 else: return 0 else: return 0
def stop_test_for_F(n, cds_lst, trap): trap = revcomp(trap) new_cds_lst = [cds for cds in cds_lst if trap in cds] score_lst = [] if len(new_cds_lst) > 0: for haha in new_cds_lst: m = haha.find(trap) + 1 + 10 + 23 score = (m - n) score_lst.append(score) if len([new_score for new_score in score_lst if score % 3 == 0]) > 0: return 1 else: return 0 else: return 0
def read_cds(fs): fasta = {} with open(fs, 'r') as f: for line in f: if line.startswith(">"): seq_lst = [] name = line.strip("\n").replace(">", "") else: seq_lst.append(line.strip("\n")) fasta[name] = "".join(seq_lst) new_fasta = {key: value for key, value in fasta.items() if "unavailable" not in value} return new_fasta
def read_seq(fs): fasta = {} with open(fs, 'r') as f: for line in f: if line.startswith(">"): seq_lst = [] name = line.strip("\n").replace(">", "") else: seq_lst.append(line.strip("\n")) fasta[name] = seq_lst return fasta
def read_ref(fs): ref = {} with open(fs, 'r') as f: for line in f: if line.startswith(">"): name = line.strip("\n").replace(">", "") else: ref[name] = line.strip("\n")[118:155] return ref
def read_duiying(fs): duiying = {} with open(fs, 'r') as f: for line in f: lable, gene = line.split("\t") duiying[lable.strip("\n")] = gene.strip("\n") return duiying
def motif_Z(trap, site): if trap[10:30][site: site + 3] in ["CAG", "CGA", "CAA"]: return 1 else: return 0
def motif_F(trap, site): if "CCA" in trap[10:30][site-2: site + 3]: return 1 else: return 0
lable_map_gene = read_duiying("../duiying.txt") all_cds = read_cds("../mart_export.txt") all_ref = read_ref("ref.fa") all_trapseq = read_seq("new508.filter.fa") with open("pass_lst_f.txt", 'a') as wocao: wocao.write( "TRAP ID\tGene\tgRNA sequence\tstrand\tC>T position\tC>T efficiency\n") with open("pass_lst_z.txt", 'a') as wocao: wocao.write( "TRAP ID\tGene\tgRNA sequence\tstrand\tC>T position\tC>T efficiency\n") for name, traps in all_ref.items(): gene = lable_map_gene[name] cds_lst = [vals for keys, vals in all_cds.items() if gene in keys] pass_lst = [] for m in re.finditer("C", traps[10:30]): if motif_Z(traps, m.start()): if stop_test_for_Z((m.start() + 1), cds_lst, traps) and name in list(all_trapseq.keys()): i = 0 for gg in all_trapseq[name]: if gg[10:30][m.start()] == "T": i = i + 1 eff = i/len(all_trapseq[name]) pass_lst.append( [name, traps[10:33], "+", (m.start() + 1), eff]) if len(pass_lst) > 0: with open("pass_lst_z.txt", 'a') as wocao: for nima in pass_lst: [trap_id, trap_seq, strand, ctposition, efftion] = nima wocao.write("{}\t{}\t{}\t{}\t{}\t{}\n".format( trap_id, gene, trap_seq, strand, ctposition, efftion))
for name, traps in all_ref.items(): gene = lable_map_gene[name] cds_lst = [vals for keys, vals in all_cds.items() if gene in keys] pass_lst = [] for m in re.finditer("C", traps[10:30]): if motif_F(traps, m.start()): if stop_test_for_F(((traps[10:30][m.start()-2: m.start() + 3]).find("CCA")+1+2), cds_lst, traps) and name in list(all_trapseq.keys()): j = 0 for gg in all_trapseq[name]: if gg[10:30][m.start()] == "T": j = j + 1 eff = j/len(all_trapseq[name]) pass_lst.append( [name, traps[10:33], "-", (m.start() + 1), eff]) if len(pass_lst) > 0: with open("pass_lst_f.txt", 'a') as wocao: for nima in pass_lst: [trap_id, trap_seq, strand, ctposition, efftion] = nima wocao.write("{}\t{}\t{}\t{}\t{}\t{}\n".format( trap_id, gene, trap_seq, strand, ctposition, efftion))
|