usage: fastp -i <in1> -o <out1> [-I <in2> -O <out2>] [options...]

options:
# I/O options (input/output file settings)
  -i, --in1                          read1 input file name (string)
  -o, --out1                         read1 output file name (string [=])
  -I, --in2                          read2 input file name (string [=])
  -O, --out2                         read2 output file name (string [=])
  -6, --phred64                      indicates the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33)
  -z, --compression                  compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest, default is 2. (int [=2])
  --reads_to_process                 specify how many reads/pairs are to be processed. Default 0 means process all reads. (int [=0])

# adapter trimming options (adapter removal settings)
  -A, --disable_adapter_trimming     adapter trimming is enabled by default. If this option is specified, adapter trimming is disabled
  -a, --adapter_sequence             the adapter for read1. For SE data, if not specified, the adapter will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped. (string [=auto])
  --adapter_sequence_r2              the adapter for read2 (PE data only). This is used if R1/R2 are found not overlapped. If not specified, it will be the same as <adapter_sequence> (string [=])

# global trimming options (number of low-quality bases to trim from read starts and ends)
  -f, --trim_front1                  trim how many bases at the front of read1, default is 0 (int [=0])
  -t, --trim_tail1                   trim how many bases at the tail of read1, default is 0 (int [=0])
  -F, --trim_front2                  trim how many bases at the front of read2. If not specified, it follows read1's settings (int [=0])
  -T, --trim_tail2                   trim how many bases at the tail of read2. If not specified, it follows read1's settings (int [=0])

# polyG tail trimming, useful for NextSeq/NovaSeq data
  -g, --trim_poly_g                  force polyG tail trimming; by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data
  --poly_g_min_len                   the minimum length to detect polyG in the read tail. 10 by default. (int [=10])
  -G, --disable_trim_poly_g          disable polyG tail trimming; by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data

# polyX tail trimming
  -x, --trim_poly_x                  enable polyX trimming at 3' ends
  --poly_x_min_len                   the minimum length to detect polyX in the read tail. 10 by default. (int [=10])

# per-read cutting by quality options (sliding-window trimming)
  -5, --cut_by_quality5              enable per-read cutting by quality at the front (5'), default is disabled (WARNING: this will interfere with deduplication for both PE/SE data)
  -3, --cut_by_quality3              enable per-read cutting by quality at the tail (3'), default is disabled (WARNING: this will interfere with deduplication for SE data)
  -W, --cut_window_size              the size of the sliding window for sliding-window trimming, default is 4 (int [=4])
  -M, --cut_mean_quality             bases in a sliding window with mean quality below this value will be cut, default is Q20 (int [=20])

# quality filtering options (filter reads by base quality)
  -Q, --disable_quality_filtering    quality filtering is enabled by default. If this option is specified, quality filtering is disabled
  -q, --qualified_quality_phred      the quality value at which a base is qualified. Default 15 means phred quality >=Q15 is qualified. (int [=15])
  -u, --unqualified_percent_limit    what percent of bases are allowed to be unqualified (0~100). Default 40 means 40% (int [=40])
  -n, --n_base_limit                 if one read's number of N bases is >n_base_limit, then this read/pair is discarded. Default is 5 (int [=5])

# length filtering options (filter reads by length)
  -L, --disable_length_filtering     length filtering is enabled by default. If this option is specified, length filtering is disabled
  -l, --length_required              reads shorter than length_required will be discarded, default is 15 (int [=15])

# low complexity filtering
  -y, --low_complexity_filter        enable the low complexity filter. Complexity is defined as the percentage of bases that differ from the next base (base[i] != base[i+1])
  -Y, --complexity_threshold         the threshold for the low complexity filter (0~100). Default is 30, which means 30% complexity is required. (int [=30])

# filter reads with unwanted indexes (to remove possible contamination)
  --filter_by_index1                 specify a file containing a list of index1 barcodes to be filtered out, one barcode per line (string [=])
  --filter_by_index2                 specify a file containing a list of index2 barcodes to be filtered out, one barcode per line (string [=])
  --filter_by_index_threshold        the allowed difference of index barcodes for index filtering, default 0 means completely identical (int [=0])

# base correction by overlap analysis options (correct bases using read overlap)
  -c, --correction                   enable base correction in overlapped regions (only for PE data), default is disabled

# UMI processing
  -U, --umi                          enable unique molecular identifier (UMI) preprocessing
  --umi_loc                          specify the location of the UMI, can be one of index1/index2/read1/read2/per_index/per_read, default is none (string [=])
  --umi_len                          if the UMI is in read1/read2, its length should be provided (int [=0])
  --umi_prefix                       if specified, an underscore will be used to connect the prefix and the UMI (i.e. prefix=UMI, UMI=AATTCG, final=UMI_AATTCG). No prefix by default (string [=])
  --umi_skip                         if the UMI is in read1/read2, fastp can skip several bases following the UMI, default is 0 (int [=0])

# overrepresented sequence analysis
  -p, --overrepresentation_analysis  enable overrepresented sequence analysis
  -P, --overrepresentation_sampling  one in (--overrepresentation_sampling) reads will be computed for overrepresentation analysis (1~10000), smaller is slower, default is 20 (int [=20])

# reporting options
  -j, --json                         the JSON format report file name (string [=fastp.json])
  -h, --html                         the HTML format report file name (string [=fastp.html])
  -R, --report_title                 should be quoted with ' or ", default is "fastp report" (string [=fastp report])

# threading options (set the number of threads)
  -w, --thread                       worker thread number, default is 3 (int [=3])

# output splitting options
  -s, --split                        split output by limiting the total number of split files with this option (2~999); a sequential number prefix will be added to the output name (0001.out.fq, 0002.out.fq...), disabled by default (int [=0])
  -S, --split_by_lines               split output by limiting the lines of each file with this option (>=1000); a sequential number prefix will be added to the output name (0001.out.fq, 0002.out.fq...), disabled by default (long [=0])
  -d, --split_prefix_digits          the digits for the sequential number padding (1~10), default is 4, so the filename will be padded as 0001.xxx; 0 disables padding (int [=4])

# help
  -?, --help                         print this message
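Putting a few of these options together, a typical paired-end run might look like the following sketch (wrapped in Python's subprocess to match the rest of these notes; every file name is a placeholder):

import subprocess

# All file names here are placeholders; adjust to your own samples.
subprocess.run([
    "fastp",
    "-i", "sample_R1.fq.gz", "-I", "sample_R2.fq.gz",    # paired-end input
    "-o", "clean_R1.fq.gz", "-O", "clean_R2.fq.gz",      # filtered output
    "-l", "36",                                          # drop reads shorter than 36 bp
    "-w", "4",                                           # 4 worker threads
    "-j", "sample.fastp.json",                           # JSON report
    "-h", "sample.fastp.html",                           # HTML report
], check=True)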
Definitions:
RPKM: Reads Per Kilobase of exon model per Million mapped reads;
FPKM: Fragments Per Kilobase of exon model per Million mapped fragments.

Formula:
RPKM = C / ((N / 10^6) * (L / 10^3)) = 10^9 * C / (N * L)
where C is the number of reads mapped to the feature, N is the total number of mapped reads in the sample, and L is the feature length in bases. FPKM is computed identically, with fragments counted in place of reads (for paired-end data, a read pair counts as one fragment).
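As a quick sanity check, here is a minimal pandas sketch of the calculation (all counts and lengths are made-up toy numbers):

import pandas as pd

# Toy counts table: one row per gene
df = pd.DataFrame({
    "count": [100, 500, 250],      # C: reads mapped to each gene
    "length": [1000, 2000, 500],   # L: exon model length in bases
}, index=["geneA", "geneB", "geneC"])

N = df["count"].sum()              # N: total mapped reads in the sample

# RPKM = 10^9 * C / (N * L); FPKM is the same with fragments in place of reads
df["RPKM"] = 1e9 * df["count"] / (N * df["length"])
print(df)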
If we take the fragment length to be fixed, then the effective length is how many fragments can occur in the transcript. This turns out to be length - frag_len + 1. The number of fragments coming from a transcript will be proportional to this number, regardless of whether you sequenced one or both ends of the fragment. In turn, when it comes to probabilistically assigning reads to transcripts, the effective length plays a similar role again. Thus, for short transcripts, two different fragment lengths can give quite different effective lengths. To go back to your example: if you have a transcript of length 310, the effective length is 11 (if the fragment length is 300) or 161 (if the fragment length is 150), which explains the discrepancy you see.
From @Rob
The effective length is computed by using the fragment length distribution to determine the effective number of positions that can be sampled on each transcript. You can think of this as convolving the fragment length distribution with the characteristic function (the function that simply takes a value of 1) over the transcript. For example, if we observe fragments of length 50-1000, a position more than 1000 bases from the end of the transcript will contribute a value of 1 to the effective length, while a position 150 bases from the end will contribute a value of F(150), where F is the cumulative distribution function of the fragment length distribution. For single-end data, where we can't learn an empirical FLD, we use a Gaussian whose mean and standard deviation can be set with --fldMean and --fldSD respectively.
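A rough sketch of that computation for the single-end case, using a Gaussian fragment length distribution as described; the mean and SD values below are arbitrary placeholders standing in for --fldMean/--fldSD:

from scipy.stats import norm

def effective_length(tlen, fld_mean=250.0, fld_sd=25.0):
    # Each start position contributes F(d), where d is its distance to the
    # 3' end of the transcript and F is the CDF of the fragment length
    # distribution (here a Gaussian, as used for single-end data).
    F = norm(loc=fld_mean, scale=fld_sd).cdf
    return sum(float(F(tlen - pos)) for pos in range(tlen))

# Toy check against the fixed-length intuition: with mean 300 and a tight SD,
# a transcript of length 310 gets an effective length close to 310 - 300 + 1.
print(effective_length(310, fld_mean=300, fld_sd=1))    # ~11
print(effective_length(310, fld_mean=150, fld_sd=25))   # ~161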
From Harold Pimentel’s post above. He is in Lior Pachter’s group.
Effective length refers to the number of possible start positions at which a feature could have generated a fragment of that particular length. In practice, the effective length is usually computed as:

effective_length = length - μ_FLD + 1

where μ_FLD is the mean of the fragment length distribution, which was learned from the aligned reads. If the abundance estimation method you're using incorporates sequence bias modeling (such as eXpress or Cufflinks), the bias is often incorporated into the effective length by making the feature shorter or longer depending on the effect of the bias.
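A minimal sketch of that formula applied to a toy table of transcripts (the column names and the μ_FLD value are assumptions):

import pandas as pd

mu_fld = 187.0   # assumed mean of the learned fragment length distribution

# Hypothetical transcript lengths
txs = pd.DataFrame({"length": [310, 1500, 2500]}, index=["t1", "t2", "t3"])

# effective_length = length - mu_FLD + 1 (floored at 1 for very short features)
txs["effective_length"] = (txs["length"] - mu_fld + 1).clip(lower=1)
print(txs)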
import os
import glob
import pandas as pd

fc_dir = "lncRNA/normal/tiqu_trans"    # featureCounts output directory
sm_dir = "lncRNA/normal/salmon"        # salmon output directory (assumed path; one <sample>_quant dir per sample)
fc = glob.glob(os.path.join(fc_dir, "*.lncRNA.trans.counts"))

for fs in fc:
    sample_name = fs.replace("lncRNA/normal/tiqu_trans/", "").replace(".lncRNA.trans.counts", "")
    # featureCounts table: skip the comment line, keep Geneid and the count column
    df = pd.read_csv(fs, sep="\t", header=0, skiprows=1).iloc[:, [0, 1]].set_index("Geneid")
    df.columns = ['count']
    # salmon quant.sf for the same sample, indexed by transcript name
    df2 = pd.read_csv(os.path.join(sm_dir, "{}_quant".format(sample_name), "quant.sf"),
                      sep="\t", header=0).set_index("Name")
    # Attach salmon's EffectiveLength to the featureCounts counts
    df1 = df.join(df2[['EffectiveLength']])
    df1.to_csv(os.path.join(fc_dir, "{}.lncRNA.trans_eff.counts".format(sample_name)), sep="\t")
import glob
import pandas as pd

# Seed the matrix with one sample, then join the TPM column of every sample
df = pd.read_csv("sample.lncRNA.trans.TPM", sep="\t")
df = df[['TPM']]
for name in glob.glob("*.TPM"):
    sample = name.replace(".lncRNA.trans.TPM", "")
    df2 = pd.read_csv(name, sep="\t")
    df2 = df2[['TPM']]
    df2.columns = [sample]      # rename the TPM column to the sample name
    df = df.join(df2)           # joins on the (positional) row index
import numpy as np

gene_lengths = {}
j = 0
# Map over all gene groups (a gene and its associated transcripts).
# tgmap: transcript-to-gene table with gene column 'g'; ttable: salmon quant
# table indexed by transcript name (both assumed loaded in earlier steps).
for g, txps in tgmap.groupby('g').groups.items():
    if j % 500 == 1:
        print("Processed {} genes".format(j))
    j += 1
    # The set of transcripts present in our salmon index
    tset = [t for t in txps if t in ttable.index]
    # If at least one of the transcripts was present
    if len(tset) > 0:
        # The denominator is the sum of all TPMs
        totlen = ttable.loc[tset, 'TPM'].sum()
        # Turn the relative TPMs into a proper partition of unity
        if totlen > 0:
            tpm_fracs = ttable.loc[tset, 'TPM'].values / totlen
        else:
            tpm_fracs = np.ones(len(tset)) / float(len(tset))
        # Compute the gene's effective length as the abundance-weighted
        # sum of the transcript lengths
        elen = 0.0
        for i, t in enumerate(tset):
            elen += tpm_fracs[i] * ttable.loc[t, 'EffectiveLength']
        gene_lengths[g] = elen
# Give the table an effective length field
gtable['EffectiveLength'] = gtable.apply(
    lambda r: gene_lengths[r.name] if r.name in gene_lengths else 1.0, axis=1)
# Write it to the output file
gtable.to_csv(args.output, sep='\t', index_label='Name')
import os
import re
import subprocess
import pandas as pd
from Bio import SeqIO

hg19 = "LncRNA/testdir/all_mRNA_know.fa"
noncode = "LncRNA/testdir/all_lncRNA_know.fa"
base = "LncRNA/readscounts/salmon/gene_hit"

hg19_fasta = {seq_record.id: str(seq_record.seq) for seq_record in SeqIO.parse(hg19, 'fasta')}
lnc_fasta = {seq_record.id: str(seq_record.seq) for seq_record in SeqIO.parse(noncode, 'fasta')}
def get_energy(lncRNA, mRNA):
    # Write the two sequences to temporary FASTA files
    fa = "lnc_tmp.fasta"
    fa2 = "mRNA_tmp.fasta"
    with open(fa2, 'w') as f:
        f.write(">{}\n{}".format(mRNA, hg19_fasta[mRNA]))
    with open(fa, 'w') as l:
        l.write(">{}\n{}".format(lncRNA, lnc_fasta[lncRNA]))
    # cm3 is the subprocess.Popen handle of the RNA-RNA interaction tool run on
    # the two FASTA files; its invocation is not shown in the original snippet.
    out, err = cm3.communicate()
    ans = 0
    for line in out.decode("utf-8").splitlines():
        if not line.startswith(">"):
            # Pull the free energy, e.g. "(-12.34)", out of the result line
            ans = re.findall(r"\(-?\w+.?\w+\)", line)[0].replace("(", "").replace(")", "")
            # print(ans)
    os.remove(fa2)
    os.remove(fa)
    return ans
withopen("xiangguanxing2.txt") as f: for line2 in f: if line2.startswith("NONH"): lncRNA,mRNA,*ot=line2.split(" ") print("{}--{}:".format(lncRNA,mRNA),get_energy(lncRNA,mRNA))