├── biastools ├── __init__.py ├── biastools_compare.sh ├── vcf_to_bed.py ├── biastools_predict.sh ├── filter_het_VCF.py ├── biastools_align.sh ├── predict_model.py ├── biastools_simulation.sh ├── biastools_analysis.sh ├── compare_bias_with_RD.py ├── sample_baseline.py ├── merge_baseline.py ├── biastools_scan.py ├── golden_graph_report.py ├── biastools.py ├── predict_experiment.py ├── indel_balance_plot.py ├── golden_graph.py ├── scanning_bias.py ├── ref_bi_naive.py └── consensus_vcf_map_adaptive.py ├── tutorial ├── HG002.chr20.part.vcf.gz ├── run.sh └── README.md ├── figures ├── context-aware-assignment.png ├── HG002.GIAB.4.2.1.demo.indel_balance.png └── context_aware.md ├── .gitignore ├── LICENSE ├── setup.py └── README.md /biastools/__init__.py: -------------------------------------------------------------------------------- 1 | #biastools 2 | import biastools.biastools 3 | import biastools.biastools_scan 4 | -------------------------------------------------------------------------------- /tutorial/HG002.chr20.part.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maojanlin/biastools/HEAD/tutorial/HG002.chr20.part.vcf.gz -------------------------------------------------------------------------------- /figures/context-aware-assignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maojanlin/biastools/HEAD/figures/context-aware-assignment.png -------------------------------------------------------------------------------- /figures/HG002.GIAB.4.2.1.demo.indel_balance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maojanlin/biastools/HEAD/figures/HG002.GIAB.4.2.1.demo.indel_balance.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.vcf 2 | *.fna 3 | *.fa 4 | *.vcf.* 5 | *.?am 6 | *.f* 7 | find_reads_given_HET.py 8 | region_spec_refbi.py 9 | old_biastools.sh 10 | -------------------------------------------------------------------------------- /tutorial/run.sh: -------------------------------------------------------------------------------- 1 | biastools --simulate -g grch38_chr20_part.fa -v HG002.chr20.part.vcf.gz -s HG002_part -r tutorial 2 | biastools --align -a bowtie2 -g grch38_chr20_part.fa -v HG002.chr20.part.vcf.gz -s HG002_part -r tutorial 3 | biastools --analyze -g grch38_chr20_part.fa -v HG002.chr20.part.vcf.gz -s HG002_part -r tutorial 4 | biastools_scan --scan -g grch38_chr20_part.fa -s HG002_part -r tutorial -i out_dir/HG002_part.tutorial.sorted.bam 5 | -------------------------------------------------------------------------------- /biastools/biastools_compare.sh: -------------------------------------------------------------------------------- 1 | path_out=$1 2 | sample_id=$2 3 | run_id=$3 4 | target_bed=$4 5 | improve_bed=$5 6 | improve_lowRd=$6 7 | path_module=$7 8 | prefix=${path_out}/${sample_id} 9 | 10 | bedtools subtract -a ${improve_bed} -b ${improve_lowRd} > ${prefix}.improve.goodRd.bias.bed 11 | bedtools intersect -a ${target_bed} -b ${improve_lowRd} > ${prefix}.improve.skipped.bias.bed 12 | 13 | python3 ${path_module}compare_bias_with_RD.py -lt ${target_bed} -li ${prefix}.improve.goodRd.bias.bed -lrd ${prefix}.improve.skipped.bias.bed -out ${prefix}.${run_id}.improve.bias.bed 14 | #python3 check_inside_centromere.py -lr1 
centromere_extend.bed -lr2 ${prefix}.${run_id}.improve.bias.bed 15 | -------------------------------------------------------------------------------- /biastools/vcf_to_bed.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pysam 3 | 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser(description="Generate a BED file from a VCF file.") 7 | parser.add_argument('-v', '--vcf', help='list of vcf files for input', required=True) 8 | parser.add_argument('-o', '--out', help='output bed file', required=True) 9 | args = parser.parse_args() 10 | 11 | vcf_path = args.vcf 12 | vcf = pysam.VariantFile(vcf_path) 13 | fo = open(args.out, 'w') 14 | for record in vcf: 15 | chrom = record.chrom 16 | start = record.start 17 | end = record.stop 18 | fo.write(f"{chrom}\t{start}\t{end}\n") 19 | fo.close() 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /biastools/biastools_predict.sh: -------------------------------------------------------------------------------- 1 | path_out=$1 2 | sample_id=$2 3 | run_id=$3 4 | flag_real=$4 5 | report_real=$5 6 | report_simulation=$6 7 | path_module=$7 8 | prefix=${path_out}/${sample_id} 9 | 10 | 11 | if [[ ${report_real} == 'none' ]]; then 12 | report_real=${prefix}.real.${run_id}.bias 13 | fi 14 | 15 | if [[ ${flag_real} == 1 || ${report_simulation} == 'none' ]]; then 16 | echo "[Biastools] Real report bias prediction." 17 | python3 ${path_module}predict_model.py -rr ${report_real} -out ${prefix}.real.${run_id} 18 | else 19 | echo "[Biastools] Bias prediction based on simulation report!" 20 | python3 ${path_module}predict_experiment.py -sr ${report_simulation} \ 21 | -rr ${report_real} \ 22 | -out ${prefix}.sim.${run_id} 23 | fi 24 | -------------------------------------------------------------------------------- /tutorial/README.md: -------------------------------------------------------------------------------- 1 | # Tutorial: running biastools 2 | 3 | In the tutorial, there are two initial files: 4 | - ```grch38_chr20_part.fa```, which contains the first 506,000 bases of chromosome 20 of GRCh38 5 | - ```HG002.chr20.part.vcf.gz```, which is the VCF file containing the first 1612 variants of HG002's chr20 called by the Q100 project 6 | 7 | After installation, the user can run the ```run.sh``` script, which simulates reads from the reference genome and VCF file, aligns the simulated reads with Bowtie 2, 8 | and analyzes the alignment with the context-aware assignment method. Finally, the biastools scan mode is used to scan the whole alignment BAM file. 9 | 10 | If biastools is not installed, users can also directly call 11 | ``` 12 | python3 biastools/biastools.py 13 | ``` 14 | and 15 | ``` 16 | python3 biastools/biastools_scan.py 17 | ``` 18 | to run the procedure.
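For concreteness, below is a minimal sketch of the non-installed invocation, mirroring the commands in ```run.sh``` via Python's `subprocess` (the same way the biastools wrappers drive external tools). It assumes it is run from the repository root and that the wrapper scripts accept the same arguments as the installed `biastools`/`biastools_scan` entry points.

```python
# Sketch only: mirrors run.sh, but calls the wrapper scripts directly without installation.
# Assumes the working directory is the repository root with the tutorial files present.
import subprocess

common = ["-g", "grch38_chr20_part.fa", "-v", "HG002.chr20.part.vcf.gz",
          "-s", "HG002_part", "-r", "tutorial"]

subprocess.run(["python3", "biastools/biastools.py", "--simulate"] + common, check=True)
subprocess.run(["python3", "biastools/biastools.py", "--align", "-a", "bowtie2"] + common, check=True)
subprocess.run(["python3", "biastools/biastools.py", "--analyze"] + common, check=True)
subprocess.run(["python3", "biastools/biastools_scan.py", "--scan",
                "-g", "grch38_chr20_part.fa", "-s", "HG002_part", "-r", "tutorial",
                "-i", "out_dir/HG002_part.tutorial.sorted.bam"], check=True)
```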
19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Mao-Jan Lin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /biastools/filter_het_VCF.py: -------------------------------------------------------------------------------- 1 | #program to find HET sites from VCF 2 | import argparse 3 | import pysam 4 | 5 | def parse_het_site(fn_vcf, fn_output): 6 | in_vcf_file = pysam.VariantFile(fn_vcf, 'r') 7 | out_vcf_file = pysam.VariantFile(fn_output, 'w', header=in_vcf_file.header) 8 | for segment in in_vcf_file: 9 | #hap_info = str(segment).split()[9].split('|') # "0|0", "1|0", "0|1" tag 10 | #if hap_info[0] != hap_info[1]: 11 | phase_info = segment.samples[0]['GT'] 12 | if len(phase_info) != 2: 13 | print("WARNING! non diploid haplotype.") 14 | continue 15 | hap_0, hap_1 = phase_info 16 | if hap_0 == None or hap_1 == None: 17 | print("WARNING! one haplotype information is missing.") 18 | continue 19 | if hap_0 + hap_1 != 0: 20 | out_vcf_file.write(segment) 21 | in_vcf_file.close() 22 | out_vcf_file.close() 23 | 24 | if __name__ == "__main__": 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('-v', '--vcf', help='vcf/vcf.gz file for chromosomes') 27 | parser.add_argument('-o', '--out', help='output vcf.gz file with HET sites') 28 | args = parser.parse_args() 29 | 30 | fn_vcf = args.vcf 31 | fn_output = args.out 32 | 33 | parse_het_site(fn_vcf, fn_output) 34 | -------------------------------------------------------------------------------- /biastools/biastools_align.sh: -------------------------------------------------------------------------------- 1 | path_ref=$1 2 | path_vcf=$2 3 | path_out=$3 4 | sample_id=$4 5 | THR=$5 6 | ALN=$6 7 | ALN_IDX=$7 8 | run_id=$8 9 | path_module=$9 10 | prefix=${path_out}/${sample_id} 11 | 12 | echo "[Biastools] Align sequences to the original reference" 13 | if [[ ${ALN_IDX} == 'none' ]]; then 14 | ALN_IDX=${path_ref} 15 | fi 16 | 17 | if [[ ${ALN} == "bowtie2" ]]; then 18 | echo "[Biastools] Align with bowtie2" 19 | if [ ! 
-f ${ALN_IDX}.1.bt2 ]; then 20 | bowtie2-build ${path_ref} ${ALN_IDX} 21 | fi 22 | bowtie2 -p ${THR} -x ${ALN_IDX} --rg-id ${run_id}_hapA --rg SM:${sample_id} -1 ${prefix}.hapA_1.fq.gz -2 ${prefix}.hapA_2.fq.gz |\ 23 | samtools sort -o ${prefix}.hapA.${run_id}.sorted.bam 24 | bowtie2 -p ${THR} -x ${ALN_IDX} --rg-id ${run_id}_hapB --rg SM:${sample_id} -1 ${prefix}.hapB_1.fq.gz -2 ${prefix}.hapB_2.fq.gz |\ 25 | samtools sort -o ${prefix}.hapB.${run_id}.sorted.bam 26 | elif [[ ${ALN} == "bwamem" ]]; then 27 | echo "[Biastools] Align with BWA MEM" 28 | if [ ! -f ${ALN_IDX}.bwt ]; then 29 | bwa index ${path_ref} -p ${ALN_IDX} 30 | fi 31 | bwa mem -t ${THR} ${ALN_IDX} ${prefix}.hapA_1.fq.gz ${prefix}.hapA_2.fq.gz -R "@RG\tID:${run_id}_hapA\tSM:${sample_id}" |\ 32 | samtools sort -@ ${THR} -o ${prefix}.hapA.${run_id}.sorted.bam - 33 | bwa mem -t ${THR} ${ALN_IDX} ${prefix}.hapB_1.fq.gz ${prefix}.hapB_2.fq.gz -R "@RG\tID:${run_id}_hapB\tSM:${sample_id}" |\ 34 | samtools sort -@ ${THR} -o ${prefix}.hapB.${run_id}.sorted.bam - 35 | fi 36 | samtools merge -f ${prefix}.${run_id}.sorted.bam ${prefix}.hapA.${run_id}.sorted.bam ${prefix}.hapB.${run_id}.sorted.bam 37 | 38 | 39 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | setup( 3 | name = 'biastools', 4 | packages = ['biastools'], 5 | version = '0.3.1', 6 | license='MIT', 7 | description = 'The toolkits to analyze reference bias of short DNA read alignment.', 8 | author = 'Mao-Jan Lin', 9 | author_email = 'mj.maojanlin@gmail.com', 10 | url = 'https://github.com/maojanlin/biastools', 11 | download_url = 'https://github.com/maojanlin/biastools/tarball/master', 12 | keywords = ['biastools', 'reference bias', 'alignment'], 13 | install_requires=[ 14 | 'numpy', 15 | 'pysam', 16 | 'pandas', 17 | 'matplotlib', 18 | 'seaborn', 19 | 'scikit-learn', 20 | 'scipy' 21 | ], 22 | include_package_data=True, 23 | data_files=[('biastools', ['biastools/biastools_align.sh', 24 | 'biastools/biastools_compare.sh', 25 | 'biastools/biastools_simulation.sh', 26 | 'biastools/biastools_analysis.sh', 27 | 'biastools/biastools_predict.sh'])], 28 | zip_safe = False, 29 | classifiers=[ 30 | 'Development Status :: 3 - Alpha', 31 | 'Intended Audience :: Developers', 32 | 'Topic :: Software Development :: Build Tools', 33 | 'License :: OSI Approved :: MIT License', 34 | 'Programming Language :: Python :: 3', 35 | 'Programming Language :: Python :: 3.4', 36 | 'Programming Language :: Python :: 3.5', 37 | 'Programming Language :: Python :: 3.6', 38 | ], 39 | entry_points={"console_scripts": ["biastools = biastools.biastools:main","biastools_scan = biastools.biastools_scan:main"],}, 40 | ) 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /biastools/predict_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | 7 | def predict_bias(real_feature, miss_info, best_threshold, out_prefix): 8 | """ 9 | quality score * balance score 10 | """ 11 | real_feature['z_MAPQ'] = ((real_feature['AVG_MAPQ'] - 45) * -1).clip(lower=0) 12 | real_feature['combine_score'] = (real_feature['z_MAPQ']) * (real_feature['BALANCE']) #* (real_feature['BALANCE']) 13 | real_feature['plus_score'] = (real_feature['z_MAPQ']/45) + 1.5*real_feature['BALANCE'] 14 | 15 | 
print(real_feature[real_feature['plus_score'] > best_threshold]) 16 | real_feature[real_feature['plus_score'] > best_threshold].to_csv(out_prefix + "_bias.tsv", index=False, sep = "\t") 17 | 18 | 19 | 20 | 21 | if __name__ == '__main__': 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('-rr', '--real_report', help='the real data bias report') 24 | parser.add_argument('-thr', '--threshold', help='the threshold for prediction model [1.5]', type=int, default=1.5) 25 | parser.add_argument('-out', '--out_prefix', help='the prefix for reports [predict]', type=str, default='predict') 26 | args = parser.parse_args() 27 | 28 | fn_real = args.real_report 29 | best_th = args.threshold 30 | out_prefix = args.out_prefix 31 | 32 | df_real = pd.read_csv(fn_real, sep='\t') 33 | 34 | # filter out the sites suspicious of imcomplete vcf information 35 | miss_info = (df_real['OTHER'] > df_real['NUM_READS'] * 0.9) + (df_real['OTHER'] > df_real['NUM_READS'] * 0.4) * \ 36 | ( (df_real['REF'] == 0) + (df_real['ALT'] == 0 )) 37 | 38 | df_real[miss_info].to_csv(out_prefix + "_suspicious.tsv", index=False, sep = "\t") 39 | print("filtered number:", sum(miss_info)) 40 | 41 | df_real_test = df_real[~miss_info] 42 | predict_bias(df_real_test, miss_info, best_th, out_prefix) 43 | 44 | 45 | -------------------------------------------------------------------------------- /biastools/biastools_simulation.sh: -------------------------------------------------------------------------------- 1 | path_ref=$1 2 | path_vcf=$2 3 | path_out=$3 4 | sample_id=$4 5 | THR=$5 6 | coverage=$6 7 | path_module=$7 8 | prefix=${path_out}/${sample_id} 9 | 10 | if [ ! -f "${path_ref}.fai" ]; then 11 | samtools faidx ${path_ref} 12 | fi 13 | 14 | bcftools norm -f ${path_ref} ${path_vcf} -m +any -Oz -o ${prefix}.normalized.vcf.gz 15 | bcftools index ${prefix}.normalized.vcf.gz 16 | 17 | echo "[Biastools] Generate haplotype consensus reference sequence" 18 | bcftools consensus -f ${path_ref} -o ${prefix}.hapA.fa -H 1 ${prefix}.normalized.vcf.gz -c ${prefix}.ref2hapA.chain 19 | bcftools consensus -f ${path_ref} -o ${prefix}.hapB.fa -H 2 ${prefix}.normalized.vcf.gz -c ${prefix}.ref2hapB.chain 20 | samtools faidx ${prefix}.hapA.fa 21 | samtools faidx ${prefix}.hapB.fa 22 | 23 | echo "[Biastools] Calculate how many reads should be generated" 24 | total_base=$(( $( cut -f2 ${path_ref}.fai | paste -s -d+ ) )) 25 | read_num=$(expr ${total_base} / 151 / 4 \* ${coverage}) 26 | echo "generating ${read_num} 2x151 reads in each haplotype" 27 | 28 | echo "[Biastools] Simulate sequences" 29 | mason_simulator --illumina-read-length 151 --num-threads ${THR} -ir ${prefix}.hapA.fa -o ${prefix}.hapA_1.fq -or ${prefix}.hapA_2.fq -oa ${prefix}.gt.hapA.sam -n ${read_num} 30 | mason_simulator --illumina-read-length 151 --num-threads ${THR} -ir ${prefix}.hapB.fa -o ${prefix}.hapB_1.fq -or ${prefix}.hapB_2.fq -oa ${prefix}.gt.hapB.sam -n ${read_num} --seed 9388 31 | samtools sort -@ ${THR} ${prefix}.gt.hapA.sam > ${prefix}.gt.hapA.sorted.bam 32 | samtools sort -@ ${THR} ${prefix}.gt.hapB.sam > ${prefix}.gt.hapB.sorted.bam 33 | samtools index ${prefix}.gt.hapA.sorted.bam 34 | samtools index ${prefix}.gt.hapB.sorted.bam 35 | rm ${prefix}.gt.hapA.sam 36 | rm ${prefix}.gt.hapB.sam 37 | 38 | gzip -f ${prefix}.hapA_1.fq 39 | gzip -f ${prefix}.hapA_2.fq 40 | gzip -f ${prefix}.hapB_1.fq 41 | gzip -f ${prefix}.hapB_2.fq 42 | 43 | echo "[Biastools] Filter the heterozygous site in vcf file" 44 | python3 ${path_module}filter_het_VCF.py -v ${prefix}.normalized.vcf.gz 
-o ${prefix}.het.vcf.gz 45 | tabix -p vcf ${prefix}.het.vcf.gz 46 | 47 | echo "[Biastools] Generate golden distribution report" 48 | python3 ${path_module}consensus_vcf_map_adaptive.py -v ${prefix}.het.vcf.gz \ 49 | -c0 ${prefix}.ref2hapA.chain \ 50 | -c1 ${prefix}.ref2hapB.chain \ 51 | -f0 ${prefix}.hapA.fa \ 52 | -f1 ${prefix}.hapB.fa \ 53 | -s0 ${prefix}.gt.hapA.sorted.bam \ 54 | -s1 ${prefix}.gt.hapB.sorted.bam \ 55 | -o ${prefix}.golden.rpt 56 | 57 | 58 | -------------------------------------------------------------------------------- /figures/context_aware.md: -------------------------------------------------------------------------------- 1 | ## Context-aware assignment algorithm 2 | This method works by searching for the REF and ALT alleles, together with some of their flanking sequence, within the sequences of all the reads whose alignments overlap the variant. 3 | Details are given in the [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-024-03240-8). 4 | ![diagram](./context-aware-assignment.png) 5 | 6 | ### Cohort assignment 7 | Variants appearing within a short distance of each other (default: 25 bp) are grouped together into a “cohort.” The cohorts are compared in the same style as the local assignment. 8 | 9 | ### Local assignment 10 | The read sequence is compared to the `hap1` and `hap2` alleles of each variant. To account for discrepancies between the read alignment and the VCF representation, multiple anchor points are tested (see Fig. 7 in the [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-024-03240-8)). If a read perfectly matches one of `hap1` or `hap2`, it is assigned accordingly. 11 | If the read matches both `hap1` and `hap2`, a situation commonly observed in short tandem repeats, the read is assigned as `both`. 12 | Reads that do not perfectly match either `hap1` or `hap2` under any anchor point are categorized as `others`. 13 | 14 | As shown in the pipeline figure example, the variant is an insertion. The `hap1` sequence includes the inserted segment along with 5 bp of flanking sequence on both sides, while `hap2` consists of only the flanking sequences without the insertion. 15 | Read1 and read2 can be successfully assigned, whereas read3 and read4 are categorized as `others` due to sequencing errors. 16 | 17 | ### Edit-distance assignment 18 | Starting from biastools v0.3.0, an additional edit-distance assignment step is introduced to mitigate the impact of sequencing errors on reads previously categorized as ``others`` in the first two stages. 19 | As shown in the figure, the edit distance between read3 and `hap1` is computed (value: 1) as well as the distance to `hap2` (value: 3). The edit distances are then normalized 20 | by dividing by the lengths of `hap1` (longer) and `hap2` (shorter), respectively. Since the normalized edit distance to `hap1` is smaller, read3 is assigned to `hap1`. 21 | 22 | #### Conditions 23 | By default, the edit-distance assignment is applied only when the following conditions are met: 24 | 1. The edit distance between the read and the finally assigned haplotype is less than or equal to 5. 25 | 2. The length of the gap is shorter than 20 bp. 26 | 27 | These two conditions are selected based on empirical observations.
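As a concrete illustration, here is a minimal, self-contained sketch of the decision rule described above. It is not the actual biastools implementation: the function names are illustrative, the thresholds mirror the defaults stated here, and the "gap" is taken to be the length difference between the two alleles.

```python
def edit_distance(a: str, b: str) -> int:
    """Plain Levenshtein edit distance via dynamic programming."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i] + [0] * len(b)
        for j, cb in enumerate(b, 1):
            curr[j] = min(prev[j] + 1,                  # deletion
                          curr[j - 1] + 1,              # insertion
                          prev[j - 1] + (ca != cb))     # match / substitution
        prev = curr
    return prev[-1]


def assign_by_edit_distance(read_seq, hap1_seq, hap2_seq, max_ed=5, max_gap=20):
    """Assign a read labeled 'others' by the first two stages to hap1 or hap2."""
    gap_len = abs(len(hap1_seq) - len(hap2_seq))
    if gap_len >= max_gap:                              # condition 2: gap shorter than 20 bp
        return 'others'
    ed1, ed2 = edit_distance(read_seq, hap1_seq), edit_distance(read_seq, hap2_seq)
    norm1, norm2 = ed1 / len(hap1_seq), ed2 / len(hap2_seq)   # normalize by each allele's length
    if norm1 == norm2:
        return 'others'
    winner, winner_ed = ('hap1', ed1) if norm1 < norm2 else ('hap2', ed2)
    if winner_ed > max_ed:                              # condition 1: edit distance <= 5
        return 'others'
    return winner


# Toy example: the read is one edit away from hap1 but far from hap2,
# so it is rescued from 'others' and assigned to hap1.
print(assign_by_edit_distance("ACGTTACGA", "ACGTTACGT", "ACGT"))   # -> 'hap1'
```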
28 | -------------------------------------------------------------------------------- /biastools/biastools_analysis.sh: -------------------------------------------------------------------------------- 1 | path_ref=$1 2 | path_vcf=$2 3 | path_out=$3 4 | sample_id=$4 5 | THR=$5 6 | run_id=$6 7 | flag_real=$7 8 | flag_naive=$8 9 | boundary=$9 10 | path_module=${10} 11 | prefix=${path_out}/${sample_id} 12 | bam_file=${11} 13 | 14 | 15 | echo "[Biastools] Intersect the bam file and vcf file" 16 | if [ ! -f ${prefix}.het.vcf.gz ]; then 17 | bcftools norm -f ${path_ref} ${path_vcf} -m +any -Oz -o ${prefix}.normalized.vcf.gz 18 | bcftools index ${prefix}.normalized.vcf.gz 19 | python3 ${path_module}filter_het_VCF.py -v ${prefix}.normalized.vcf.gz -o ${prefix}.het.vcf.gz 20 | tabix -p vcf ${prefix}.het.vcf.gz 21 | fi 22 | if [ ! -f ${prefix}.${run_id}.sorted.het.bam ]; then 23 | python3 ${path_module}vcf_to_bed.py -v ${prefix}.het.vcf.gz -o ${prefix}.het.bed 24 | samtools view -h -L ${prefix}.het.bed ${bam_file} -@ ${THR} | samtools sort -@ ${THR} > ${prefix}.${run_id}.sorted.het.bam 25 | samtools index ${prefix}.${run_id}.sorted.het.bam 26 | fi 27 | 28 | 29 | echo "[Biastools] Reference bias analysis" 30 | if [[ ${flag_naive} == 1 ]]; then 31 | assign_method=${path_module}"ref_bi_naive.py" 32 | else 33 | assign_method=${path_module}"ref_bi_context.py" 34 | fi 35 | 36 | mkdir -p ${path_out}/${run_id}"_report" 37 | r_prefix=${path_out}/${run_id}"_report"/${sample_id} 38 | if [[ ${flag_real} == 1 ]]; then 39 | python3 ${assign_method} -s ${prefix}.${run_id}.sorted.het.bam \ 40 | -v ${prefix}.het.vcf.gz \ 41 | -f ${path_ref} \ 42 | -p ${prefix}.golden.rpt.pickle \ 43 | -o ${r_prefix}.${run_id}.real.bias \ 44 | --real 45 | # indel balance plot 46 | python3 ${path_module}indel_balance_plot.py -lr ${r_prefix}.${run_id}.real.bias.all \ 47 | -ln ${run_id} \ 48 | -vcf ${prefix}.het.vcf.gz \ 49 | -bd ${boundary} \ 50 | -map \ 51 | -out ${r_prefix}.${run_id}.real \ 52 | -real 53 | else 54 | python3 ${assign_method} -s ${prefix}.${run_id}.sorted.het.bam \ 55 | -v ${prefix}.het.vcf.gz \ 56 | -f ${path_ref} \ 57 | -p ${prefix}.golden.rpt.pickle \ 58 | -o ${r_prefix}.${run_id}.sim.bias 59 | 60 | # report the bias categories and report 61 | python3 ${path_module}golden_graph_report.py -mb ${r_prefix}.${run_id}.sim.bias.snp -out ${r_prefix}.${run_id}.snp 62 | python3 ${path_module}golden_graph_report.py -mb ${r_prefix}.${run_id}.sim.bias.gap -out ${r_prefix}.${run_id}.gap 63 | # plot the measures with NMB and NAB 64 | python3 ${path_module}golden_graph.py -mb ${r_prefix}.${run_id}.sim.bias.snp -out ${r_prefix}.${run_id}.snp 65 | python3 ${path_module}golden_graph.py -mb ${r_prefix}.${run_id}.sim.bias.gap -out ${r_prefix}.${run_id}.gap 66 | # indel balance plot 67 | python3 ${path_module}indel_balance_plot.py -lr ${r_prefix}.${run_id}.sim.bias.all \ 68 | -ln ${run_id} \ 69 | -vcf ${prefix}.het.vcf.gz \ 70 | -bd ${boundary} \ 71 | -map \ 72 | -out ${r_prefix}.${run_id}.sim 73 | fi 74 | -------------------------------------------------------------------------------- /biastools/compare_bias_with_RD.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | 4 | 5 | def read_list_bed(list_bed): 6 | dict_chr_bias = {} 7 | for fn_bed in list_bed: 8 | f = open(fn_bed) 9 | f.readline() 10 | for line in f: 11 | fields = line.split() 12 | contig = fields[0] 13 | start = int(fields[1]) 14 | stop = int(fields[2]) 15 | if dict_chr_bias.get(contig): 16 | 
dict_chr_bias[contig].append((start, stop)) 17 | else: 18 | dict_chr_bias[contig] = [(start, stop)] 19 | return dict_chr_bias 20 | 21 | 22 | def compare_bias_regions(dict_target, dict_improve, dict_lowRd, out_file): 23 | assert sorted(dict_target.keys()) == sorted(dict_improve.keys()), "discrepancy on the reference of the two lists" 24 | f_o = open(out_file, 'w') 25 | f_o.write('#chrom\tchromStart\tchromEnd\tname(%;initial;improve;lowRd)\n') 26 | 27 | total_100 = [] 28 | total_75 = [] 29 | total_50 = [] 30 | total_25 = [] 31 | total_under_25 = [] 32 | for contig in sorted(dict_target.keys()): 33 | local_100 = [] 34 | local_75 = [] 35 | local_50 = [] 36 | local_25 = [] 37 | local_under_25 = [] 38 | 39 | region_target = dict_target [contig] 40 | region_improve = dict_improve[contig] 41 | region_lowRd = dict_lowRd [contig] 42 | idx_2 = 0 43 | idx_3 = 0 44 | for region in region_target: 45 | start_1, stop_1 = region 46 | #if stop_1 - start_1 < 1000: 47 | # continue 48 | contain_region_2 = [] 49 | contain_lowRd = [] 50 | for idx in range(idx_2, len(region_improve)): 51 | start_2, stop_2 = region_improve[idx] 52 | if stop_2 < start_1: 53 | continue 54 | elif start_2 < stop_1: 55 | contain_region_2.append(region_improve[idx]) 56 | else: 57 | idx_2 = idx-1 58 | break 59 | for idx in range(idx_3, len(region_lowRd)): 60 | start_3, stop_3 = region_lowRd[idx] 61 | if stop_3 < start_1: 62 | continue 63 | elif start_3 < stop_1: 64 | contain_lowRd.append(region_lowRd[idx]) 65 | else: 66 | idx_3 = idx-1 67 | break 68 | 69 | len_region_1 = stop_1 - start_1 70 | len_region_2 = sum([ele[1]-ele[0] for ele in contain_region_2]) 71 | len_region_3 = sum([ele[1]-ele[0] for ele in contain_lowRd]) 72 | improve_len = len_region_1 - len_region_2 - len_region_3 73 | if improve_len == len_region_1: 74 | local_100.append(region) 75 | f_o.write(contig + '\t' + str(start_1) + '\t' + str(stop_1) + '\t' + '100;' + str(len_region_1) + ';' + str(len_region_2) + ';' + str(len_region_3) + '\n') 76 | elif improve_len >= len_region_1*0.75: 77 | local_75.append(region) 78 | f_o.write(contig + '\t' + str(start_1) + '\t' + str(stop_1) + '\t' + '75;' + str(len_region_1) + ';' + str(len_region_2) + ';' + str(len_region_3) + '\n') 79 | elif improve_len >= len_region_1*0.5: 80 | local_50.append(region) 81 | f_o.write(contig + '\t' + str(start_1) + '\t' + str(stop_1) + '\t' + '50;' + str(len_region_1) + ';' + str(len_region_2) + ';' + str(len_region_3) + '\n') 82 | elif improve_len >= len_region_1*0.25: 83 | local_25.append(region) 84 | f_o.write(contig + '\t' + str(start_1) + '\t' + str(stop_1) + '\t' + '25;' + str(len_region_1) + ';' + str(len_region_2) + ';' + str(len_region_3) + '\n') 85 | else: 86 | local_under_25.append(region) 87 | total_100 += local_100 88 | total_75 += local_75 89 | total_50 += local_50 90 | total_25 += local_25 91 | total_under_25 += local_under_25 92 | print(contig, len(local_100), len(local_75), len(local_50), len(local_25), len(local_under_25)) 93 | f_o.close() 94 | len_total = sum([len(total_100), len(total_75), len(total_50), len(local_25), len(total_under_25)]) 95 | print(len(total_100), round(len(total_100)/len_total,3), \ 96 | len(total_75), round(len(total_75)/len_total,3), \ 97 | len(total_50), round(len(total_50)/len_total,3), \ 98 | len(total_25), round(len(total_25)/len_total,3), \ 99 | len(total_under_25), round(len(total_under_25)/len_total,3)) 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | if __name__ == "__main__": 108 | parser = argparse.ArgumentParser() 109 | parser.add_argument('-lt', 
'--list_target', nargs='+', required=True, help='the first list of scanning bias bed report') 110 | parser.add_argument('-li', '--list_improve', nargs='+', required=True, help='the second list of scanning bias bed report, the region should contain in list 1') 111 | parser.add_argument('-lrd', '--list_lowRd', nargs='+', required=True, help='the second list of scanning bias bed report, the region should contain in list 1') 112 | parser.add_argument('-out', '--output_improve', help="output the improve regions") 113 | args = parser.parse_args() 114 | 115 | list_target = args.list_target 116 | list_improve = args.list_improve 117 | list_lowRd = args.list_lowRd 118 | out_file = args.output_improve 119 | 120 | dict_target = read_list_bed(list_target) 121 | dict_improve = read_list_bed(list_improve) 122 | dict_lowRd = read_list_bed(list_lowRd) 123 | 124 | compare_bias_regions(dict_target, dict_improve, dict_lowRd, out_file) 125 | 126 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | _Updated: Apr 17, 2025_ 3 | # Biastools: Measuring, visualizing and diagnosing reference bias 4 | 5 | This github is originally forked from https://github.com/sheila12345/biastools 6 | 7 | ## Prerequisite programs 8 | - samtools=v1.11 9 | - bcftools=v1.9 10 | - bedtools=v2.30.0 11 | - gzip=v1.9 12 | - tabix=v1.9 13 | - bowtie2=v2.4.2 14 | - bwa=v0.7.17 15 | - mason_simulator=v2.0.9 (only for biastools --simulate) 16 | - SeqAn=v2.4.0 (only for biastools --simulate) 17 | 18 | 19 | ## Installation 20 | - [pip](https://pypi.org/project/biastools/) 21 | ``` 22 | pip install biastools 23 | ``` 24 | - [Github](https://github.com/maojanlin/biastools.git) 25 | ``` 26 | git clone https://github.com/maojanlin/biastools.git 27 | cd biastools 28 | ``` 29 | Though optional, it is a good practice to install a virtual environment to manage the dependancies: 30 | 31 | ``` 32 | python -m venv venv 33 | source venv/bin/activate 34 | ``` 35 | Now a virtual environment (named venv) is activated. Install biastools: 36 | 37 | ``` 38 | python setup.py install 39 | ``` 40 | 41 | 42 | ## Usage 43 | 44 | ### Simulation, plotting, and analysis 45 | ``` 46 | $ biastools --simulate --align --analyze -o -g -v -s -r 47 | ``` 48 | 49 | With the example command, biastools 50 | 1. Simulates reads based on `` and ``, generating pair-end `.fq.gz` files for both haplotypes (`work_dir/sample_name.hap{A,B}_{1,2}.fq.gz`). 51 | 2. Aligns the reads to the reference ``, generating a BAM file with phasing information (`work_dir/sample_name.run_id.sorted.bam`). 52 | 3. Analyzes the BAM file with the context-aware assignment method, generating bias reports and plots. 53 | 54 | #### Other aligners 55 | Biastools supports [Bowtie 2](https://github.com/BenLangmead/bowtie2) and [bwa mem](https://github.com/lh3/bwa) aligners. BAM files from other aligners (named with `` and tagged with haplotype information) can be analyzed with 56 | 57 | ``` 58 | $ biastools --analyze -o -g -v -s -r 59 | ``` 60 | 61 | #### Direct Analysis on Real sequence data 62 | Biastools can also analyze real sequence data with the `--real` option using the context-aware assignment algorithm. The resulting plot does not include simulation information (`sample_id.real.indel_balance.pdf`). 
63 | ``` 64 | $ biastools --analyze --real -t -o -g -v -s -r \ 65 | --bam 66 | ``` 67 | Biastools first fetches the relevant alignments from the target BAM file, focusing only on heterozygous variant sites specified in the VCF file. These sites are then analyzed using a [context-aware algorithm](figures/context_aware.md). Finally, Biastools generates a bias report along with a bias-by-allele-length plot, both included in the output folder. 68 | 69 | 70 | #### Combined Bias-by-allele-length plot 71 | Multiple analysis results can be combined into a single Bias-by-allele-length plot. In biastools version 0.3.1, the default plotting module displays the 25th percentile, mean, and 75th percentile of the fraction of ALT alleles for variants stratified by allele length, using ticks to indicate the interquartile range and a central dot to mark the mean. 72 | 73 | ``` 74 | $ biastools --analyze -o -g -v -s -r \ 75 | -lr file1.bias.all file2.bias.all file3.bias.all... \ 76 | -ld run_id1 run_id2 run_id3... 77 | ``` 78 | 79 | The output file `sample_name.combine.sim.indel_balance.pdf` plots the fraction of ALT alleles merged from the bias reports specified after the `-lr` option. Users can use the `-ld` option to specify the tool names, which will appear in the legend. To generate a combined plot using only real data bias reports (excluding simulation information), use the `--real` option. 80 | 81 | An example of a combined bias-by-allele-length plot: 82 | ![multiple_indel_plot](figures/HG002.GIAB.4.2.1.demo.indel_balance.png?raw=true "multiple_indel_plot") 83 | 84 | 85 | ### Bias prediction from bias report 86 | #### Real data 87 | Biastools can predict whether a variant is biased or not by running: 88 | 89 | ``` 90 | $ biastools --predict -o -g -v -s -r -pr 91 | ``` 92 | 93 | With the example command, biastools 94 | 4. Generates two files: `sample_name.real.pd_id_bias.tsv` and `sample_name.real.pd_id_suspicious.tsv`. The `bias.tsv` report contains all sites predicted to be biased by the model. The `suspicious.tsv` file contains the sites suspected of lacking sufficient information in the VCF file; in other words, the reads aligned to the site show a pattern different from the haplotype indicated by the VCF file. 95 | 96 | #### Simulation-guided prediction 97 | 98 | ``` 99 | $ biastools --predict -o -g -v -s -r \ 100 | -pr \ 101 | -ps 102 | ``` 103 | 104 | If a bias report for the sample based on simulated data is provided, biastools can generate a cross-prediction experiment result. In the experiment, the ground truth bias sites are based on simulation data. 105 | 106 | ### Scanning bias without VCF information 107 | #### Scanning 108 | ``` 109 | $ biastools_scan --scan -o -g -s -r -i 110 | ``` 111 | 112 | Biastools transforms the `` into the mpileup format and generates biased and suspicious regions (`sample_name.run_id.bias.bed` and `sample_name.run_id.suspicious.bed`). 113 | 114 | 115 | #### Compare two bam files with common baseline 116 | ``` 117 | $ biastools_scan --compare_bam -o -g -s -r \ 118 | -i \ 119 | -i2 \ 120 | -m \ 121 | -m2 122 | ``` 123 | Biastools generates a common baseline from `path_to_target.bam` and `path_to_second.bam`, and uses the new common baseline to recalculate the bias regions based on the two mpileup files. The mpileup files can be generated by running **scanning** first, or directly with **bcftools mpileup**.
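If the mpileup files have not been produced by a previous **scanning** run, they can be generated with the same `bcftools mpileup` settings biastools uses internally (see `sample_baseline.py`). The sketch below is illustrative only; the file names are placeholders and `bcftools` is assumed to be on the PATH.

```python
# Sketch (placeholder file names): build one mpileup per BAM with the same
# bcftools settings biastools uses internally, so the two runs stay comparable.
import subprocess

ref = "grch38.fa"   # the reference both BAMs were aligned to (placeholder)
for bam, out in [("target.bam", "target.mpileup"), ("second.bam", "second.mpileup")]:
    subprocess.run(
        ["bcftools", "mpileup",
         "--count-orphans",
         "--annotate", "FORMAT/AD,FORMAT/DP",
         "-f", ref,
         "--min-BQ", "0",
         "--min-MQ", "0",
         bam, "-o", out],
        check=True)
```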
124 | 125 | 126 | 127 | #### Directly compare two bias reports 128 | User can also generate the comparison of the bias reports without a common baseline (not recommended): 129 | ``` 130 | $ biastools_scan --compare_rpt -o -s -r \ 131 | -b1 \ 132 | -b2 \ 133 | -l2 134 | ``` 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /biastools/sample_baseline.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import argparse 3 | import random 4 | import numpy as np 5 | import os 6 | from subprocess import call 7 | 8 | from scanning_bias import scanning_bias, calculate_measures 9 | 10 | 11 | def baseline( 12 | f_mpileup :pysam.VariantRecord, 13 | fn_sample :str, 14 | window_size :int 15 | ) -> tuple: 16 | """ 17 | Take in the sample mpileup, and output the average read_depth/variant Density/non Diploid portion 18 | """ 19 | dict_ref_info = scanning_bias(f_gvcf=f_mpileup) 20 | dict_3D_measures = calculate_measures( 21 | dict_ref_info=dict_ref_info, 22 | window_size=window_size 23 | ) 24 | 25 | total_read_depth = np.array([]) 26 | total_var_density = np.array([]) 27 | total_dip_density = np.array([]) 28 | 29 | fo = open(fn_sample + '.baseline', 'w') 30 | fo.write('#chr pos segment_len RD_mean RD_std VD_mean VD_std ND_mean ND_std\n') 31 | for ref_name, dict_array in dict_3D_measures.items(): 32 | for start_pos, array_info in dict_array.items(): 33 | array_read_depth, array_var_density, array_dip_density = array_info 34 | 35 | avg_read_depth = np.mean(array_read_depth) 36 | std_read_depth = np.std(array_read_depth) 37 | #positive_avg_var = np.mean(array_var_density) 38 | #positive_std_var = np.std(array_var_density) 39 | #positive_avg_dip = np.mean(array_dip_density) 40 | #positive_std_dip = np.std(array_dip_density) 41 | 42 | fo.write(ref_name + ' ' + str(start_pos) + ' ' + str(len(array_read_depth)) + ' ') 43 | fo.write(str(round(avg_read_depth,2)) + ' ' + str(round(std_read_depth,2)) + ' ') 44 | #positive_var = array_var_density[array_var_density != 0] 45 | positive_var = array_var_density 46 | if len(positive_var) > 0: 47 | positive_avg_var = np.mean(positive_var) 48 | positive_std_var = np.std(positive_var) 49 | fo.write(str(round(positive_avg_var,2)) + ' ' + str(round(positive_std_var,2)) + ' ') 50 | 51 | #positive_dip = array_dip_density[array_var_density != 0] 52 | positive_dip = array_dip_density 53 | if len(positive_dip) > 0: 54 | positive_avg_dip = np.mean(positive_dip) 55 | positive_std_dip = np.std(positive_dip) 56 | fo.write(str(round(positive_avg_dip,2)) + ' ' + str( round(positive_std_dip, 2)) + '\n') 57 | else: 58 | fo.write('\n') 59 | 60 | total_read_depth = np.concatenate((total_read_depth , array_read_depth)) 61 | total_var_density = np.concatenate((total_var_density, positive_var)) 62 | total_dip_density = np.concatenate((total_dip_density, positive_dip)) 63 | #total_var_density = np.concatenate((total_var_density, array_var_density)) 64 | #total_dip_density = np.concatenate((total_dip_density, array_dip_density)) 65 | 66 | fo.write('#total sample len: ' + str(len(total_read_depth)) + '\n') 67 | fo.write('#total_statistics:\n') 68 | fo.write('#chr pos segment_len RD_mean RD_std VD_mean VD_std ND_mean ND_std\n# ') 69 | fo.write(str(round(np.mean(total_read_depth),5)) + ' ' + str(round(np.std(total_read_depth),5)) + ' ') 70 | fo.write(str(round(np.mean(total_var_density),5)) + ' ' + str(round(np.std(total_var_density),5)) + ' ') 71 | 
fo.write(str(round(np.mean(total_dip_density),5)) + ' ' + str(round(np.std(total_dip_density),5))) 72 | fo.close() 73 | return np.mean(total_read_depth), np.mean(total_var_density), np.mean(total_dip_density) 74 | 75 | 76 | def sample_select( 77 | fn_sample :str, 78 | seed :int, 79 | min_len :int, 80 | f_bam :pysam.AlignmentFile 81 | ): 82 | """ 83 | Take out the contig length greater than min_len (threshold_contig) 84 | For each contig, takes 100 segments totally equal to 1/1000 of the contig length 85 | """ 86 | random.seed(seed) 87 | fo = open(fn_sample + '.bed', 'w') 88 | write_flag = False 89 | for idx, name in enumerate(f_bam.header.references): 90 | contig_len = f_bam.header.lengths[idx] 91 | if contig_len > min_len: 92 | write_flag = True 93 | thousandth = int(contig_len / 100000) 94 | list_sample_start = random.sample(range(100000 - 1), 100) 95 | for sample_start in sorted(list_sample_start): 96 | fo.write(name + ' ' + str(sample_start*thousandth) + ' ' + str(sample_start*thousandth+thousandth) + '\n') 97 | elif write_flag == False: 98 | fo.write(name + ' 1 ' + str(contig_len) + '\n') 99 | write_flag = True 100 | fo.close() 101 | 102 | 103 | 104 | 105 | 106 | if __name__ == "__main__": 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument('-b', '--bam_file', help='the bam file we want to sample') 109 | parser.add_argument('-f', '--reference_fasta', help='the reference fasta file for mpileup building') 110 | parser.add_argument('-o', '--sample_bed', help='the sampled 1/1000 bed file') 111 | parser.add_argument('-w', '--window_size', help='window size for average depth, density analysis', type=int, default=400) 112 | parser.add_argument('-th', '--threshold_contig', help='the minimum contig length for sampling', type=int, default=10000000) 113 | parser.add_argument('--seed', help='seed for random sampling', type=int, default=0) 114 | parser.add_argument('-k', '--kill', help='kill all storage files', action='store_true') 115 | args = parser.parse_args() 116 | 117 | fn_bam = args.bam_file 118 | fn_ref = args.reference_fasta 119 | fn_sample = args.sample_bed 120 | min_len = args.threshold_contig 121 | window_size = args.window_size 122 | seed = args.seed 123 | kill_flag = args.kill 124 | 125 | f_bam = pysam.AlignmentFile(fn_bam) 126 | 127 | # sample bed file according to the bam file information 128 | sample_select(fn_sample, seed, min_len, f_bam) 129 | 130 | 131 | # SAMTOOLS command for extract the sample region bam file 132 | if os.path.exists(fn_sample + '.bam') and not kill_flag: 133 | print(fn_sample + '.bam already exist.') 134 | else: 135 | command = ('samtools view -h ' + fn_bam + ' -L ' + fn_sample + '.bed -o ' + fn_sample + '.bam') 136 | print(command) 137 | call(command, shell=True) 138 | 139 | # BCFTOOLS command for mpileup the bam file 140 | if os.path.exists(fn_sample + '.mpileup') and not kill_flag: 141 | print(fn_sample + '.mpileup already exist.') 142 | else: 143 | command = ('bcftools mpileup --count-orphans --annotate FORMAT/AD,FORMAT/DP -f ' + fn_ref + ' --min-BQ 0 --min-MQ 0 ' \ 144 | + fn_sample + '.bam -o ' + fn_sample + '.mpileup') 145 | print(command) 146 | call(command, shell=True) 147 | 148 | f_mpileup = pysam.VariantFile(fn_sample + '.mpileup') 149 | baseline(f_mpileup, fn_sample, window_size) 150 | 151 | 152 | -------------------------------------------------------------------------------- /biastools/merge_baseline.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import argparse 3 | import 
random 4 | import numpy as np 5 | import os 6 | from subprocess import call 7 | 8 | from scanning_bias import scanning_bias, calculate_measures 9 | from sample_baseline import sample_select 10 | import pickle 11 | 12 | 13 | def baseline( 14 | f_mpileup_1 :pysam.VariantRecord, 15 | f_mpileup_2 :pysam.VariantRecord, 16 | fn_sample :str, 17 | window_size :int 18 | ) -> tuple: 19 | """ 20 | Take in the sample mpileup, and output the average read_depth/variant Density/non Diploid portion 21 | """ 22 | dict_ref_info_1 = scanning_bias(f_gvcf=f_mpileup_1) 23 | dict_3D_measures_1 = calculate_measures( 24 | dict_ref_info=dict_ref_info_1, 25 | window_size=window_size 26 | ) 27 | dict_ref_info_2 = scanning_bias(f_gvcf=f_mpileup_2) 28 | dict_3D_measures_2 = calculate_measures( 29 | dict_ref_info=dict_ref_info_2, 30 | window_size=window_size 31 | ) 32 | 33 | total_read_depth = np.array([]) 34 | total_var_density = np.array([]) 35 | total_dip_density = np.array([]) 36 | 37 | fo = open(fn_sample + '.baseline', 'w') 38 | fo.write('#chr pos segment_len RD_mean RD_std VD_mean VD_std ND_mean ND_std\n') 39 | for ref_name, dict_array in dict_3D_measures_1.items(): 40 | for start_pos, array_info in dict_array.items(): 41 | array_read_depth, array_var_density, array_dip_density = array_info 42 | 43 | avg_read_depth = np.mean(array_read_depth) 44 | std_read_depth = np.std(array_read_depth) 45 | 46 | fo.write(ref_name + ' ' + str(start_pos) + ' ' + str(len(array_read_depth)) + ' ') 47 | fo.write(str(round(avg_read_depth,2)) + ' ' + str(round(std_read_depth,2)) + ' ') 48 | #positive_var = array_var_density[array_var_density != 0] 49 | positive_var = array_var_density 50 | if len(positive_var) > 0: 51 | positive_avg_var = np.mean(positive_var) 52 | positive_std_var = np.std(positive_var) 53 | fo.write(str(round(positive_avg_var,2)) + ' ' + str(round(positive_std_var,2)) + ' ') 54 | 55 | #positive_dip = array_dip_density[array_var_density != 0] 56 | positive_dip = array_dip_density 57 | if len(positive_dip) > 0: 58 | positive_avg_dip = np.mean(positive_dip) 59 | positive_std_dip = np.std(positive_dip) 60 | fo.write(str(round(positive_avg_dip,2)) + ' ' + str( round(positive_std_dip, 2)) + '\n') 61 | else: 62 | fo.write('\n') 63 | 64 | total_read_depth = np.concatenate((total_read_depth , array_read_depth)) 65 | total_var_density = np.concatenate((total_var_density, positive_var)) 66 | total_dip_density = np.concatenate((total_dip_density, positive_dip)) 67 | for ref_name, dict_array in dict_3D_measures_2.items(): 68 | for start_pos, array_info in dict_array.items(): 69 | array_read_depth, array_var_density, array_dip_density = array_info 70 | 71 | avg_read_depth = np.mean(array_read_depth) 72 | std_read_depth = np.std(array_read_depth) 73 | 74 | fo.write(ref_name + ' ' + str(start_pos) + ' ' + str(len(array_read_depth)) + ' ') 75 | fo.write(str(round(avg_read_depth,2)) + ' ' + str(round(std_read_depth,2)) + ' ') 76 | #positive_var = array_var_density[array_var_density != 0] 77 | positive_var = array_var_density 78 | if len(positive_var) > 0: 79 | positive_avg_var = np.mean(positive_var) 80 | positive_std_var = np.std(positive_var) 81 | fo.write(str(round(positive_avg_var,2)) + ' ' + str(round(positive_std_var,2)) + ' ') 82 | 83 | #positive_dip = array_dip_density[array_var_density != 0] 84 | positive_dip = array_dip_density 85 | if len(positive_dip) > 0: 86 | positive_avg_dip = np.mean(positive_dip) 87 | positive_std_dip = np.std(positive_dip) 88 | fo.write(str(round(positive_avg_dip,2)) + ' ' + str( 
round(positive_std_dip, 2)) + '\n') 89 | else: 90 | fo.write('\n') 91 | 92 | total_read_depth = np.concatenate((total_read_depth , array_read_depth)) 93 | total_var_density = np.concatenate((total_var_density, positive_var)) 94 | total_dip_density = np.concatenate((total_dip_density, positive_dip)) 95 | 96 | fo.write('#total sample len: ' + str(len(total_read_depth)) + '\n') 97 | fo.write('#total_statistics:\n') 98 | fo.write('#chr pos segment_len RD_mean RD_std VD_mean VD_std ND_mean ND_std\n# ') 99 | fo.write(str(round(np.mean(total_read_depth),5)) + ' ' + str(round(np.std(total_read_depth),5)) + ' ') 100 | fo.write(str(round(np.mean(total_var_density),5)) + ' ' + str(round(np.std(total_var_density),5)) + ' ') 101 | fo.write(str(round(np.mean(total_dip_density),5)) + ' ' + str(round(np.std(total_dip_density),5))) 102 | fo.close() 103 | print("[Biastools] Generate " + fn_sample + '.baseline') 104 | return np.mean(total_read_depth), np.mean(total_var_density), np.mean(total_dip_density) 105 | 106 | 107 | 108 | 109 | 110 | if __name__ == "__main__": 111 | parser = argparse.ArgumentParser() 112 | parser.add_argument('-b1', '--bam_file_1', help='the bam file we want to sample') 113 | parser.add_argument('-b2', '--bam_file_2', help='the bam file we want to sample') 114 | parser.add_argument('-f', '--reference_fasta', help='the reference fasta file for mpileup building') 115 | parser.add_argument('-o', '--sample_bed', help='the sampled 1/1000 bed file') 116 | parser.add_argument('-w', '--window_size', help='window size for average depth, density analysis', type=int, default=400) 117 | parser.add_argument('-th', '--threshold_contig', help='the minimum contig length for sampling', type=int, default=10000000) 118 | parser.add_argument('--seed', help='seed for random sampling', type=int, default=0) 119 | args = parser.parse_args() 120 | 121 | fn_bam_1 = args.bam_file_1 122 | fn_bam_2 = args.bam_file_2 123 | fn_ref = args.reference_fasta 124 | fn_sample = args.sample_bed 125 | min_len = args.threshold_contig 126 | window_size = args.window_size 127 | seed = args.seed 128 | 129 | # sample bed file according to the bam file information 130 | f_bam = pysam.AlignmentFile(fn_bam_1) 131 | sample_select(fn_sample, seed, min_len, f_bam) 132 | 133 | 134 | # SAMTOOLS command for extract the sample region bam file 135 | command = ('samtools view -h ' + fn_bam_1 + ' -L ' + fn_sample + '.bed -o ' + fn_bam_1 + '.sample.bam') 136 | print(command) 137 | call(command, shell=True) 138 | command = ('samtools view -h ' + fn_bam_2 + ' -L ' + fn_sample + '.bed -o ' + fn_bam_2 + '.sample.bam') 139 | print(command) 140 | call(command, shell=True) 141 | 142 | # BCFTOOLS command for mpileup the bam file 143 | command = ('bcftools mpileup --count-orphans --annotate FORMAT/AD,FORMAT/DP -f ' + fn_ref + ' --min-BQ 0 --min-MQ 0 ' \ 144 | + fn_bam_1 + '.sample.bam -o ' + fn_bam_1 + '.sample.mpileup') 145 | print(command) 146 | call(command, shell=True) 147 | command = ('bcftools mpileup --count-orphans --annotate FORMAT/AD,FORMAT/DP -f ' + fn_ref + ' --min-BQ 0 --min-MQ 0 ' \ 148 | + fn_bam_2 + '.sample.bam -o ' + fn_bam_2 + '.sample.mpileup') 149 | print(command) 150 | call(command, shell=True) 151 | 152 | f_mpileup_1 = pysam.VariantFile(fn_bam_1 + '.sample.mpileup') 153 | f_mpileup_2 = pysam.VariantFile(fn_bam_2 + '.sample.mpileup') 154 | 155 | print('[Biastools] Generate sample baseline') 156 | baseline(f_mpileup_1, f_mpileup_2, fn_sample, window_size) 157 | 158 | 159 | 160 | 
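For reference, a small sketch of driving the module above programmatically, mirroring its own `__main__` block. The file names are placeholders, and it assumes the code is run from the `biastools/` source directory so the sibling modules (`scanning_bias`, `sample_baseline`) resolve.

```python
# Illustrative sketch (placeholder file names): compute a common baseline from two
# sampled mpileups, mirroring the __main__ block of merge_baseline.py above.
import pysam
from merge_baseline import baseline

f_mpileup_1 = pysam.VariantFile("first.sample.mpileup")
f_mpileup_2 = pysam.VariantFile("second.sample.mpileup")

# Writes "common.baseline" and returns the overall means of read depth,
# variant density, and non-diploid density across both samples.
mean_rd, mean_vd, mean_nd = baseline(f_mpileup_1, f_mpileup_2, "common", window_size=400)
print(mean_rd, mean_vd, mean_nd)
```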
-------------------------------------------------------------------------------- /biastools/biastools_scan.py: -------------------------------------------------------------------------------- 1 | # Wrap up python file for the biastools 3rd module 2 | import subprocess 3 | import sys 4 | import os 5 | import argparse 6 | from biastools.biastools import check_program_install, catch_assert 7 | 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-o', '--out', help="Path to output directory ['out_dir'].", default="out_dir") 12 | parser.add_argument('-g', '--genome', help="Path to the reference genome.") 13 | parser.add_argument('-i', '--bam', help="Path to the alignment bam file, should be SORTED.") 14 | parser.add_argument('-s', '--sample_id', help="Sample ID ['sample'].", default="sample") 15 | parser.add_argument('-r', '--run_id', help="Run ID ['run'].", default="run") 16 | # Process options 17 | parser.add_argument('--scan', help='[1] Option to scan and report bias region.', action='store_true') 18 | parser.add_argument('--compare_bam', help='[2] Option to generate common baseline and compare.', action='store_true') 19 | parser.add_argument('--compare_rpt', help='[3] Option to directly compare two bias report.', action='store_true') 20 | 21 | parser.add_argument('-t', '--thread', help="Number of threads to use [max].", type=int) 22 | parser.add_argument('--force', help="running the program without checking prerequisite programs.", action='store_true') 23 | # [1] 24 | parser.add_argument('-w', '--wig', help="Generate the wig files for the three measures, VERY SLOW [False]", action='store_true') 25 | parser.add_argument('-R', '--range', help="The range in the bam file targeted for analysis.") 26 | # [2] 27 | parser.add_argument('-i2', '--bam2', help="Path to the second alignment bam file want to compare, should be SORTED.") 28 | parser.add_argument('-m', '--mpileup', help="Path to the mpileup file of the first bam file.") 29 | parser.add_argument('-m2', '--mpileup2', help="Path to the mpileup file of the second bam file.") 30 | # [3] 31 | parser.add_argument('-b1', '--bed1', help="Path to the first bed file for comparison.") 32 | parser.add_argument('-b2', '--bed2', help="Path to the second bed file for comparison.") 33 | parser.add_argument('-l2', '--lowRd2', help="Path to the .lowRd.bed report of the second file.") 34 | args = parser.parse_args() 35 | 36 | path_output = args.out 37 | path_ref = args.genome 38 | bam_file = args.bam 39 | sample_id = args.sample_id 40 | run_id = args.run_id 41 | 42 | flag_scan = args.scan 43 | flag_compare_bam = args.compare_bam 44 | flag_compare_rpt = args.compare_rpt 45 | try: 46 | assert flag_scan + flag_compare_bam + flag_compare_rpt >= 1 47 | except AssertionError: 48 | catch_assert(parser, "At least one of the --scan/compare_bam/compare_rpt option should be specified.") 49 | 50 | flag_force = args.force 51 | thread = args.thread 52 | if thread == None: 53 | if sys.platform == "darwin": 54 | result = subprocess.run(["sysctl -n hw.ncpu"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True) 55 | else: 56 | result = subprocess.run(["nproc"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True) 57 | thread = int(result.stdout.strip()) 58 | flag_wig = args.wig 59 | Range = args.range 60 | bam_file2 = args.bam2 61 | mpileup_file = args.mpileup 62 | mpileup_file2 = args.mpileup2 63 | bed_file1 = args.bed1 64 | bed_file2 = args.bed2 65 | lowRd_file2 = args.lowRd2 66 | 67 | 68 | # Checking 
prerequisite programs are installed 69 | if flag_force != True: 70 | check_program_install(["bedtools", \ 71 | "samtools", \ 72 | "bcftools"]) 73 | 74 | # Start running 75 | command = "mkdir -p " + path_output 76 | subprocess.call(command, shell=True) 77 | prefix = path_output + '/' + sample_id 78 | path_module = os.path.dirname(__file__) + '/' 79 | if flag_scan: 80 | print("[Biastools] Scanning...") 81 | if os.path.exists(bam_file+'.bai'): 82 | pass 83 | else: 84 | command = ["samtools", "index", bam_file] 85 | subprocess.call(command) 86 | 87 | print("[BIASTOOLS] SAMPLE", bam_file, " as ", sample_id + ".baseline ...") 88 | command = ["python3", path_module+"sample_baseline.py", "-b", bam_file, "-f", path_ref, "-o", prefix+".sample"] 89 | print(' '.join(command)) 90 | subprocess.call(command) 91 | 92 | if Range == None: 93 | print("[BIASTOOLS] Process the whole bam file...") 94 | target_bam = bam_file 95 | else: 96 | print("[BIASTOOLS] Extract reads from " + Range + "...") 97 | target_bam = prefix + '.range.bam' 98 | command = ["samtools", "view", " -h", bam_file, Range, "-o", target_bam, "-@", thread] 99 | print(' '.join(command)) 100 | subprocess.call(command) 101 | 102 | print("[BIASTOOLS] Format the mpileup...") 103 | if os.path.exists(prefix+'.'+run_id+'.mpileup'): 104 | print(prefix+'.'+run_id+'.mpileup already exist!') 105 | else: 106 | command = ["bcftools", "mpileup", "--count-orphans", "--annotate", "FORMAT/AD,FORMAT/DP", \ 107 | "-f", path_ref, \ 108 | "--min-BQ", "0", \ 109 | "--min-MQ", "0", \ 110 | "--threads", str(thread), target_bam, "-o", prefix+'.'+run_id+'.mpileup'] 111 | print(' '.join(command)) 112 | subprocess.call(command) 113 | print("[BIASTOOLS] Scanning bias...") 114 | if flag_wig: 115 | command = ["python3", path_module+"scanning_bias.py", "-g", prefix+'.'+run_id+'.mpileup', "--sample", "-b", prefix+".sample.baseline", \ 116 | "-wig", "-o", prefix+'.'+run_id+'.scanning'] 117 | else: 118 | command = ["python3", path_module+"scanning_bias.py", "-g", prefix+'.'+run_id+'.mpileup', "--sample", "-b", prefix+".sample.baseline", \ 119 | "-o", prefix+'.'+run_id+'.scanning'] 120 | print(' '.join(command)) 121 | subprocess.call(command) 122 | 123 | if flag_compare_bam: 124 | if os.path.exists(bam_file+'.bai'): 125 | pass 126 | else: 127 | command = ["samtools", "index", bam_file] 128 | subprocess.call(command) 129 | if os.path.exists(bam_file2+'.bai'): 130 | pass 131 | else: 132 | command = ["samtools", "index", bam_file2] 133 | subprocess.call(command) 134 | 135 | print("[Biastools] Generate common baseline...") 136 | baseline = prefix+"."+run_id+".combine" 137 | command = ["python3", path_module+"merge_baseline.py", "-b1", bam_file, "-b2", bam_file2, "-f", path_ref, "-o", baseline] 138 | #print(' '.join(command)) 139 | subprocess.call(command) 140 | command = ' '.join(["python3", path_module+"scanning_bias.py", "-g", mpileup_file, "-b", baseline+".baseline", "-o", baseline+".1.scanning", ">", prefix+"."+run_id+".log"]) 141 | #print(command) 142 | subprocess.call(command, shell=True) 143 | command = ' '.join(["python3", path_module+"scanning_bias.py", "-g", mpileup_file2, "-b", baseline+".baseline", "-o", baseline+".2.scanning", ">", prefix+"."+run_id+".log"]) 144 | #print(command) 145 | subprocess.call(command, shell=True) 146 | 147 | print("[Biastools] Compare two bam files with common baseline...") 148 | command = ' '.join(["bash", path_module+"biastools_compare.sh", path_output, sample_id, run_id, \ 149 | baseline+".1.scanning.bias.bed", \ 150 | 
baseline+".2.scanning.bias.bed", \ 151 | baseline+".2.scanning.lowRd.bed", \ 152 | path_module]) 153 | print(command) 154 | subprocess.call(command, shell=True) 155 | if flag_compare_rpt: 156 | print("[Biastools] Compare two bed files...") 157 | command = ' '.join(["bash", path_module+"biastools_compare.sh", path_output, sample_id, run_id, bed_file1, bed_file2, lowRd_file2, path_module]) 158 | print(command) 159 | subprocess.call(command, shell=True) 160 | 161 | 162 | 163 | 164 | if __name__ == "__main__": 165 | main() 166 | -------------------------------------------------------------------------------- /biastools/golden_graph_report.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | 6 | import math 7 | import random 8 | import numpy as np 9 | 10 | 11 | colors = ["#bce4ff", "#8bd0fe", "#59bcfc", "#0099fc", "#0086dd", "#006bb1", "#004a7a", "#002740"] 12 | colors = ["#f2dad5", "#e8bfc1", "#d9a4b2", "#c78ba6", "#aa719a", "#8b5b89", "#634271", "#3c2a4f"] 13 | 14 | def map_mapq_to_size(mapq): 15 | if mapq >= 40: 16 | return 0 17 | elif mapq >= 30: 18 | return 1 19 | elif mapq >= 20: 20 | return 2 21 | elif mapq >= 10: 22 | return 3 23 | elif mapq >= 5: 24 | return 4 25 | elif mapq >= 3: 26 | return 5 27 | elif mapq >= 1: 28 | return 6 29 | return 7 30 | 31 | labels = ['>40', '30~40', '20~30', '10~20', '5~10', '3~5', '1~3', '<1'] 32 | 33 | def map_color( 34 | var:float 35 | )-> int: 36 | """ 37 | color_code = int(var/2) 38 | if color_code > 20: 39 | color_code = 20 40 | return color_code 41 | """ 42 | if var > 0.5: 43 | return 0 44 | elif var > 0.3: 45 | return 1 46 | elif var > 0.1: 47 | return 2 48 | elif var > 0.05: 49 | return 3 50 | elif var > 0.01: 51 | return 4 52 | else: 53 | return 5 54 | 55 | p_labels = ['>0.5', '0.3~0.5', '0.1~0.3', '0.05~0.1', '0.01~0.05', '<0.01'] 56 | 57 | def map_num_to_size(num): 58 | if num == 0: 59 | return 0 60 | elif num <= 3: 61 | return 1 62 | elif num <= 5: 63 | return 2 64 | elif num <= 10: 65 | return 3 66 | elif num <= 15: 67 | return 4 68 | elif num <= 20: 69 | return 5 70 | elif num <= 30: 71 | return 6 72 | return 7 73 | 74 | n_labels = ['0', '1~3', '4~6', '7~10', '11~15', '16~20', '21~30', '>30'] 75 | 76 | 77 | def dist_origin(a, b): 78 | return math.dist((0,a) + (b,0)) 79 | 80 | 81 | 82 | def plot_golden(out_prefix, df_use): 83 | # Add columns 84 | mapQ = list(df_use['AVG_MAPQ']) 85 | pValue = list(df_use['EVEN_P_VALUE']) 86 | 87 | sp = pd.DataFrame() 88 | sp['ALLELIC BALANCE'] = list(df_use['BALANCE']) 89 | sp['MAPPING BALANCE'] = list(df_use['MAP_BALANCE']) 90 | sp['SIMULATION BALANCE'] = list(df_use['SIM_BALANCE']) 91 | sp.head() 92 | 93 | mapped_mapQ = [map_mapq_to_size(q) for q in mapQ] 94 | mapped_p = [map_color(p) for p in pValue] 95 | sp['Avg_MapQ_code'] = mapped_mapQ 96 | sp['Even_p_value'] = mapped_p 97 | sp['Assign_other'] = [map_num_to_size(n) for n in list(df_use['OTHER']) ] 98 | sp['Map_other'] = [map_num_to_size(n) for n in list(df_use['MIS_MAP']) ] 99 | sp['MapQ'] = list(mapQ) 100 | 101 | #================== color map ==================== 102 | set_mapQ_value = set(sp['Avg_MapQ_code']) 103 | color_mapQ = [] 104 | for idx in sorted(set_mapQ_value): 105 | color_mapQ.append(colors[idx]) 106 | 107 | set_misMap_value = set(sp['Map_other']) 108 | color_misMap = [] 109 | for idx in sorted(set_misMap_value): 110 | color_misMap.append(colors[idx]) 111 | 112 | #=========================== all merged 
plot ============================ 113 | print("Ploting the Merged golden distribution Plot!") 114 | sp['Normalized Assignment Balance'] = list(df_use['BALANCE']-df_use['SIM_BALANCE']) # the average map_q score 115 | sp['Normalized Mapping Balance'] = list(df_use['MAP_BALANCE']-df_use['SIM_BALANCE']) # the average map_q score 116 | #ax = sns.jointplot(x="Normalized Mapping Balance", y="Normalized Assignment Balance", hue = "Avg_MapQ_code", data = sp, \ 117 | # xlim=(-0.6,0.6), ylim=(-0.6,0.6), palette=sns.color_palette(color_mapQ)) 118 | ax = sns.jointplot(x="Normalized Mapping Balance", y="Normalized Assignment Balance", hue = "Map_other", data = sp, \ 119 | xlim=(-0.8,0.8), ylim=(-0.8,0.8), palette=sns.color_palette(color_misMap)) 120 | ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.2) 121 | ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.2) 122 | ax.ax_joint.get_legend().remove() 123 | h, l = ax.ax_joint.get_legend_handles_labels() 124 | #plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(0, 0), loc='lower right', borderaxespad=0.2) 125 | plt.legend(h, n_labels, title="Mismapped Gain#", bbox_to_anchor=(0, 0), loc='lower right', borderaxespad=0.2) 126 | #plt.savefig(out_prefix + '.mismap.pdf') 127 | 128 | #print(df_use[sp['Normalized Assignment Balance']**2 + sp['Normalized Mapping Balance']**2 > 0.01]) 129 | biased = (sp['Normalized Assignment Balance']**2 + sp['Normalized Mapping Balance']**2 > 0.01) 130 | b_loss = ((sp['Normalized Assignment Balance'] < sp['Normalized Mapping Balance']*2 + 0.1)*(sp['Normalized Assignment Balance']*2 + \ 131 | 0.1 > sp['Normalized Mapping Balance'])) + \ 132 | ((sp['Normalized Assignment Balance'] + 0.1 > sp['Normalized Mapping Balance']*2)*(sp['Normalized Assignment Balance']*2 \ 133 | < sp['Normalized Mapping Balance'] + 0.1)) 134 | b_flux = (sp['Normalized Assignment Balance'] > 0.1)*(sp['Map_other'] >= 3) + \ 135 | (sp['Normalized Assignment Balance'] < -0.1)*(sp['Map_other'] >= 3) 136 | b_artifact = (sp['Normalized Assignment Balance'] > 0.1)*(sp['Map_other'] < 3) + \ 137 | (sp['Normalized Assignment Balance'] < -0.1)*(sp['Map_other'] < 3) 138 | 139 | sp['Category'] = biased*4 140 | sp['Category'] -= (biased * b_loss)*3 141 | sp['Category'] -= (biased * ~b_loss * b_flux)*2 142 | sp['Category'] -= (biased * ~b_loss * b_artifact)*1 143 | labels = ['Balanced', 'Bias (Loss)', 'Bias (Flux)', 'Bias (Local)', 'Outliers'] 144 | 145 | custom_palette = sns.color_palette('Set2') 146 | custom_palette = custom_palette[:4] + custom_palette[-1:] 147 | ax = sns.jointplot(x="Normalized Mapping Balance", y="Normalized Assignment Balance", hue = "Category", data = sp, \ 148 | xlim=(-0.8,0.8), ylim=(-0.8,0.8), palette=custom_palette) 149 | ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.2) 150 | ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.2) 151 | ax.ax_joint.get_legend().remove() 152 | h, l = ax.ax_joint.get_legend_handles_labels() 153 | plt.legend(h, labels, title="Category#", bbox_to_anchor=(1, 0), loc='lower right', borderaxespad=0.2) 154 | plt.savefig(out_prefix + '.category.pdf') 155 | 156 | print("-------------------------------------------") 157 | print("Number of balanced:", sum(sp['Category'] == 0)) 158 | print("Number of bias_loss:", sum(sp['Category'] == 1)) 159 | print("Number of bias_flux:", sum(sp['Category'] == 2)) 160 | print("Number of bias_local:", sum(sp['Category'] == 3)) 161 | print("Number of outliers:", sum(sp['Category'] == 4)) 162 | 
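# --- Descriptive comment added for clarity; the worked numbers below are illustrative, not taken from any report ---
# A het site is flagged as biased when its squared distance from the origin in the
# (Normalized Mapping Balance, Normalized Assignment Balance) plane exceeds 0.01, i.e. the combined shift is > 0.1.
# Biased sites are then encoded as 1 = Bias (Loss) when the assignment shift tracks the mapping shift
# (the factor-of-two band tested above), 2 = Bias (Flux) when the assignment shift exceeds 0.1 together with a
# substantial mismapped-read gain (Map_other code >= 3), 3 = Bias (Local) when it exceeds 0.1 without such a gain,
# and 4 = Outliers otherwise; unbiased sites stay at 0 = Balanced.
# Example: assignment shift 0.2, mapping shift 0.05, Map_other code 1 -> 0.2**2 + 0.05**2 = 0.0425 > 0.01 (biased),
# the loss and flux tests both fail, so the site lands in Category 3, 'Bias (Local)'.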
print("-------------------------------------------") 163 | 164 | df_use.loc[(sp['Category'] == 0).values, :].to_csv(out_prefix + '.balanced.tsv', index=False, sep="\t") 165 | df_use.loc[((sp['Category'] == 1)*(sp['Normalized Assignment Balance'] > 0)).values, :].to_csv(out_prefix + '.bias-loss.1.tsv', index=False, sep="\t") 166 | df_use.loc[((sp['Category'] == 1)*(sp['Normalized Assignment Balance'] < 0)).values, :].to_csv(out_prefix + '.bias-loss.2.tsv', index=False, sep="\t") 167 | df_use.loc[((sp['Category'] == 2)*(sp['Normalized Assignment Balance'] > 0)).values, :].to_csv(out_prefix + '.bias-flux.1.tsv', index=False, sep="\t") 168 | df_use.loc[((sp['Category'] == 2)*(sp['Normalized Assignment Balance'] < 0)).values, :].to_csv(out_prefix + '.bias-flux.2.tsv', index=False, sep="\t") 169 | df_use.loc[((sp['Category'] == 3)*(sp['Normalized Assignment Balance'] > 0)).values, :].to_csv(out_prefix + '.bias-local.1.tsv', index=False, sep="\t") 170 | df_use.loc[((sp['Category'] == 3)*(sp['Normalized Assignment Balance'] < 0)).values, :].to_csv(out_prefix + '.bias-local.2.tsv', index=False, sep="\t") 171 | df_use.loc[(sp['Category'] == 4).values, :].to_csv(out_prefix + '.bias-outlier.tsv', index=False, sep="\t") 172 | df_use.loc[(sp['Map_other'] > 4).values, :].to_csv(out_prefix + '.bias-mismap_gain.tsv', index=False, sep="\t") 173 | 174 | 175 | 176 | 177 | if __name__ == "__main__": 178 | parser = argparse.ArgumentParser() 179 | parser.add_argument('-mb', '--bias_report', help='bias report, must contain the golden information') 180 | parser.add_argument('-qt', '--quality_threshold', help='threshold that filtered the sites with avg_mapQ below the threshold', type=int) 181 | parser.add_argument('-out', '--output_prefix', help='the prefix for the output plots and report') 182 | args = parser.parse_args() 183 | 184 | fn_bias = args.bias_report 185 | mapQ_th = args.quality_threshold 186 | output_prefix = args.output_prefix 187 | if output_prefix == None: 188 | output_prefix = fn_bias 189 | 190 | df_use = pd.read_csv(fn_bias, sep='\t') 191 | if mapQ_th: 192 | df_use = df_use[df_use['AVG_MAPQ'] >= mapQ_th] 193 | df_use.head() 194 | 195 | plot_golden(output_prefix, df_use) 196 | 197 | -------------------------------------------------------------------------------- /biastools/biastools.py: -------------------------------------------------------------------------------- 1 | # Wrap up python file for the biastools 1st and 2nd module 2 | import subprocess 3 | import sys 4 | import os 5 | import argparse 6 | from shutil import which 7 | 8 | def is_tool(name): 9 | """Check whether `name` is on PATH and marked as executable.""" 10 | return which(name) is not None 11 | 12 | 13 | def check_program_install(list_names): 14 | flag_violate = False 15 | for name in list_names: 16 | if is_tool(name) == False: 17 | print(name, "is a prerequisite program, please install it before running biastools") 18 | flag_violate = True 19 | if flag_violate: 20 | print("Use --force option if you want to disable the prerequisite program check.") 21 | exit(1) 22 | 23 | 24 | def bool2str(flag): 25 | if flag: 26 | return "1" 27 | else: 28 | return "0" 29 | 30 | 31 | def catch_assert(parser, message): 32 | print('\n', message, '\n') 33 | parser.print_usage() 34 | exit(1) 35 | 36 | 37 | 38 | 39 | def main(): 40 | parser = argparse.ArgumentParser(description="Simulation/Alignment/Analyzing/Prediction module of the Biastools v0.3.1") 41 | parser.add_argument('--version', action='version', version='%(prog)s 0.3.1') 42 | 
parser.add_argument('-o', '--out', help="Path to output directory ['out_dir'].", default="out_dir") 43 | parser.add_argument('-g', '--genome', help="Path to the reference genome.") 44 | parser.add_argument('-v', '--vcf', help="Path to the personal vcf file.") 45 | parser.add_argument('-s', '--sample_id', help="Sample ID ['sample'].", default="sample") 46 | parser.add_argument('-r', '--run_id', help="Run ID ['run'].", default="run") 47 | # Process options 48 | parser.add_argument('--simulate', help='[1] Option to run biastools simulation.', action='store_true') 49 | parser.add_argument('--align', help='[2] Option to run biastools align.', action='store_true') 50 | parser.add_argument('--analyze', help='[3] Option to run biastools analyze.', action='store_true') 51 | parser.add_argument('--predict', help='[4] Option to predict bias from analysis report.', action='store_true') 52 | 53 | parser.add_argument('-t', '--thread', help="Number of threads to use [max].", type=int) 54 | parser.add_argument('--force', help="running the program without checking prerequisite programs.", action='store_true') 55 | # [1] 56 | parser.add_argument('-x', '--coverage', help="Read coverage to simulate [30].", type=int, default=30) 57 | # [2] 58 | parser.add_argument('-a', '--aligner', help="Aligner to use (bowtie2|bwamem) [bowtie2]", default="bowtie2") 59 | parser.add_argument('-b', '--align_index', help="Path to the aligner index (target reference)") 60 | # [3] 61 | parser.add_argument('-i', '--bam', help="Path to the alignment bam file, should be sorted [out_dir/sample.run_id.sorted.bam].") 62 | parser.add_argument('-n', '--naive', help= "Option to run the naive assignment method [False].", action='store_true') 63 | parser.add_argument('-R', '--real', help= "Option for performing analysis on real data [False].", action='store_true') 64 | parser.add_argument('-d', '--boundary', help= "Boundary to plot the indel balance plot [20]", type=int, default=20) 65 | parser.add_argument('-lr', '--list_report', help= "List of bias report to plot the indel balance plot", nargs='+') 66 | parser.add_argument('-ld', '--list_run_id', help= "List of run ID for namings in the indel balance plot", nargs='+') 67 | # [4] 68 | parser.add_argument('-ps', '--sim_report', help= "Path to the simulation report.") 69 | parser.add_argument('-pr', '--real_report', help= "Path to the real read report [out_dir/sample.real.run.bias].") 70 | args = parser.parse_args() 71 | 72 | ##### Parameters for biastool_analysis 73 | path_output = args.out 74 | path_ref = args.genome 75 | path_vcf = args.vcf 76 | sample_id = args.sample_id 77 | run_id = args.run_id 78 | bam_file = args.bam 79 | if bam_file == None: 80 | bam_file = path_output + '/' + sample_id + '.' 
+ run_id + '.sorted.bam' 81 | 82 | flag_simulate = args.simulate 83 | flag_align = args.align 84 | flag_analyze = args.analyze 85 | flag_predict = args.predict 86 | 87 | path_module = os.path.dirname(__file__) + '/' 88 | try: 89 | assert flag_simulate + flag_align + flag_analyze + flag_predict >= 1 90 | except AssertionError: 91 | catch_assert(parser, "At least one of the --simulate/--align/--analyze/--predict options should be specified.") 92 | 93 | flag_force = args.force 94 | thread = args.thread 95 | if thread == None: 96 | if sys.platform == "darwin": 97 | result = subprocess.run(["sysctl -n hw.ncpu"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True) 98 | else: 99 | result = subprocess.run(["nproc"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True) 100 | thread = int(result.stdout.strip()) 101 | 102 | coverage = args.coverage 103 | aligner = args.aligner 104 | align_index = args.align_index 105 | try: 106 | assert aligner=="bowtie2" or aligner=="bwamem" 107 | except AssertionError: 108 | catch_assert(parser, "Only bowtie2 and bwamem are supported.") 109 | 110 | flag_naive = args.naive 111 | flag_real = args.real 112 | boundary = args.boundary 113 | list_report = args.list_report 114 | list_run_id = args.list_run_id 115 | if list_report: 116 | try: 117 | assert len(list_report) == len(list_run_id) 118 | except AssertionError: 119 | catch_assert(parser, "The numbers of --list_report and --list_run_id entries are inconsistent.") 120 | 121 | sim_report = args.sim_report 122 | real_report = args.real_report 123 | if flag_predict: 124 | try: 125 | assert real_report != None 126 | except AssertionError: 127 | catch_assert(parser, "--real_report (-pr) should be specified when using --predict") 128 | 129 | 130 | 131 | # Check that the prerequisite programs are installed 132 | if flag_force != True: 133 | list_program = ["bedtools", \ 134 | "samtools", \ 135 | "bcftools", \ 136 | "gzip", \ 137 | "tabix"] 138 | if flag_align: 139 | list_program += ["bwa", "bowtie2"] 140 | if flag_simulate: 141 | list_program.append("mason_simulator") 142 | check_program_install( list_program ) 143 | 144 | # Start running 145 | command = "mkdir -p " + path_output 146 | subprocess.call(command, shell=True) 147 | 148 | if flag_simulate: 149 | try: 150 | assert path_ref != None 151 | assert path_vcf != None 152 | except AssertionError: 153 | catch_assert(parser, "--genome (-g) and --vcf (-v) should be specified when using --simulate") 154 | print("[Biastools] Simulate...") 155 | command = ' '.join(["bash", path_module+"biastools_simulation.sh", path_ref, path_vcf, path_output, sample_id, str(thread), str(coverage), path_module]) 156 | #print(command) 157 | subprocess.call(command, shell=True) 158 | if flag_align: 159 | try: 160 | assert path_ref != None 161 | assert path_vcf != None 162 | except AssertionError: 163 | catch_assert(parser, "--genome (-g) and --vcf (-v) should be specified when using --align") 164 | if align_index == None: 165 | align_index = path_ref 166 | print("[Biastools] Align...") 167 | command = ' '.join(["bash", path_module+"biastools_align.sh", path_ref, path_vcf, path_output, sample_id, str(thread), aligner, align_index, run_id, path_module]) 168 | #print(command) 169 | subprocess.call(command, shell=True) 170 | if flag_analyze: 171 | if list_report != None: 172 | print("[Biastools] Plot the indel balance plot for multiple bias reports...") 173 | if flag_real: 174 | subprocess.call(['python3', path_module+'indel_balance_plot.py', "-lr"] + list_report + ["-ln"] + list_run_id + [ \ 175 | "-vcf", 
path_output+"/"+sample_id+".het.vcf.gz", "-bd", str(boundary), "-map", \ 176 | "-out", path_output+"/"+sample_id+"."+run_id+".real", "-real"]) 177 | else: 178 | subprocess.call(['python3', path_module+'indel_balance_plot.py', "-lr"] + list_report + ["-ln"] + list_run_id + [ \ 179 | "-vcf", path_output+"/"+sample_id+".het.vcf.gz", "-bd", str(boundary), "-map", \ 180 | "-out", path_output+"/"+sample_id+"."+run_id+".sim"]) 181 | else: 182 | try: 183 | assert path_ref != None 184 | assert path_vcf != None 185 | except AssertionError: 186 | catch_assert(parser, " and should be specified when using --analyze") 187 | print("[Biastools] Analyze and plot...") 188 | command = ' '.join(["bash", path_module+"biastools_analysis.sh", path_ref, path_vcf, path_output, sample_id, str(thread), run_id, bool2str(flag_real), \ 189 | bool2str(flag_naive), str(boundary), path_module, bam_file]) 190 | #print(command) 191 | subprocess.call(command, shell=True) 192 | if flag_predict: 193 | print("[Biastools] Predict bias...") 194 | command = ' '.join(["bash", path_module+"biastools_predict.sh", path_output, sample_id, run_id, bool2str(flag_real), real_report, sim_report, path_module]) 195 | #print(command) 196 | subprocess.call(command, shell=True) 197 | 198 | 199 | 200 | 201 | 202 | if __name__ == "__main__": 203 | main() 204 | 205 | 206 | 207 | 208 | -------------------------------------------------------------------------------- /biastools/predict_experiment.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | import numpy as np 5 | from matplotlib.colors import ListedColormap 6 | import pandas as pd 7 | import random 8 | 9 | from sklearn import datasets, metrics 10 | from sklearn.metrics import roc_curve, precision_recall_curve, auc 11 | 12 | 13 | 14 | def get_label(df_simulation): 15 | """ 16 | sort and label the simulated data, real data 17 | """ 18 | sp = pd.DataFrame() 19 | sp['Map_other'] = list(df_simulation['MIS_MAP']) 20 | sp['Normalized Allelic Balance'] = list(df_simulation['BALANCE']-df_simulation['SIM_BALANCE']) # the average map_q score 21 | sp['Normalized Mapping Balance'] = list(df_simulation['MAP_BALANCE']-df_simulation['SIM_BALANCE']) # the average map_q score 22 | 23 | biased = (sp['Normalized Allelic Balance']**2 + sp['Normalized Mapping Balance']**2 > 0.01) 24 | b_loss = ((sp['Normalized Allelic Balance'] < sp['Normalized Mapping Balance']*2 + 0.1) * \ 25 | (sp['Normalized Allelic Balance']*2 + 0.1 > sp['Normalized Mapping Balance'])) 26 | b_flux = (sp['Normalized Allelic Balance'] > 0.1)*(sp['Map_other'] > 4) 27 | b_artifact = (sp['Normalized Allelic Balance'] > 0.1)*(sp['Map_other'] <= 4) 28 | 29 | sp['Category'] = biased*4 30 | sp['Category'] -= (biased * b_loss)*3 31 | sp['Category'] -= (biased * ~b_loss * b_flux)*2 32 | sp['Category'] -= (biased * ~b_loss * b_artifact)*1 33 | 34 | sp['binary_category'] = (sp['Category'] > 0) 35 | return sp 36 | 37 | 38 | def print_accuracy(predict, label): 39 | print("Correct Num:", np.sum(predict == label)) 40 | TP = np.sum((predict == label) * (predict != 0)) 41 | FP = np.sum((predict != label) * (predict != 0)) 42 | FN = np.sum((predict != label) * (predict == 0)) 43 | print("True Positive:", TP) 44 | print("False Positive:", FP) 45 | print("False Negative:", FN) 46 | print("Precision:", TP/(TP+FP)) 47 | print("Recall:", TP/(TP+FN)) 48 | 49 | 50 | def combine_score(sim_feature, sim_label, real_feature, real_label, miss_info, 
best_threshold, out_prefix): 51 | """ 52 | quality score * balance score 53 | """ 54 | sim_feature['label'] = sim_label 55 | sim_feature['z_MAPQ'] = ((sim_feature['AVG_MAPQ'] - 45) * -1).clip(lower=0) 56 | sim_feature['combine_score'] = (sim_feature['z_MAPQ']) * (sim_feature['BALANCE']) #* (sim_feature['BALANCE']) 57 | sim_feature['plus_score'] = (sim_feature['z_MAPQ']/45) + 1.5*sim_feature['BALANCE'] 58 | sim_feature['mix_score'] = sim_feature['plus_score'] + sim_feature['combine_score'] / 20 59 | 60 | fpr_m, tpr_m, thresholds = metrics.roc_curve(sim_feature['label'], sim_feature['combine_score'], pos_label=True) 61 | fpr_p, tpr_p, thresholds = metrics.roc_curve(sim_feature['label'], sim_feature['plus_score'], pos_label=True) 62 | plt.plot(fpr_m, tpr_m, label="simulation_mul, auc="+str(round(auc(fpr_m,tpr_m),2))) 63 | plt.plot(fpr_p, tpr_p, label="simulation_add, auc="+str(round(auc(fpr_p,tpr_p),2))) 64 | 65 | real_feature['label'] = real_label 66 | real_feature['z_MAPQ'] = ((real_feature['AVG_MAPQ'] - 45) * -1).clip(lower=0) 67 | real_feature['combine_score'] = (real_feature['z_MAPQ']) * (real_feature['BALANCE']) #* (real_feature['BALANCE']) 68 | real_feature['plus_score'] = (real_feature['z_MAPQ']/45) + 1.5*real_feature['BALANCE'] 69 | r_fpr_m, r_tpr_m, thresholds = metrics.roc_curve(real_feature['label'], real_feature['combine_score'], pos_label=True) 70 | r_fpr_p, r_tpr_p, thresholds = metrics.roc_curve(real_feature['label'], real_feature['plus_score'], pos_label=True) 71 | plt.plot(r_fpr_m, r_tpr_m, label="real_mul, auc="+str(round(auc(r_fpr_m, r_tpr_m),2))) 72 | plt.plot(r_fpr_p, r_tpr_p, label="real_add, auc="+str(round(auc(r_fpr_p, r_tpr_p),2))) 73 | 74 | plt.xlabel('False Positive Rate') 75 | plt.ylabel('True Positive Rate') 76 | plt.legend() 77 | plt.savefig(out_prefix + "_ROC.pdf") 78 | plt.clf() 79 | 80 | 81 | precision, recall, thresholds = precision_recall_curve(sim_feature['label'], sim_feature['combine_score']) 82 | precision_p, recall_p, thresholds = precision_recall_curve(sim_feature['label'], sim_feature['plus_score']) 83 | r_precision, r_recall, thresholds = precision_recall_curve(real_feature['label'], real_feature['combine_score']) 84 | r_precision_p, r_recall_p, thresholds = precision_recall_curve(real_feature['label'], real_feature['plus_score']) 85 | 86 | plt.plot(recall, precision, label="simulation_mul, auc="+str(round(auc(recall, precision),2))) 87 | plt.plot(recall_p, precision_p, label="simulation_add, auc="+str(round(auc(recall_p, precision_p),2))) 88 | plt.plot(r_recall, r_precision, label="real_mul, auc="+str(round(auc(r_recall, r_precision),2))) 89 | plt.plot(r_recall_p, r_precision_p, label="real_add, auc="+str(round(auc(r_recall_p, r_precision_p),2))) 90 | 91 | plt.xlabel('Recall') 92 | plt.ylabel('Precision') 93 | plt.legend() 94 | plt.savefig(out_prefix + "_PRC.pdf") 95 | plt.clf() 96 | 97 | print("====== sim featue ========") 98 | print_accuracy(sim_feature['plus_score'] > best_threshold, sim_feature['label']) 99 | print("====== real featue ========") 100 | print_accuracy(real_feature['plus_score'] > best_threshold, real_feature['label']) 101 | print("======= overlap =========") 102 | print_accuracy(sim_feature[~miss_info]['plus_score'] > best_threshold, real_feature['plus_score'] > 1.5) 103 | print("sim label True", np.sum(sim_feature['label'])) 104 | print("sim feature", np.sum(sim_feature['plus_score'] > best_threshold)) 105 | print("real feature", np.sum(real_feature['plus_score'] > best_threshold)) 106 | FP = (sim_feature['plus_score'] > 
best_threshold)* ~(sim_feature['label']) 107 | FN = (sim_feature['plus_score'] <= best_threshold)* (sim_feature['label']) 108 | return FP, FN 109 | 110 | 111 | 112 | 113 | if __name__ == '__main__': 114 | parser = argparse.ArgumentParser() 115 | parser.add_argument('-sr', '--simulation_report', help='the simulation bias report') 116 | parser.add_argument('-rr', '--real_report', help='the real data bias report') 117 | parser.add_argument('-thr', '--threshold', help='the threshold for prediction model [1.5]', type=float, default=1.5) 118 | parser.add_argument('-out', '--out_prefix', help='the prefix for the output plots [predict]', type=str, default='predict') 119 | args = parser.parse_args() 120 | 121 | fn_simulation = args.simulation_report 122 | fn_real = args.real_report 123 | best_th = args.threshold 124 | out_prefix = args.out_prefix 125 | 126 | df_simulation = pd.read_csv(fn_simulation, sep='\t') 127 | df_real = pd.read_csv(fn_real, sep='\t') 128 | 129 | sp_label = get_label(df_simulation) 130 | 131 | # filter out sites suspected of having incomplete vcf information 132 | miss_info = (df_real['OTHER'] > df_real['NUM_READS'] * 0.9) + (df_real['OTHER'] > df_real['NUM_READS'] * 0.4) * \ 133 | ( (df_real['REF'] == 0) + (df_real['ALT'] == 0 )) 134 | no_info = df_simulation['AVG_MAPQ'].isnull() 135 | no_info += df_simulation['MAP_BALANCE'].isnull() 136 | no_info += df_simulation['BALANCE'].isnull() 137 | miss_info += no_info 138 | df_simulation = df_simulation[~no_info] 139 | sp_label = sp_label[~no_info] 140 | print("filtered number:", sum(miss_info)) 141 | 142 | df_real_test = df_real[~miss_info] 143 | sp_real_label = sp_label[~miss_info] 144 | FP, FN = combine_score(df_simulation, sp_label.iloc[:, 4].values, df_real_test, sp_real_label.iloc[:, 4].values, \ 145 | miss_info, best_th, out_prefix) 146 | 147 | # print data of FP and FN 148 | with pd.option_context('display.max_rows', None): # more options can be specified also 149 | print("False Positive:") 150 | print(df_simulation[FP]) 151 | print("==================================================") 152 | print("False Negative:") 153 | print(df_simulation[FN]) 154 | 155 | # plot the false positive and false negative sites 156 | labels = ['Balanced', 'Bias (Loss)', 'Bias (Flux)', 'Bias (Local)', 'Outliers'] 157 | idx_cat = set(sp_label[FP+FN]["Category"]) 158 | labels = [labels[idx] for idx in sorted(idx_cat)] 159 | ax = sns.jointplot(x="Normalized Mapping Balance", y="Normalized Allelic Balance", hue = "Category", data = sp_label[FP+FN], \ 160 | xlim=(-0.8,0.8), ylim=(-0.8,0.8), palette='Set2') 161 | ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.2) 162 | ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.2) 163 | ax.fig.suptitle("False Positive and False Negative") 164 | ax.fig.tight_layout() 165 | ax.ax_joint.get_legend().remove() 166 | h, l = ax.ax_joint.get_legend_handles_labels() 167 | plt.legend(h, labels, title="Category#", bbox_to_anchor=(0, 0), loc='lower left', borderaxespad=0.2) 168 | plt.savefig(out_prefix + '_FP_and_FN.pdf') 169 | plt.clf() 170 | 171 | labels = ['Balanced', 'Bias (Loss)', 'Bias (Flux)', 'Bias (Local)', 'Outliers'] 172 | idx_cat = set(sp_label[~(FP+FN)]["Category"]) 173 | labels = [labels[idx] for idx in sorted(idx_cat)] 174 | ax = sns.jointplot(x="Normalized Mapping Balance", y="Normalized Allelic Balance", hue = "Category", data = sp_label[~(FP+FN)], \ 175 | xlim=(-0.8,0.8), ylim=(-0.8,0.8), palette='Set2') 176 | ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.2) 177 | 
ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.2) 178 | ax.fig.suptitle("True Positive and True Negative") 179 | ax.fig.tight_layout() 180 | ax.ax_joint.get_legend().remove() 181 | h, l = ax.ax_joint.get_legend_handles_labels() 182 | plt.legend(h, labels, title="Category#", bbox_to_anchor=(0, 0), loc='lower left', borderaxespad=0.2) 183 | plt.savefig(out_prefix + '_TP_and_TN.pdf') 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /biastools/indel_balance_plot.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | 5 | import math 6 | import numpy as np 7 | import pysam 8 | 9 | 10 | 11 | def read_bias_report(fn_bias_report): 12 | list_bias_SNP = [] 13 | list_bias_gap = [] 14 | f = open(fn_bias_report, 'r') 15 | header = f.readline() 16 | for line in f: 17 | fields = line.split() 18 | if fields[-1] == '.': 19 | list_bias_gap.append(fields) 20 | else: 21 | list_bias_SNP.append(fields) 22 | f.close() 23 | return list_bias_SNP, list_bias_gap 24 | 25 | 26 | def calculate_SNP_balance(assign_SNP, flag_real): 27 | """ 28 | Return for simulated read: 29 | [[simulate_balance], [map_balance], [assign_balance]] 30 | Return for real read: 31 | [assign_balance] 32 | """ 33 | if flag_real: 34 | record = [[float(fields[5]) for fields in assign_SNP]] 35 | else: 36 | record = [[],[],[]] 37 | for idx in range(len(assign_SNP)): 38 | record[0].append(float(assign_SNP[idx][14])) 39 | record[1].append(float(assign_SNP[idx][10])) 40 | record[2].append(float(assign_SNP[idx][5])) 41 | return record 42 | 43 | 44 | def calculate_gap_balance(assign_gap, f_vcf, len_bd, get_idx): 45 | list_insert = [ [] for _ in range(len_bd) ] 46 | list_delete = [ [] for _ in range(len_bd) ] 47 | for idx in range(len(assign_gap)): 48 | ref_name = assign_gap[idx][0] 49 | var_start = int(assign_gap[idx][1]) 50 | var_segment = f_vcf.fetch(contig=ref_name, start=var_start-1, stop=var_start+1) # get exactly the variant at the site 51 | for var in var_segment: 52 | if var.start+1 != var_start: 53 | continue 54 | len_ref = len(var.ref) 55 | if len(var.alts) == 1: 56 | len_alt = len(var.alts[0]) 57 | else: 58 | hap = var.samples[0]['GT'] 59 | if hap[0] != 0: 60 | len_alt = len(var.alts[hap[0]-1]) 61 | else: 62 | len_alt = len(var.alts[hap[1]-1]) 63 | 64 | if len_ref > len_alt: # deletion 65 | diff = min(len_ref - len_alt -1, len_bd-1) 66 | record = float(assign_gap[idx][get_idx]) 67 | list_delete[diff].append(record) 68 | else: # 0 and insertions 69 | diff = min(len_alt - len_ref -1, len_bd-1) 70 | record = float(assign_gap[idx][get_idx]) 71 | list_insert[diff].append(record) 72 | return list_insert, list_delete 73 | 74 | 75 | def addlabels(x, y, len_bd): 76 | for i in range(len(x)): 77 | # Format numbers: use 'k' for values ≥1000, no decimal points 78 | if y[i] >= 1000: 79 | label = f'{int(y[i]/1000)}k' 80 | else: 81 | label = str(int(y[i])) 82 | plt.text(i-len_bd, y[i], label, ha='center', va='bottom', fontsize=8) # Added 30 degree rotation 83 | 84 | 85 | def plot_balance(balance_delete, balance_SNP, balance_insert, output_name, len_bd, list_incidents, list_plot_name, use_median=False): 86 | len_plot = len(list_plot_name) 87 | balance_list = [np.zeros(2*len_bd+1) for idx in range(len_plot)] 88 | balance_25th = [np.zeros(2*len_bd+1) for idx in range(len_plot)] 89 | balance_75th = [np.zeros(2*len_bd+1) for idx in range(len_plot)] 90 | 91 | # Process 
deletions 92 | for idy, list_delete in enumerate(balance_delete): 93 | for idx in range(len_bd): 94 | list_balance = np.array(list_delete[idx]) 95 | if len(list_balance) > 1: 96 | valid_balance = list_balance[~np.isnan(list_balance)] 97 | # Calculate 1 - value for all statistics 98 | flipped_balance = 1 - valid_balance 99 | balance_list[idy][len_bd-1-idx] = np.median(flipped_balance) if use_median else np.mean(flipped_balance) 100 | # Note: when we flip values, 75th becomes 25th and vice versa 101 | balance_25th[idy][len_bd-1-idx] = np.quantile(flipped_balance, 0.25) # Was 0.75 102 | balance_75th[idy][len_bd-1-idx] = np.quantile(flipped_balance, 0.75) # Was 0.25 103 | else: 104 | balance_list[idy][len_bd-1-idx] = np.nan 105 | balance_25th[idy][len_bd-1-idx] = np.nan 106 | balance_75th[idy][len_bd-1-idx] = np.nan 107 | 108 | # Process SNPs 109 | for idy, list_balance in enumerate(np.array(balance_SNP)): 110 | valid_balance = list_balance[~np.isnan(list_balance)] 111 | flipped_balance = 1 - valid_balance 112 | balance_list[idy][len_bd] = np.median(flipped_balance) if use_median else np.mean(flipped_balance) 113 | balance_25th[idy][len_bd] = np.quantile(flipped_balance, 0.25) # Was 0.75 114 | balance_75th[idy][len_bd] = np.quantile(flipped_balance, 0.75) # Was 0.25 115 | 116 | # Process insertions 117 | for idy, list_insert in enumerate(balance_insert): 118 | for idx in range(len_bd): 119 | list_balance = np.array(list_insert[idx]) 120 | if len(list_balance) > 1: 121 | valid_balance = list_balance[~np.isnan(list_balance)] 122 | flipped_balance = 1 - valid_balance 123 | balance_list[idy][len_bd+1+idx] = np.median(flipped_balance) if use_median else np.mean(flipped_balance) 124 | balance_25th[idy][len_bd+1+idx] = np.quantile(flipped_balance, 0.25) # Was 0.75 125 | balance_75th[idy][len_bd+1+idx] = np.quantile(flipped_balance, 0.75) # Was 0.25 126 | else: 127 | balance_list[idy][idx+len_bd+1] = np.nan 128 | balance_25th[idy][idx+len_bd+1] = np.nan 129 | balance_75th[idy][idx+len_bd+1] = np.nan 130 | 131 | t = list(range(-len_bd, len_bd+1)) 132 | f, (a0, a1) = plt.subplots(2, 1, gridspec_kw={ 133 | 'height_ratios': [3, 1], 134 | 'hspace': 0.1 135 | }) 136 | f.set_size_inches(20, 10) # Slightly taller to accommodate labels 137 | 138 | prop_cycle = plt.rcParams['axes.prop_cycle'] 139 | colors = prop_cycle.by_key()['color'] 140 | 141 | # Adjust the subplot parameters to give specified padding 142 | f.subplots_adjust(right=0.85, hspace=0.1) # Make room for legend on right 143 | 144 | for idx, name in enumerate(list_plot_name): 145 | # Calculate error bar lengths 146 | yerr_minus = balance_list[idx] - balance_25th[idx] 147 | yerr_plus = balance_75th[idx] - balance_list[idx] 148 | # make sure the error bar is not negative 149 | yerr_minus = np.maximum(yerr_minus, 0) 150 | yerr_plus = np.maximum(yerr_plus, 0) 151 | yerr = np.vstack((yerr_minus, yerr_plus)) 152 | 153 | # Plot with asymmetric error bars 154 | a0.errorbar(t, balance_list[idx], # Removed (1-balance_list[idx]) since we already flipped 155 | yerr=yerr, 156 | capsize=3, fmt='-o', label=name, color=colors[idx], 157 | markersize=6, elinewidth=1, capthick=1) 158 | 159 | # Move legend inside the upper panel, near the bottom 160 | a0.legend(frameon=True, fancybox=True, framealpha=0.9, 161 | loc='lower center', # Place at bottom center 162 | bbox_to_anchor=(0.5, 0.05), # Position slightly above bottom 163 | ncol=2) # Two columns for better space usage 164 | 165 | a0.axhline(y=0.5, color='gray', linestyle='dashdot', linewidth=0.9) 166 | 
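# Descriptive comment (added for clarity): every balance value is flipped to 1 - balance in the processing loops
# above, so the curves show the fraction of the alternate allele (matching the y-label set just below) and an
# unbiased site sits on the 0.5 reference line drawn just above. The asymmetric error bars span the 25th to 75th
# percentile of the flipped values within each indel-length bin, with the marker at the mean, or at the median
# when --use_median is given.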
a0.set(ylabel='Fraction of alternate allele') 167 | a0.grid(True, linestyle='--', alpha=0.3) 168 | 169 | a1.set(xlabel='Insertion (+) or deletion (-) length') 170 | a1.set(ylabel='# of variants') 171 | 172 | # Increase bar width 173 | width = 0.65 # Changed from 0.5 to 0.8 for thicker bars 174 | bars = a1.bar(t, list_incidents, align='center', width=width, log=True, linewidth=1) 175 | a1.set_ylim([1, max(list_incidents)*5]) 176 | 177 | # Create x-ticks only for multiples of 5 and boundaries 178 | xticks = [] 179 | xticklabels = [] 180 | for x in range(-len_bd, len_bd + 1): 181 | if x == -len_bd or x == len_bd or x % 5 == 0: 182 | xticks.append(x) 183 | if x == -len_bd: 184 | xticklabels.append(f"≤-{len_bd}") 185 | elif x == len_bd: 186 | xticklabels.append(f"≥{len_bd}") 187 | else: 188 | xticklabels.append(str(x)) 189 | 190 | a1.set_xticks(xticks) 191 | a1.set_xticklabels(xticklabels) # Remove rotation 192 | 193 | # Use the same x-ticks for the upper plot 194 | a0.set_xticks(xticks) 195 | a0.set_xticklabels(xticklabels) # Remove rotation 196 | 197 | addlabels(t, list_incidents, len_bd) 198 | a1.grid(axis='y', linestyle='--', alpha=0.3) 199 | 200 | a0.set_xlim(a1.get_xlim()) 201 | 202 | # Adjust subplot spacing 203 | f.subplots_adjust(hspace=0.1) # Keep minimal space between plots 204 | 205 | plt.savefig(output_name + '.indel_balance.pdf', bbox_inches='tight', dpi=300) 206 | 207 | 208 | if __name__ == "__main__": 209 | parser = argparse.ArgumentParser() 210 | parser.add_argument('-lr', '--list_report', nargs='+', required=True, help='the list of assignment bias report') 211 | parser.add_argument('-ln', '--list_name', nargs='+', required=True, help='the second bias report') 212 | parser.add_argument('-vcf', '--vcf_report', help='the vcf report for the bias report regions') 213 | parser.add_argument('-bd', '--boundary', type=int, default=40, help='the boundary indel lengths extend from 0') 214 | parser.add_argument('-map', '--flag_mapping', action='store_true', help='show the mapping rather than local result') 215 | parser.add_argument('-real', '--flag_real', action='store_true', help='specify if the report contains no simulation information') 216 | parser.add_argument('-out', '--output_name', help="output file name") 217 | parser.add_argument('-median', '--use_median', action='store_true', 218 | help='Use median instead of mean for central tendency') 219 | args = parser.parse_args() 220 | 221 | list_report = args.list_report 222 | list_name = args.list_name 223 | fn_vcf = args.vcf_report 224 | boundary = args.boundary 225 | flag_map = args.flag_mapping 226 | flag_real = args.flag_real 227 | output_name = args.output_name 228 | if output_name == None: 229 | output_name = list_name[0] 230 | 231 | assert len(list_report) == len(list_name), "Number of bias_report and bias names are different." 
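# Example invocation (descriptive comment; the report and VCF names are placeholders):
#   python3 indel_balance_plot.py -lr runA.bias runB.bias -ln bowtie2 bwamem \
#       -vcf sample.het.vcf.gz -bd 20 -map -out sample.run1.sim
# With simulated reports the first curve is the simulated truth, and -map switches the remaining curves from the
# context-aware assignment balance to the mapping balance; add -real for reports generated from real reads,
# which carry no simulation columns.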
232 | 233 | f_vcf = pysam.VariantFile(fn_vcf) 234 | # read the bias report 235 | list_bias_report = [] 236 | for fn_assign_report in list_report: 237 | assign_report = read_bias_report(fn_assign_report) 238 | list_bias_report.append(assign_report) 239 | 240 | # fetch the SNP balance information 241 | list_balance_SNP = [] 242 | for assign_SNP, assign_gap in list_bias_report: 243 | balance_SNP = calculate_SNP_balance(assign_SNP, flag_real) 244 | list_balance_SNP.append(balance_SNP) 245 | 246 | if flag_real: # no simulation of mapping information provided 247 | list_plot_name = list_name #[name + '(real)' for name in list_name] 248 | 249 | # fetch the gap balance information 250 | list_balance_delete = [] 251 | list_balance_insert = [] 252 | for assign_SNP, assign_gap in list_bias_report: 253 | balance_insert, balance_delete = calculate_gap_balance(assign_gap, f_vcf, boundary, 5) 254 | list_balance_insert.append(balance_insert) 255 | list_balance_delete.append(balance_delete) 256 | 257 | balance_SNP = list_balance_SNP 258 | balance_delete = list_balance_delete 259 | balance_insert = list_balance_insert 260 | else: # to plot the simulated reads, the first entry is the simulated balance information, then we can choose map or local_assignment 261 | flag_choice = 2 262 | gap_choice = 5 263 | list_plot_name = ["simulated"] 264 | if flag_map: 265 | flag_choice = 1 266 | gap_choice = 10 267 | list_plot_name += [name + '(map)' for name in list_name] 268 | else: 269 | list_plot_name += [name + '(assign)' for name in list_name] 270 | 271 | # fetch the gap balance information 272 | balance_insert, balance_delete = calculate_gap_balance(list_bias_report[0][1], f_vcf, boundary, 14) # getting the simulated information 273 | list_balance_delete = [balance_delete] 274 | list_balance_insert = [balance_insert] 275 | for assign_SNP, assign_gap in list_bias_report: 276 | balance_insert, balance_delete = calculate_gap_balance(assign_gap, f_vcf, boundary, gap_choice) 277 | list_balance_insert.append(balance_insert) 278 | list_balance_delete.append(balance_delete) 279 | 280 | balance_SNP = [list_balance_SNP[0][0]] + [balance[flag_choice] for balance in list_balance_SNP] 281 | balance_delete = list_balance_delete 282 | balance_insert = list_balance_insert 283 | 284 | 285 | # get the incident numbers of the indels 286 | list_incidents = [len(balance) for balance in list_balance_delete[0]][::-1] + [len(list_balance_SNP[0][0])] + [len(balance) for balance in list_balance_insert[0]] 287 | 288 | plot_balance(balance_delete, balance_SNP, balance_insert, output_name, boundary, list_incidents, list_plot_name, args.use_median) 289 | 290 | -------------------------------------------------------------------------------- /biastools/golden_graph.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | 6 | import math 7 | import random 8 | import numpy as np 9 | 10 | 11 | colors = ["#bce4ff", "#8bd0fe", "#59bcfc", "#0099fc", "#0086dd", "#006bb1", "#004a7a", "#002740"] 12 | colors = ["#f2dad5", "#e8bfc1", "#d9a4b2", "#c78ba6", "#aa719a", "#8b5b89", "#634271", "#3c2a4f"] 13 | 14 | def map_mapq_to_size(mapq): 15 | if mapq >= 40: 16 | return 0 17 | elif mapq >= 30: 18 | return 1 19 | elif mapq >= 20: 20 | return 2 21 | elif mapq >= 10: 22 | return 3 23 | elif mapq >= 5: 24 | return 4 25 | elif mapq >= 3: 26 | return 5 27 | elif mapq >= 1: 28 | return 6 29 | return 7 30 | 31 | labels = ['>40', 
'30~40', '20~30', '10~20', '5~10', '3~5', '1~3', '<1'] 32 | 33 | def map_color( 34 | var:float 35 | )-> int: 36 | """ 37 | color_code = int(var/2) 38 | if color_code > 20: 39 | color_code = 20 40 | return color_code 41 | """ 42 | if var > 0.5: 43 | return 0 44 | elif var > 0.3: 45 | return 1 46 | elif var > 0.1: 47 | return 2 48 | elif var > 0.05: 49 | return 3 50 | elif var > 0.01: 51 | return 4 52 | else: 53 | return 5 54 | 55 | p_labels = ['>0.5', '0.3~0.5', '0.1~0.3', '0.05~0.1', '0.01~0.05', '<0.01'] 56 | 57 | def map_num_to_size(num): 58 | if num == 0: 59 | return 0 60 | elif num <= 3: 61 | return 1 62 | elif num <= 5: 63 | return 2 64 | elif num <= 10: 65 | return 3 66 | elif num <= 15: 67 | return 4 68 | elif num <= 20: 69 | return 5 70 | elif num <= 30: 71 | return 6 72 | return 7 73 | 74 | n_labels = ['0', '1~3', '4~6', '7~10', '11~15', '16~20', '21~30', '>30'] 75 | 76 | def map_waste_to_color(value): 77 | return int(math.ceil(value*8)) 78 | 79 | 80 | def plot_golden(out_prefix, df_use): 81 | 82 | # Add columns 83 | df_use['WASTE_INFO'] = (df_use['OTHER'])/(df_use['NUM_READS']+0.01) 84 | mapQ = list(df_use['AVG_MAPQ']) 85 | pValue = list(df_use['EVEN_P_VALUE']) 86 | 87 | sp = pd.DataFrame() 88 | sp['ASSIGNMENT BALANCE'] = list(df_use['BALANCE']) 89 | sp['MAPPING BALANCE'] = list(df_use['MAP_BALANCE']) 90 | sp['SIMULATION BALANCE'] = list(df_use['SIM_BALANCE']) 91 | sp.head() 92 | 93 | mapped_mapQ = [map_mapq_to_size(q) for q in mapQ] 94 | mapped_p = [map_color(p) for p in pValue] 95 | waste_value = [map_waste_to_color(q) for q in list(df_use['WASTE_INFO'])] 96 | sp['Avg_MapQ_code'] = mapped_mapQ 97 | sp['Even_p_value'] = mapped_p 98 | sp['Waste_value'] = waste_value 99 | sp['Assign_other'] = [map_num_to_size(n) for n in list(df_use['OTHER']) ] 100 | sp['Map_other'] = [map_num_to_size(n) for n in list(df_use['MIS_MAP']) ] 101 | sp['MapQ'] = list(mapQ) 102 | 103 | #================== color map ==================== 104 | set_mapQ_value = set(sp['Avg_MapQ_code']) 105 | color_mapQ = [] 106 | for idx in sorted(set_mapQ_value): 107 | color_mapQ.append(colors[idx]) 108 | 109 | set_misMap_value = set(sp['Map_other']) 110 | color_misMap = [] 111 | for idx in sorted(set_misMap_value): 112 | color_misMap.append(colors[idx]) 113 | 114 | #=========================== standard ref_bias to read_distribute plot ============================ 115 | print("Ploting the Standard Ref Bias Plot!") 116 | plt.clf() 117 | ax = sns.scatterplot(y="ASSIGNMENT BALANCE", x="MAPPING BALANCE", hue = "Avg_MapQ_code", data = sp, palette=sns.color_palette(color_mapQ)) 118 | #ax = sns.scatterplot(y="ASSIGNMENT BALANCE", x="MAPPING BALANCE", hue = "Even_p_value", data = sp)#hue="size", size="size", data=tips) 119 | #ax = sns.scatterplot(y="ASSIGNMENT BALANCE", x="MAPPING BALANCE", hue = "Waste_value", data = sp)#hue="size", size="size", data=tips) 120 | h, l = ax.get_legend_handles_labels() 121 | plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(0.92, 1), loc=2, borderaxespad=0., framealpha=1) 122 | plt.xlim([0,1]) 123 | plt.ylim([0,1]) 124 | 125 | FN_FIG = out_prefix + '.diff-assign2map_dot.pdf' 126 | plt.savefig(FN_FIG) 127 | 128 | #=========================== golden to read_distribute plot ============================ 129 | print("Ploting the Golden distribution Plot!") 130 | plt.clf() 131 | ax = sns.scatterplot(x="SIMULATION BALANCE", y="MAPPING BALANCE", hue = "Avg_MapQ_code", data = sp, palette=sns.color_palette(color_mapQ))#hue="size", size="size", data=tips) 132 | h, l = ax.get_legend_handles_labels() 133 
| plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(0.92, 1), loc=2, borderaxespad=0., framealpha=1) 134 | plt.xlim([0,1]) 135 | plt.ylim([0,1]) 136 | 137 | FN_FIG = out_prefix + '.diff-sim2map_dot.pdf' 138 | plt.savefig(FN_FIG) 139 | 140 | #=========================== golden to ref_bias plot ============================ 141 | plt.clf() 142 | ax = sns.scatterplot(x="SIMULATION BALANCE", y="ASSIGNMENT BALANCE", hue = "Avg_MapQ_code", data = sp, palette=sns.color_palette(color_mapQ))#hue="size", size="size", data=tips) 143 | h, l = ax.get_legend_handles_labels() 144 | plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(0.92, 1), loc=2, borderaxespad=0., framealpha=1) 145 | plt.xlim([0,1]) 146 | plt.ylim([0,1]) 147 | 148 | FN_FIG = out_prefix + '.diff-sim2assign_dot.pdf' 149 | plt.savefig(FN_FIG) 150 | 151 | #=========================== all merged plot ============================ 152 | print("Ploting the Merged golden distribution Plot!") 153 | plt.clf() 154 | sp['Normalized Assignment Balance'] = list(df_use['BALANCE']-df_use['SIM_BALANCE']) # the average map_q score 155 | sp['Normalized Mapping Balance'] = list(df_use['MAP_BALANCE']-df_use['SIM_BALANCE']) # the average map_q score 156 | ax = sns.jointplot(x="Normalized Mapping Balance", y="Normalized Assignment Balance", hue = "Avg_MapQ_code", data = sp, \ 157 | xlim=(-0.8,0.8), ylim=(-0.8,0.8), palette=sns.color_palette(color_mapQ)) 158 | #ax = sns.jointplot(x="Normalized Mapping Balance", y="Normalized Assignment Balance", hue = "Map_other", data = sp, \ 159 | # xlim=(-0.8,0.8), ylim=(-0.8,0.8), palette=sns.color_palette(color_misMap)) 160 | ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.2) 161 | ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.2) 162 | ax.ax_joint.get_legend().remove() 163 | h, l = ax.ax_joint.get_legend_handles_labels() 164 | plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(0, 0), loc='lower left', borderaxespad=0.2) 165 | #plt.legend(h, n_labels, title="Mismapped Gain#", bbox_to_anchor=(1,0), loc='lower right', borderaxespad=0.2) 166 | 167 | FN_FIG = out_prefix + '.category.MapQ.pdf' 168 | plt.savefig(FN_FIG) 169 | 170 | #======================= allelic difference plot ========================= 171 | plt.clf() 172 | list_ref_diff = list(df_use['REF']-df_use['SIM_REF']) 173 | list_alt_diff = list(df_use['ALT']-df_use['SIM_ALT']) 174 | for idx in range(len(list_ref_diff)): 175 | list_ref_diff[idx] += random.uniform(-0.3, 0.3) # scatter plot 176 | list_alt_diff[idx] += random.uniform(-0.3, 0.3) 177 | sp['Ref# - Simulation Ref#'] = list_ref_diff 178 | sp['Alt# - Simulation Alt#'] = list_alt_diff 179 | 180 | #ax = sns.jointplot(x="Ref# - Simulation Ref#", y="Alt# - Simulation Alt#", hue = "Even_p_value", data = sp, xlim=(-20,20), ylim=(-20,15)) 181 | ax = sns.jointplot(x="Ref# - Simulation Ref#", y="Alt# - Simulation Alt#", hue = "Avg_MapQ_code", data = sp, xlim=(-30,30), ylim=(-30,15), palette=sns.color_palette(color_mapQ)) 182 | ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.1) 183 | ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.1) 184 | ax.ax_joint.get_legend().remove() 185 | h, l = ax.ax_joint.get_legend_handles_labels() 186 | #plt.legend(h, p_labels, title="Even P Value", bbox_to_anchor=(0,1), loc='upper right') 187 | plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(1,1), loc='upper right') 188 | 189 | FN_FIG = out_prefix + '.diff2-assign2sim.pdf' 190 | plt.savefig(FN_FIG) 191 | 192 | 193 | 
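# Descriptive comment (added for clarity): the random.uniform(-0.3, 0.3) jitter applied above is for display only,
# so that the many sites sharing the same integer read-count difference do not collapse onto a single point in the
# joint plot; the underlying REF/ALT counts in df_use stay untouched.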
plt.clf() 194 | #ax = sns.jointplot(x="Ref# - Simulation Ref#", y="Alt# - Simulation Alt#", hue = "Map_other", data = sp, xlim=(-30,30), ylim=(-30,15), palette=sns.color_palette(color_misMap)) 195 | #ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.1) 196 | #ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.1) 197 | #ax.ax_joint.get_legend().remove() 198 | #h, l = ax.ax_joint.get_legend_handles_labels() 199 | #plt.legend(h, n_labels, title="Mismapped Gain#", bbox_to_anchor=(1,1), loc='upper right') 200 | # 201 | #FN_FIG = out_prefix + '-read_diff_allelic.mismap.pdf' 202 | #plt.savefig(FN_FIG) 203 | #====================== mapping difference plot ========================= 204 | plt.clf() 205 | list_m_ref_diff = list(df_use['MAP_REF']-df_use['SIM_REF']) 206 | list_m_alt_diff = list(df_use['MAP_ALT']-df_use['SIM_ALT']) 207 | for idx in range(len(list_m_ref_diff)): 208 | list_m_ref_diff[idx] += random.uniform(-0.3, 0.3) # scatter plot 209 | list_m_alt_diff[idx] += random.uniform(-0.3, 0.3) 210 | sp['Mapping Ref# - Simulation Ref#'] = list_m_ref_diff 211 | sp['Mapping Alt# - Simulation Alt#'] = list_m_alt_diff 212 | 213 | #ax = sns.jointplot(x="Mapping Ref# - Simulation Ref#", y="Mapping Alt# - Simulation Alt#", hue = "Even_p_value", data = sp, xlim=(-20,20), ylim=(-20,15)) 214 | ax = sns.jointplot(x="Mapping Ref# - Simulation Ref#", y="Mapping Alt# - Simulation Alt#", hue = "Avg_MapQ_code", data = sp, xlim=(-30,30), ylim=(-30,15), palette=sns.color_palette(color_mapQ)) 215 | ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.1) 216 | ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.1) 217 | ax.ax_joint.get_legend().remove() 218 | h, l = ax.ax_joint.get_legend_handles_labels() 219 | #plt.legend(h, p_labels, title="Even P Value", bbox_to_anchor=(0,1), loc='upper right') 220 | plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(1,1), loc='upper right') 221 | 222 | FN_FIG = out_prefix + '.diff2-map2sim.pdf' 223 | plt.savefig(FN_FIG) 224 | 225 | 226 | plt.clf() 227 | #ax = sns.jointplot(x="Mapping Ref# - Simulation Ref#", y="Mapping Alt# - Simulation Alt#", hue = "Map_other", data = sp, xlim=(-30,30), ylim=(-30,15), palette=sns.color_palette(color_misMap)) 228 | #ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.1) 229 | #ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.1) 230 | #ax.ax_joint.get_legend().remove() 231 | #h, l = ax.ax_joint.get_legend_handles_labels() 232 | #plt.legend(h, n_labels, title="Mismapped Gain#", bbox_to_anchor=(1,1), loc='upper right') 233 | # 234 | #FN_FIG = out_prefix + '-read_diff_mapping.mismap.pdf' 235 | #plt.savefig(FN_FIG) 236 | #======================== read loss-gain plot =========================== 237 | plt.clf() 238 | array_m_ref_diff = -np.array(df_use['MAP_REF']-df_use['SIM_REF']) 239 | array_m_alt_diff = -np.array(df_use['MAP_ALT']-df_use['SIM_ALT']) 240 | list_read_loss = list(np.where(array_m_ref_diff < 0, 0, array_m_ref_diff) + np.where(array_m_alt_diff < 0, 0, array_m_alt_diff)) 241 | list_read_gain = list(df_use["MIS_MAP"]) 242 | for idx in range(len(list_m_ref_diff)): 243 | list_read_loss[idx] += random.uniform(0,0.5) # scatter plot 244 | list_read_gain[idx] += random.uniform(0,0.5) 245 | sp["Loss of Read (Ref + Alt)"] = list_read_loss 246 | sp["Gain of Read"] = list_read_gain 247 | 248 | #ax = sns.jointplot(x="Loss of Read (Ref + Alt)", y="Gain of Read", hue = "Avg_MapQ_code", data = sp, xlim=(0,30), ylim=(0,30), 
palette=sns.color_palette(color_mapQ)) 249 | #ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.1) 250 | #ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.1) 251 | #ax.ax_joint.get_legend().remove() 252 | #h, l = ax.ax_joint.get_legend_handles_labels() 253 | #plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(1,1), loc='upper right') 254 | # 255 | #FN_FIG = out_prefix + '-loss_gain.pdf' 256 | #plt.savefig(FN_FIG) 257 | 258 | plt.close("all") 259 | sns.color_palette() 260 | ref_loss = list(df_use['SIM_REF']-df_use['MAP_REF']) 261 | alt_loss = list(df_use['SIM_ALT']-df_use['MAP_ALT']) 262 | read_gain = list(df_use["MIS_MAP"]) 263 | hist_data = pd.DataFrame() 264 | hist_data['loss/gain in a variant'] = read_gain + ref_loss + alt_loss 265 | hist_data['category'] = ["MisMap gain"]*len(read_gain) + ["Ref loss"]*len(ref_loss) + ["Alt loss"]*len(alt_loss) 266 | 267 | bin_num = max(hist_data['loss/gain in a variant']) - min(hist_data['loss/gain in a variant']) 268 | plt.clf() 269 | ax = sns.displot(hist_data, x="loss/gain in a variant", bins=bin_num, hue="category", log_scale=(False,True), element="step") 270 | ax.set(ylabel="occurence") 271 | FN_FIG = out_prefix + '.loss_gain_occurence.pdf' 272 | plt.savefig(FN_FIG) 273 | 274 | #plt.clf() 275 | #ax = sns.displot(hist_data, x="loss/gain in a variant", bins=int(bin_num/3), hue="category", log_scale=(False,True), multiple="dodge") 276 | #ax.set(ylabel="occurence") 277 | #FN_FIG = out_prefix + '-loss_gain_occurence.dodge.pdf' 278 | #plt.savefig(FN_FIG) 279 | 280 | 281 | 282 | 283 | if __name__ == "__main__": 284 | parser = argparse.ArgumentParser() 285 | parser.add_argument('-mb', '--bias_report', help='bias report, must contain the golden information') 286 | parser.add_argument('-qt', '--quality_threshold', help='threshold that filtered the sites with avg_mapQ below the threshold', type=int, default=0) 287 | parser.add_argument('-out', '--output_prefix', help='the prefix for the output plots and report') 288 | args = parser.parse_args() 289 | 290 | fn_bias = args.bias_report 291 | mapQ_th = args.quality_threshold 292 | output_prefix = args.output_prefix 293 | if output_prefix == None: 294 | output_prefix = fn_bias 295 | 296 | df_use = pd.read_csv(fn_bias, sep='\t') 297 | df_use = df_use[df_use['AVG_MAPQ'] >= mapQ_th] 298 | df_use.head() 299 | 300 | plot_golden(output_prefix, df_use) 301 | 302 | -------------------------------------------------------------------------------- /biastools/scanning_bias.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import gzip 3 | 4 | import numpy as np 5 | import os 6 | import argparse 7 | import pickle 8 | 9 | 10 | def output_wig( 11 | output_name :str, 12 | data_name :str, 13 | list_data :list 14 | ) -> None: 15 | """ 16 | output single wig file 17 | """ 18 | f_o = gzip.open(output_name, 'wt') 19 | for array_info in list_data: 20 | ref_name, wig_start, array_wig = array_info 21 | wig_end = wig_start + len(array_wig) 22 | 23 | f_o.write("browser position " + ref_name + ":" + str(wig_start) + "-" + str(wig_end) + '\n') 24 | f_o.write("browser hide all\n") 25 | f_o.write("track type=wiggle_0 name=\"" + data_name + "\" description=\"variableStep format\" visibility=hide autoScale=on" + \ 26 | "color=50,150,255 graphType=points priority=10\n") 27 | f_o.write("variableStep chrom=" + ref_name + '\n') 28 | for idx, depth in enumerate(array_wig): 29 | f_o.write(str(wig_start+idx) + ' ' + str(round(depth, 2)) + '\n') 30 
| f_o.close() 31 | 32 | 33 | 34 | def report_wig( 35 | fn_output :str, 36 | dict_3D_measures :dict, 37 | ) -> None: 38 | """ 39 | output the wig format for read_depth, var_density, and dip_density 40 | this whole process take times 41 | """ 42 | # list_info is composed of array_RD, array_VD, array_ND, array_score 43 | # (ref_name, region_begin, array_info) 44 | list_info = [[],[],[],[]] 45 | #ref_name, region_begin, array_read_depth, array_var_density, array_dip_density, array_score, array_score_sum = wig_info 46 | for ref_name, dict_array in dict_3D_measures.items(): 47 | for start_pos, array_info in dict_array.items(): 48 | array_RD, array_VD, array_ND, array_score = array_info 49 | list_info[0].append((ref_name, start_pos, array_RD)) 50 | list_info[1].append((ref_name, start_pos, array_VD)) 51 | list_info[2].append((ref_name, start_pos, array_ND)) 52 | list_info[3].append((ref_name, start_pos, array_score)) 53 | 54 | output_wig( 55 | output_name = (fn_output + '.read_depth.wig.gz'), 56 | data_name = 'avg_read_depth', 57 | list_data = list_info[0] 58 | ) 59 | output_wig( 60 | output_name = (fn_output + '.var_density.wig.gz'), 61 | data_name = 'var_density', 62 | list_data = list_info[1] 63 | ) 64 | output_wig( 65 | output_name = (fn_output + '.dip_density.wig.gz'), 66 | data_name = 'non_diploid_density', 67 | list_data = list_info[2] 68 | ) 69 | output_wig( 70 | output_name = (fn_output + '.score_sum.wig.gz'), 71 | data_name = '3D_scoring_sum', 72 | list_data = list_info[3] 73 | ) 74 | 75 | 76 | def scanning_bias( 77 | f_gvcf :pysam.VariantRecord 78 | ) -> dict: 79 | """ 80 | Scanning the fn_gvcf to find the region with 81 | - high read depth, 82 | - high density of variants, or 83 | - non diploid evidence. 84 | return the raw numbers 85 | """ 86 | # Extract the read_depth and variant informations 87 | ref_name = None # record the reference name 88 | last_pos = -2 # record the last mpileup position 89 | start_pos = None # record the starting position of each region 90 | dict_ref_info = {} 91 | for var in f_gvcf: 92 | if ref_name != var.contig: # new chromosome 93 | ref_name = var.contig 94 | dict_ref_info[ref_name] = {} 95 | 96 | start_pos = var.start 97 | dict_ref_info[ref_name][start_pos] = {'depth':[], 'var':[]} 98 | elif var.start > last_pos + 1: # the same chromsome, new position 99 | start_pos = var.start 100 | dict_ref_info[ref_name][start_pos] = {'depth':[], 'var':[]} 101 | elif var.start == last_pos: # duplicate position, pop the last read depth info 102 | dict_ref_info[ref_name][start_pos]['depth'].pop() 103 | last_pos = var.start 104 | 105 | ref_name = var.contig 106 | total_depth = var.samples[0]['DP'] 107 | 108 | # store the read depth 109 | dict_ref_info[ref_name][start_pos]['depth'].append((var.start, total_depth)) 110 | 111 | alt_depth = None 112 | if var.samples[0].get('AD'): 113 | alt_depth = list(var.samples[0]['AD']) 114 | else: 115 | alt_depth = [0, total_depth] 116 | 117 | # calculate diploid score 118 | list_alleles = list(var.alleles) 119 | if sum(alt_depth) != total_depth: # often happens at indels 120 | alt_depth.append(total_depth - sum(alt_depth)) 121 | list_alleles.append('Others') 122 | list_alt_depth = sorted(alt_depth, reverse=True) 123 | #max_alt_depth = list_alt_depth[0] 124 | num_var = 0 125 | for idx, depth in enumerate(list_alt_depth): 126 | if depth > total_depth*15/100: # consider as variant, exclude the 0,0 case 127 | num_var = idx + 1 128 | else: 129 | break 130 | if num_var > 1: 131 | nonDip_flag = False 132 | if num_var > 2 or 
list_alt_depth[1]*2 < list_alt_depth[0]: 133 | nonDip_flag = True 134 | dict_ref_info[ref_name][start_pos]['var'].append([var.start, total_depth, list_alt_depth[:num_var], nonDip_flag, \ 135 | alt_depth, list_alleles]) 136 | # -> for debug purpose 137 | return dict_ref_info 138 | 139 | 140 | def boundary_compensate( 141 | target_array :np.array, 142 | window_size :int 143 | ) -> np.array: 144 | """ 145 | compensate for padding zeros 146 | """ 147 | if len(target_array) < window_size: 148 | return target_array 149 | 150 | half_window = int(window_size/2) 151 | # compensate left side 152 | for idx in range(half_window): 153 | target_array[idx] *= (window_size / (half_window+idx)) 154 | # compensate right side 155 | for idx in range(-1, -half_window-1, -1): 156 | target_array[idx] *= (window_size / (half_window-idx-1)) 157 | return target_array 158 | 159 | 160 | def calculate_measures( 161 | dict_ref_info :dict, 162 | window_size :int=400 163 | ) -> dict: 164 | """ 165 | Take the raw data and calculate 166 | - the moving average of read_depth 167 | - over window number of variants 168 | - over window number of non_diploid site 169 | """ 170 | # Two parameters we have: 171 | # list_depth 172 | # list_var_sites 173 | 174 | # Analyze the density of the variants 175 | dict_3D_measures = {} 176 | for ref_name, dict_start_pos in dict_ref_info.items(): 177 | dict_3D_measures[ref_name] = {} 178 | for start_pos, dict_var_info in dict_start_pos.items(): 179 | list_depth = dict_var_info['depth'] 180 | list_var_sites = dict_var_info['var'] 181 | 182 | half_window = round(window_size/2) 183 | # Counting average readepth over the window (moving average) 184 | region_begin = list_depth[0][0] 185 | region_end = list_depth[-1][0] + 1 186 | assert(start_pos == region_begin) 187 | array_read_depth = np.zeros(region_end - region_begin + window_size) 188 | for site_info in list_depth: 189 | index = site_info[0] - region_begin 190 | depth = site_info[1] 191 | array_read_depth[index:index+window_size] += depth 192 | array_read_depth /= window_size 193 | array_read_depth = array_read_depth[half_window:-half_window] 194 | array_read_depth = boundary_compensate(array_read_depth, window_size) 195 | 196 | # Calculate variant density over the window 197 | array_var_density = np.zeros(region_end - region_begin + window_size) 198 | array_dip_density = np.zeros(region_end - region_begin + window_size) 199 | for site_info in list_var_sites: 200 | index = site_info[0] - region_begin 201 | nonDip_flag = site_info[3] 202 | 203 | array_var_density[index:index+window_size] += 1 204 | if nonDip_flag: 205 | array_dip_density[index:index+window_size] += 1 206 | array_var_density = array_var_density[half_window:-half_window] 207 | array_dip_density = array_dip_density[half_window:-half_window] 208 | #array_var_density = boundary_compensate(array_var_density, window_size) 209 | #array_dip_density = boundary_compensate(array_dip_density, window_size) 210 | 211 | dict_3D_measures[ref_name][region_begin] = [array_read_depth, array_var_density, array_dip_density] 212 | return dict_3D_measures 213 | 214 | 215 | def link_bias_region_and_report( 216 | array_score :np.array, 217 | region_begin :int, 218 | ref_name :str, 219 | f_ob , 220 | f_os , 221 | threshold_1 :int=3, 222 | threshold_2 :int=5, 223 | link_dist :int=1000 224 | ) -> tuple: 225 | """ 226 | Find and link the bias region according to thresholds 227 | report files: 228 | - bed file: bias region 229 | - bed file: suspicious region 230 | - csv file: detailed report of bias and 
suspicious region 231 | """ 232 | list_region = [] 233 | pos_start = -1 234 | pos_stop = -link_dist -1 235 | for idx, score in enumerate(array_score): 236 | if score > threshold_1: 237 | if idx > pos_stop + link_dist: 238 | list_region.append((pos_start, pos_stop+1)) 239 | #print(idx, pos_start, pos_stop+1) 240 | pos_start = idx 241 | pos_stop = idx 242 | else: 243 | pos_stop = idx 244 | if len(list_region) == 0 or list_region[-1] != (pos_start, pos_stop+1): 245 | list_region.append((pos_start, pos_stop+1)) 246 | list_region = list_region[1:] # first region is decoy 247 | 248 | # report bias region and suspicious region 249 | list_bias = [] 250 | list_suspicious = [] 251 | for pos_start, pos_stop in list_region: 252 | max_score = max(array_score[pos_start:pos_stop]) 253 | avg_score = np.mean(array_score[pos_start:pos_stop]) 254 | if max_score > threshold_2: 255 | list_bias.append((pos_start + region_begin, pos_stop + region_begin, max_score, avg_score)) 256 | else: 257 | list_suspicious.append((pos_start + region_begin, pos_stop + region_begin, max_score, avg_score)) 258 | if f_ob: 259 | for segment in list_bias: 260 | f_ob.write(ref_name + '\t' + str(segment[0]) + '\t' + str(segment[1]) + '\tlen:' + str(segment[1]-segment[0]) + ',max:' + str(round(segment[2],2)) + ',avg:' + str(round(segment[3],2)) + '\n') 261 | if f_os: 262 | for segment in list_suspicious: 263 | f_os.write(ref_name + '\t' + str(segment[0]) + '\t' + str(segment[1]) + '\tlen:' + str(segment[1]-segment[0]) + ',max:' + str(round(segment[2],2)) + ',avg:' + str(round(segment[3],2)) + '\n') 264 | for segment in sorted(list_bias, key=lambda ele: (ele[1]-ele[0])*ele[2]*ele[3], reverse=True)[:5]: 265 | print(ref_name + ' ' + str(segment[0]) + ' ' + str(segment[1]) + ' len:' + str(segment[1]-segment[0]) + ',max:' + str(round(segment[2],2)) + ',avg:' + str(round(segment[3],2))) 266 | pass 267 | return list_bias, list_suspicious 268 | 269 | 270 | def calculate_3D_score( 271 | dict_3D_measures :dict, 272 | fn_out_report :str, 273 | list_statistics :list 274 | ) -> tuple: 275 | """ 276 | Take in the 3D measures and output the 3D score 277 | """ 278 | avg_RD, std_RD, avg_VD, std_VD, avg_ND, std_ND = list_statistics 279 | 280 | f_ob = open(fn_out_report + '.bias.bed', 'w') 281 | f_os = open(fn_out_report + '.suspicious.bed', 'w') 282 | f_ob.write('#chrom\tchromStart\tchromEnd\tname\n') 283 | f_os.write('#chrom\tchromStart\tchromEnd\tname\n') 284 | 285 | link_dist = 1000 286 | for ref_name, dict_region_begin in dict_3D_measures.items(): 287 | old_region_begin = -link_dist 288 | old_array = np.array([]) 289 | for region_begin, array_info in sorted(dict_region_begin.items()): 290 | array_read_depth, array_var_density, array_dip_density = array_info 291 | #print(region_begin, region_begin+len(array_info[0])) 292 | 293 | #array_score_product = np.round(array_read_depth/avg_RD) * (array_var_density/avg_VD+0.1) * (array_dip_density/avg_ND+0.1) 294 | #array_score_product = np.where(array_score_product > 30, 30, array_score_product) 295 | """ 296 | array_score_sum = np.round(array_read_depth/avg_RD) + (array_var_density/avg_VD) + (array_dip_density/avg_ND) 297 | array_score_sum = np.where(array_read_depth > avg_RD/2, array_score_sum, 0) 298 | """ 299 | array_Z_score_RD = (array_read_depth-avg_RD)/std_RD - 1 300 | array_Z_score_RD = np.where(array_Z_score_RD > 0, array_Z_score_RD, 0) 301 | array_Z_score_VD = (array_var_density-avg_VD)/std_VD 302 | array_Z_score_VD = np.where(array_Z_score_VD > 0, array_Z_score_VD, 0) 303 | array_Z_score_ND = 
(array_dip_density-avg_ND)/std_ND 304 | array_Z_score_ND = np.where(array_Z_score_ND > 0, array_Z_score_ND, 0) 305 | array_score_sum = array_Z_score_RD + array_Z_score_VD + array_Z_score_ND 306 | array_score_product = array_Z_score_RD * (array_Z_score_VD + array_Z_score_ND) 307 | #array_score_sum = (array_read_depth-avg_RD)/std_RD 308 | #array_score_sum = (array_var_density-avg_VD)/std_VD 309 | #array_score_sum = (array_dip_density-avg_ND)/std_ND 310 | #array_score_sum = array_read_depth/avg_RD 311 | #array_score_sum = array_var_density/avg_VD 312 | 313 | #array_score_sum = np.where(array_score_sum > 0, array_score_sum, 0) 314 | #array_score_sum = np.where(array_score_sum > 30, 30, array_score_sum) 315 | #link_bias_region_and_report(array_score_sum, region_begin, ref_name, f_ob, f_os) 316 | #link_bias_region_and_report(array_score_product, region_begin, ref_name, f_ob, f_os,20,30,1000) 317 | #link_bias_region_and_report(array_score_sum, region_begin, ref_name, f_ob, f_os,3,5,link_dist) 318 | #print(old_region_begin, old_region_begin+len(old_array), region_begin) 319 | dict_3D_measures[ref_name][region_begin].append(array_score_sum) 320 | if old_region_begin + len(old_array) + link_dist > region_begin: 321 | assert(old_region_begin + len(old_array) < region_begin) 322 | # Connect 323 | diff = region_begin - old_region_begin - len(old_array) 324 | old_array = np.concatenate((old_array, np.zeros(diff), array_score_sum)) 325 | else: 326 | if old_region_begin != -1000: 327 | link_bias_region_and_report(old_array, old_region_begin, ref_name, f_ob, f_os,3,5,link_dist) 328 | #dict_3D_measures[ref_name][old_region_begin].append(old_array) 329 | old_region_begin = region_begin 330 | old_array = array_score_sum 331 | link_bias_region_and_report(old_array, old_region_begin, ref_name, f_ob, f_os,3,5,link_dist) 332 | f_ob.close() 333 | f_os.close() 334 | 335 | # report the region with low Read depth 336 | f_or = open(fn_out_report + '.lowRd.bed', 'w') 337 | f_or.write('#chrom\tchromStart\tchromEnd\tname\n') 338 | rd_thresh = min(int(avg_RD/5),10) 339 | for ref_name, dict_region_begin in dict_3D_measures.items(): 340 | global_start = [] 341 | global_stop = [] 342 | for region_begin, array_info in sorted(dict_region_begin.items()): 343 | array_read_depth, *_ = array_info 344 | 345 | bool_low = array_read_depth < rd_thresh 346 | #print(bool_low) 347 | bool_low_shift = np.concatenate(([False], bool_low))[:-1] 348 | bool_start = bool_low > bool_low_shift 349 | bool_stop = bool_low < bool_low_shift 350 | 351 | list_start = [idx+region_begin for idx, x in enumerate(bool_start) if x] 352 | list_stop = [idx+region_begin for idx, x in enumerate(bool_stop ) if x] 353 | 354 | if len(list_start) == len(list_stop): 355 | list_start.append(region_begin + len(array_read_depth)) 356 | if global_start == []: 357 | global_start = list_start 358 | global_stop = list_stop 359 | else: 360 | if list_start[0] == region_begin: 361 | global_start += list_start[1:] 362 | global_stop += list_stop 363 | else: 364 | global_stop += [region_begin-1] 365 | global_start += list_start 366 | global_stop += list_stop 367 | global_start = global_start[:-1] 368 | assert(len(global_start) == len(global_stop)) 369 | for idx in range(len(global_start)): 370 | st = global_start[idx] 371 | ed = global_stop[idx] 372 | f_or.write(ref_name + '\t' + str(st) + '\t' + str(ed) + '\tlen:' + str(ed-st) + '\n') 373 | f_or.close() 374 | 375 | 376 | def get_baseline( 377 | fn_baseline :str 378 | ) -> list: 379 | """ 380 | Take and parse the last line of 
fn_baseline 381 | """ 382 | f = open(fn_baseline, 'r') 383 | for line in f: 384 | pass 385 | f.close() 386 | _, avg_RD, std_RD, avg_VD, std_VD, avg_ND, std_ND = line.split() 387 | return [float(avg_RD), float(std_RD), float(avg_VD), float(std_VD), float(avg_ND), float(std_ND)] 388 | 389 | 390 | def calculate_avg( 391 | dict_3D_measures :dict, 392 | ): 393 | total_read_depth = np.array([]) 394 | total_var_density = np.array([]) 395 | total_dip_density = np.array([]) 396 | for ref_name, dict_array in dict_3D_measures.items(): 397 | for start_pos, array_info in dict_array.items(): 398 | array_read_depth, array_var_density, array_dip_density = array_info 399 | positive_var = array_var_density[array_var_density != 0] 400 | positive_dip = array_dip_density[array_var_density != 0] 401 | 402 | total_read_depth = np.concatenate((total_read_depth , array_read_depth)) 403 | total_var_density = np.concatenate((total_var_density, positive_var)) 404 | total_dip_density = np.concatenate((total_dip_density, positive_dip)) 405 | return [np.mean(total_read_depth), np.std(total_read_depth), np.mean(total_var_density), \ 406 | np.std(total_var_density), np.mean(total_dip_density), np.std(total_dip_density)] 407 | 408 | 409 | 410 | 411 | 412 | if __name__ == "__main__": 413 | parser = argparse.ArgumentParser() 414 | parser.add_argument('-g', '--gvcf_file', help='the gvcf file of a specific region') 415 | parser.add_argument('-w', '--window_size', help='window size for average depth, density analysis', type=int, default=400) 416 | parser.add_argument('-rd', '--read_depth', help='the average sequence read depth') 417 | parser.add_argument('-b', '--baseline', help='the baseline report generate by sample_baseline.py') 418 | parser.add_argument('-s', '--sample', action='store_true', help='sample for the baseline') 419 | parser.add_argument('-o', '--out_report', help='scanning bed file and reports') 420 | parser.add_argument('-wig', '--out_wig', help='flag for wig output', action='store_true') 421 | args = parser.parse_args() 422 | 423 | fn_gvcf = args.gvcf_file 424 | rd_thresh = args.read_depth 425 | window_size = args.window_size 426 | fn_baseline = args.baseline 427 | flag_sample = args.sample 428 | fn_out_report = args.out_report 429 | flag_wig = args.out_wig 430 | 431 | f_gvcf = pysam.VariantFile(fn_gvcf) 432 | # load or calculate the 3D measures depending on pickle file existance 433 | if os.path.exists(fn_gvcf + '.pickle'): 434 | print("Pickle file", fn_gvcf + '.pickle', 'exist, load it instead of recalculate...') 435 | f_i = open(fn_gvcf + '.pickle', 'rb') 436 | dict_3D_measures = pickle.load(f_i) 437 | f_i.close() 438 | else: 439 | print("Process the mpileup file", fn_gvcf + '...') 440 | dict_ref_info = scanning_bias(f_gvcf=f_gvcf) 441 | dict_3D_measures = calculate_measures( 442 | dict_ref_info=dict_ref_info, 443 | window_size=window_size 444 | ) 445 | print("Store the measures information as", fn_gvcf + '.pickle...') 446 | f_o = open(fn_gvcf + '.pickle', 'wb') 447 | pickle.dump(dict_3D_measures, f_o) 448 | f_o.close() 449 | 450 | # Load or calculate the baseline of the measures 451 | if fn_baseline: 452 | # avg_RD, std_RD, avg_VD, std_VD, avg_ND, std_ND 453 | list_statistics = get_baseline(fn_baseline) 454 | elif flag_sample: 455 | list_statistics = calculate_avg(dict_3D_measures) 456 | else: 457 | list_statistics = [30, 10, 0.7, 1.6, 0.3, 1.2] 458 | if rd_thresh: 459 | list_statistics[0] = rd_thresh 460 | 461 | 462 | print("Calculate 3D scoring and output bed...") 463 | 
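# Illustrative arithmetic with hypothetical numbers: using the default baseline
# list_statistics = [30, 10, 0.7, 1.6, 0.3, 1.2], a window whose average read
# depth is 75, variant density 4, and non-diploid density 2 scores
#     z_RD = (75 - 30)/10 - 1 = 3.5
#     z_VD = (4 - 0.7)/1.6  ~= 2.06
#     z_ND = (2 - 0.3)/1.2  ~= 1.42
# for a score sum of ~6.98, which clears both default thresholds (3 and 5)
# passed to link_bias_region_and_report, so such a window would fall inside a
# reported bias region.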
calculate_3D_score(dict_3D_measures, fn_out_report, list_statistics) 464 | 465 | if flag_wig: # output wig files if -ow option 466 | print("Output wig format...") 467 | report_wig( 468 | fn_output=fn_out_report, 469 | dict_3D_measures=dict_3D_measures 470 | ) 471 | -------------------------------------------------------------------------------- /biastools/ref_bi_naive.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import re 3 | import pickle 4 | import os.path 5 | from os import path 6 | import pysam 7 | import numpy as np 8 | from scipy.stats import chisquare 9 | from typing import List, Tuple, Dict, Union 10 | 11 | 12 | def chi_square_test(var_start: int, list_pos_start: List[int]) -> float: 13 | if len(list_pos_start) < 2: 14 | return 0 15 | bucket_num = 5 16 | bucket_len = int(100 / bucket_num) 17 | list_count = np.zeros(bucket_num) 18 | input_idx = np.minimum((var_start - np.array(list_pos_start)) // bucket_len, bucket_num - 1) 19 | try: 20 | np.add.at(list_count, input_idx, 1) 21 | except IndexError: 22 | print(var_start, list_pos_start) 23 | _, p_value = chisquare(list_count) 24 | return 0 if np.isnan(p_value) else p_value 25 | 26 | 27 | def get_division(num_1, num_2): 28 | if num_2 == 0: 29 | return 'nan' 30 | #return format(num_1 / (num_2+0.000001), '.4f') 31 | else: 32 | return format(num_1 / num_2, '.4f') 33 | 34 | 35 | def output_report( 36 | f_vcf :pysam.VariantFile, 37 | dict_ref_bias :dict, 38 | dict_set_conflict_vars :dict, 39 | flag_real :bool, 40 | fn_golden :str, 41 | fn_output :str 42 | ) -> None: 43 | """ 44 | Output the reference bias report to three different files: 45 | - f_all: containing all the variants 46 | - f_gap: contains only insertions and deletions 47 | - f_SNP: contains only SNPs 48 | """ 49 | if flag_real != True: 50 | with open(fn_golden, "rb") as f: 51 | dict_ref_var_name = pickle.load(f) 52 | 53 | f_all = open(fn_output, 'w') 54 | f_gap = open(fn_output + '.gap', 'w') 55 | f_SNP = open(fn_output + '.SNP', 'w') 56 | if flag_real: 57 | f_all.write("CHR\tHET_SITE\tNUM_READS\tAVG_MAPQ\tEVEN_P_VALUE\tBALANCE\tREF\tALT\tBOTH\tOTHER\tGAP\n") 58 | f_gap.write("CHR\tHET_SITE\tNUM_READS\tAVG_MAPQ\tEVEN_P_VALUE\tBALANCE\tREF\tALT\tBOTH\tOTHER\n") 59 | f_SNP.write("CHR\tHET_SITE\tNUM_READS\tAVG_MAPQ\tEVEN_P_VALUE\tBALANCE\tREF\tALT\tBOTH\tOTHER\n") 60 | else: 61 | f_all.write("CHR\tHET_SITE\tNUM_READS\tAVG_MAPQ\tEVEN_P_VALUE\tBALANCE\tREF\tALT\tBOTH\tOTHER\tMAP_BALANCE\tMAP_REF\tMAP_ALT\tMIS_MAP\tSIM_BALANCE\tSIM_REF\tSIM_ALT\tGAP\n") 62 | f_gap.write("CHR\tHET_SITE\tNUM_READS\tAVG_MAPQ\tEVEN_P_VALUE\tBALANCE\tREF\tALT\tBOTH\tOTHER\tMAP_BALANCE\tMAP_REF\tMAP_ALT\tMIS_MAP\tSIM_BALANCE\tSIM_REF\tSIM_ALT\n") 63 | f_SNP.write("CHR\tHET_SITE\tNUM_READS\tAVG_MAPQ\tEVEN_P_VALUE\tBALANCE\tREF\tALT\tBOTH\tOTHER\tMAP_BALANCE\tMAP_REF\tMAP_ALT\tMIS_MAP\tSIM_BALANCE\tSIM_REF\tSIM_ALT\n") 64 | for var in f_vcf: 65 | ref_name = var.contig 66 | hap = var.samples[0]['GT'] 67 | # Filtering all the homozygous alleles or the alleles without reference 68 | if (hap[0] != 0 and hap[1] != 0) or (hap[0] == 0 and hap[1] == 0): 69 | continue 70 | if hap[0] == 0: 71 | idx_ref, idx_alt = 0, 1 72 | else: 73 | idx_ref, idx_alt = 1, 0 74 | # Filtering the conflict vars 75 | if var.start in dict_set_conflict_vars[ref_name]: 76 | continue 77 | n_read = dict_ref_bias[ref_name][var.start]['n_read'] 78 | n_var = dict_ref_bias[ref_name][var.start]['n_var'] 79 | map_q = dict_ref_bias[ref_name][var.start]['map_q'] 80 | #p_value = 
interval_variance(var.start, dict_ref_bias[ref_name][var.start]['distribute']) 81 | p_value = chi_square_test(var.start, dict_ref_bias[ref_name][var.start]['distribute'][idx_alt]) 82 | p_value = min(p_value, chi_square_test(var.start, dict_ref_bias[ref_name][var.start]['distribute'][idx_ref])) 83 | 84 | output_string = (ref_name + '\t' + str(var.start+1) + '\t') 85 | output_string += (str(sum(n_read)) + "\t" + get_division(sum(map_q[:2]), sum(n_read[:2])) + "\t" + format(p_value, '.4f') + '\t') 86 | # n_var[0,1,2,3] = hap0, hap1, both, others 87 | output_string += get_division(n_var[idx_ref]+n_var[2]*0.5, sum(n_var[:3])) + "\t" + str(n_var[idx_ref]) + "\t" + str(n_var[idx_alt]) + "\t" + str(n_var[2]) + "\t" + str(n_var[3]) 88 | #output_string += get_division(n_var[idx_ref], sum(n_var[:2])) + "\t" + str(n_var[idx_ref]) + "\t" + str(n_var[idx_alt]) + "\t" + str(n_var[2]) + "\t" + str(n_var[3]) 89 | if flag_real != True: # Golden Information 90 | # mapping balance information 91 | output_string += "\t" + get_division(n_read[idx_ref], sum(n_read[:2])) + '\t' + str(n_read[idx_ref]) + '\t' + str(n_read[idx_alt]) + '\t' + str(n_read[2]) 92 | read_info = dict_ref_var_name[ref_name][var.start] 93 | # simulation balance information 94 | output_string += '\t' + get_division(read_info[idx_ref+2], sum(read_info[2:4])) + '\t' + str(read_info[idx_ref+2]) + '\t' + str(read_info[idx_alt+2]) 95 | 96 | if len(var.ref) == len(var.alts[ hap[idx_alt] - 1]): # length of ref is equal to length of 97 | f_all.write(output_string + '\t' + '\n') 98 | f_SNP.write(output_string + '\n') 99 | else: 100 | f_all.write(output_string + '\t' + '.\n') 101 | f_gap.write(output_string + '\n') 102 | 103 | f_all.close() 104 | f_gap.close() 105 | f_SNP.close() 106 | 107 | 108 | def hap_inside( 109 | seq_read :str, 110 | seq_hap :str, 111 | padding :int 112 | ) -> bool: 113 | """ 114 | Finding if the haplotype is in the read 115 | Also considering the boundary condition 116 | One padding side can be omitted 117 | """ 118 | if seq_hap in seq_read: 119 | return True 120 | else: 121 | len_hap = len(seq_hap) 122 | for idx in range(1,padding): 123 | # checking read left side 124 | if seq_hap[idx:] == seq_read[:len_hap - idx]: 125 | return True 126 | # checking read right side 127 | if seq_hap[:-idx] == seq_read[idx - len_hap:]: 128 | return True 129 | return False 130 | 131 | 132 | def return_locate_cigar( 133 | read_start :int, 134 | target_pos :int, 135 | cigar_tuples:tuple 136 | ) -> int: 137 | """ 138 | return the cigar value of a location 139 | according to the CIGAR string 140 | """ 141 | ref_curser = read_start -1 142 | read_curser = 0 143 | for pair_info in cigar_tuples: 144 | code, runs = pair_info 145 | if code == 0 or code == 7 or code == 8: # M or = or X 146 | ref_curser += runs 147 | if ref_curser > target_pos: 148 | return 0 149 | else: 150 | read_curser += runs 151 | elif code == 1: # I 152 | ref_curser += 1 153 | if ref_curser > target_pos: 154 | return -runs 155 | else: 156 | read_curser += runs 157 | elif code == 2: # D 158 | ref_curser += runs 159 | if ref_curser > target_pos: 160 | return runs 161 | else: 162 | read_curser += 1 163 | elif code == 4 or code == 5: # S or H, pysam already parsed 164 | pass 165 | else: 166 | print ("ERROR: unexpected cigar code in sequence") 167 | return 0 168 | 169 | 170 | def locate_by_cigar( 171 | read_start :int, 172 | target_pos :int, 173 | cigar_tuples:tuple 174 | ) -> int: 175 | """ 176 | return the location of a specific reference position in the read 177 | according to the CIGAR 
string 178 | """ 179 | ref_curser = read_start 180 | read_curser = 0 181 | for pair_info in cigar_tuples: 182 | code, runs = pair_info 183 | if code == 0 or code == 7 or code == 8: # M or = or X 184 | ref_curser += runs 185 | if ref_curser > target_pos: 186 | return read_curser + (runs - ref_curser + target_pos) 187 | else: 188 | read_curser += runs 189 | elif code == 1: # I 190 | #ref_curser += 1 191 | if ref_curser > target_pos: 192 | return read_curser 193 | else: 194 | read_curser += runs 195 | elif code == 2: # D 196 | ref_curser += runs 197 | if ref_curser > target_pos: 198 | return read_curser 199 | #else: 200 | # read_curser += 1 201 | elif code == 4 or code == 5: # S or H, pysam already parsed 202 | pass 203 | else: 204 | print ("ERROR: unexpected cigar code in sequence") 205 | return read_curser 206 | 207 | 208 | def match_to_hap( 209 | seq_name :str, # for debug 210 | read_start :int, 211 | read_end :int, 212 | var_start :int, 213 | seq_read :str, 214 | seq_hap :str, 215 | cigar_tuples:tuple, 216 | padding :int, 217 | l_min_req :int, 218 | r_min_req :int, 219 | start_flag :bool=True 220 | ) -> int: 221 | """ 222 | 1. Find the matching point of the variant on the read 223 | 2. Extend the padding on the read 224 | 3. compare the read to haplotype sequences 225 | """ 226 | if read_start > var_start: # Not cover 227 | return -1 228 | elif read_end < var_start: # Not cover 229 | return -1 230 | 231 | # locating the variant site on the read 232 | r_start = locate_by_cigar( 233 | read_start=read_start, 234 | target_pos=var_start, 235 | cigar_tuples=cigar_tuples 236 | ) 237 | 238 | # Matching 239 | if start_flag: # From var.start 240 | l_bound = r_start - padding 241 | r_bound = l_bound + len(seq_hap) 242 | else: # From var.stop 243 | r_bound = r_start + padding 244 | l_bound = r_bound - len(seq_hap) 245 | 246 | min_match = 0 # minimum match length 247 | if l_bound < 0: 248 | seq_hap = seq_hap[-l_bound:] 249 | l_bound = 0 250 | min_match = r_min_req # minimum len to cover variant 251 | if r_bound > len(seq_read): 252 | seq_hap = seq_hap[:len(seq_read)-r_bound] 253 | r_bound = len(seq_read) 254 | if min_match != 0: 255 | print("WARNING! 
Both l_bound and r_bound exceed the read!!") 256 | min_match = l_min_req # minimum len to cover variant 257 | if r_bound - l_bound < min_match: 258 | return -1 # Not cover 259 | if seq_read[l_bound:r_bound].upper() == seq_hap.upper(): 260 | return 1 # Match 261 | else: 262 | return 0 # Not match 263 | 264 | 265 | def compare_sam_to_haps( 266 | f_vcf :pysam.VariantFile, 267 | f_sam :pysam.AlignmentFile, 268 | dict_ref_alts :dict, 269 | dict_set_conflict_vars: dict, 270 | flag_real :bool, 271 | fn_golden :str, 272 | run_id :str 273 | ) -> dict: 274 | """ 275 | Input: f_sam file 276 | Output: ref bias dictionary according to variants 277 | """ 278 | if flag_real != True: 279 | with open(fn_golden, "rb") as f: 280 | dict_ref_var_name = pickle.load(f) 281 | 282 | # build up the ref bias dictionary 283 | dict_ref_var_bias = {} 284 | for ref_name in dict_ref_alts.keys(): 285 | dict_ref_var_bias[ref_name] = {} 286 | for start_pos in dict_ref_alts[ref_name]: 287 | # n_var has hap0, hap1, both, and others 288 | dict_ref_var_bias[ref_name][start_pos] = {'n_read':[0,0,0], 'n_var':[0,0,0,0], 'map_q':[0,0,0], 'distribute':[[],[],[],[]]} 289 | 290 | # parameters for pipeline design 291 | count_others = [0,0] 292 | count_both = [0,0] 293 | count_error = [0,0] 294 | count_correct = [0,0] 295 | 296 | # scanning all the read alignments 297 | dict_errors = {} 298 | for segment in f_sam: 299 | flag = segment.flag 300 | if (flag & 4): # bitwise AND 4, segment unmapped 301 | continue 302 | # aligned read information 303 | ref_name = segment.reference_name 304 | seq_name = segment.query_name 305 | flag_read_n = segment.is_read2 306 | pos_start = segment.reference_start # start position in genome coordiante, need +1 for vcf coordinate 307 | pos_end = segment.reference_end 308 | cigar_tuples = segment.cigartuples 309 | mapq = segment.mapping_quality 310 | rg_tag = segment.get_tag("RG") 311 | read_seq = segment.query_alignment_sequence # aligned sequence without SoftClip part 312 | 313 | #chr_tag, hap_tag = rg_tag.split('_') 314 | if '_' in rg_tag: 315 | chr_tag, hap_tag = rg_tag.split('_') 316 | else: 317 | chr_tag = None 318 | hap_tag = rg_tag 319 | related_vars = list(f_vcf.fetch(ref_name, pos_start, pos_end)) # list of pysam.variant 320 | #fetching the sequence in the read_seq regarding to the variant 321 | for var in related_vars: 322 | if var.start in dict_set_conflict_vars[ref_name]: # neglecting the conflict variant sites 323 | continue 324 | seq_hap0, seq_hap1, diff_hap0, diff_hap1 = dict_ref_alts[ref_name][var.start] 325 | if seq_hap0 == seq_hap1: 326 | continue 327 | 328 | if diff_hap0 !=0: # if hap0 is a gap: 329 | diff_read = return_locate_cigar( 330 | read_start=pos_start, 331 | target_pos=var.start, 332 | cigar_tuples=cigar_tuples 333 | ) 334 | if diff_read == diff_hap0: 335 | match_flag_0 = 1 336 | match_flag_1 = 0 337 | else: 338 | match_flag_0 = 0 339 | match_flag_1 = match_to_hap(seq_name, pos_start, pos_end, var.start, read_seq, seq_hap1, cigar_tuples, 0, 1, 1, True) 340 | elif diff_hap1 !=0: # if hap1 is a gap: 341 | diff_read = return_locate_cigar( 342 | read_start=pos_start, 343 | target_pos=var.start, 344 | cigar_tuples=cigar_tuples 345 | ) 346 | if diff_read == diff_hap1: 347 | match_flag_0 = 0 348 | match_flag_1 = 1 349 | else: 350 | match_flag_0 = match_to_hap(seq_name, pos_start, pos_end, var.start, read_seq, seq_hap0, cigar_tuples, 0, 1, 1, True) 351 | match_flag_1 = 0 352 | else: 353 | match_flag_0 = match_to_hap(seq_name, pos_start, pos_end, var.start, read_seq, seq_hap0, cigar_tuples, 
0, 1, 1, True) 354 | match_flag_1 = match_to_hap(seq_name, pos_start, pos_end, var.start, read_seq, seq_hap1, cigar_tuples, 0, 1, 1, True) 355 | 356 | if match_flag_0 == 1 and match_flag_1 == 1: 357 | print("Both Trouble!", seq_name, var.start, seq_hap0, seq_hap1) 358 | 359 | # 5. Assign Values 360 | if match_flag_0 == -1 and match_flag_1 == -1: 361 | continue 362 | if match_flag_0 == 1 and match_flag_1 == 1: 363 | dict_ref_var_bias[ref_name][var.start]['n_var'][2] += 1 364 | elif match_flag_0 == 1: 365 | dict_ref_var_bias[ref_name][var.start]['n_var'][0] += 1 366 | # record the starting position of each read cover the variant 367 | dict_ref_var_bias[ref_name][var.start]['distribute'][0].append(pos_start) 368 | dict_ref_var_bias[ref_name][var.start]['distribute'][2].append(pos_end) 369 | elif match_flag_1 == 1: 370 | dict_ref_var_bias[ref_name][var.start]['n_var'][1] += 1 371 | # record the starting position of each read cover the variant 372 | dict_ref_var_bias[ref_name][var.start]['distribute'][1].append(pos_start) 373 | dict_ref_var_bias[ref_name][var.start]['distribute'][3].append(pos_end) 374 | else: 375 | dict_ref_var_bias[ref_name][var.start]['n_var'][3] += 1 376 | 377 | # standard updating of read number and mapping quality 378 | if flag_real: # no golden information 379 | dict_ref_var_bias[ref_name][var.start]['n_read'][0] += 1 380 | dict_ref_var_bias[ref_name][var.start]['map_q'][0] += mapq 381 | else: 382 | if run_id != None and run_id != chr_tag: # not the same chromosome 383 | dict_ref_var_bias[ref_name][var.start]['n_read'][2] += 1 384 | dict_ref_var_bias[ref_name][var.start]['map_q'][2] += 1 385 | elif dict_ref_var_name[ref_name].get(var.start) == None: 386 | continue 387 | elif 'hapA' == hap_tag: # hapA 388 | #if seq_name in dict_ref_var_name[ref_name][var.start][0]: # check if the read name is in the golden set 389 | if (seq_name, flag_read_n) in dict_ref_var_name[ref_name][var.start][0]: # check if the read name is in the golden set 390 | dict_ref_var_bias[ref_name][var.start]['n_read'][0] += 1 391 | dict_ref_var_bias[ref_name][var.start]['map_q'][0] += mapq 392 | else: 393 | dict_ref_var_bias[ref_name][var.start]['n_read'][2] += 1 394 | dict_ref_var_bias[ref_name][var.start]['map_q'][2] += 1 395 | elif 'hapB' == hap_tag: # hapB 396 | #if seq_name in dict_ref_var_name[ref_name][var.start][1]: # check if the read name is in the golden set 397 | if (seq_name, flag_read_n) in dict_ref_var_name[ref_name][var.start][1]: # check if the read name is in the golden set 398 | dict_ref_var_bias[ref_name][var.start]['n_read'][1] += 1 399 | dict_ref_var_bias[ref_name][var.start]['map_q'][1] += mapq 400 | else: 401 | dict_ref_var_bias[ref_name][var.start]['n_read'][2] += 1 402 | dict_ref_var_bias[ref_name][var.start]['map_q'][2] += 1 403 | else: 404 | print("WARNING, there is a read without haplotype information!!") 405 | 406 | return dict_ref_var_bias 407 | 408 | 409 | def len_var_seq( 410 | var :pysam.VariantRecord, 411 | genotype:int 412 | )-> tuple : 413 | """ 414 | Switch the ref sequence according to the haplotype information 415 | """ 416 | if genotype == 0: 417 | return 0, var.ref 418 | else: 419 | alt = var.alts[genotype - 1] 420 | return len(var.ref) - len(alt), alt 421 | 422 | 423 | def variant_seq( 424 | f_vcf :pysam.VariantFile, 425 | f_fasta :pysam.FastaFile 426 | )-> tuple: # dict_set_conflict_vars, dict_var_haps, dict_cohort 427 | """ 428 | Output 429 | - dict_set_conflict_vars: the dictionary marking the overlaping variants 430 | - dict_ref_alts: 431 | in each contig: 
432 | - key: var.start 433 | - values: [varseq_hap0, varseq_hap1] 434 | # not only store the varseq but also indicating the variant length 435 | """ 436 | dict_ref_alts = {} 437 | dict_set_conflict_vars = {} 438 | for ref_name in f_fasta.references: 439 | dict_ref_alts[ref_name] = {} 440 | dict_set_conflict_vars[ref_name] = set() 441 | 442 | old_ref_name = "" 443 | for var in f_vcf: 444 | ref_name = var.contig 445 | if old_ref_name != ref_name: # changing the contig 446 | # Reset the parameters 447 | overlap0, overlap1 = 0, 0 448 | prev_start0, prev_start1 = -1, -1 449 | old_ref_name = ref_name 450 | 451 | hap_0, hap_1 = var.samples[0]['GT'] 452 | diff_hap0, var_seq0 = len_var_seq(var, hap_0) 453 | diff_hap1, var_seq1 = len_var_seq(var, hap_1) 454 | if var.start > prev_start0 + overlap0 and var.start > prev_start1 + overlap1: # checking if there are overlaps 455 | dict_ref_alts[ref_name][var.start] = [var_seq0, var_seq1, diff_hap0, diff_hap1] 456 | # hap0 457 | prev_start0 = var.start 458 | overlap0 = len(var_seq0) - 1 if (diff_hap0 == 0) else diff_hap0 459 | # hap1 460 | prev_start1 = var.start 461 | overlap1 = len(var_seq1) - 1 if (diff_hap1 == 0) else diff_hap1 462 | else: # overlapping variants are consider conflicts 463 | dict_set_conflict_vars[ref_name].add(prev_start1) 464 | dict_set_conflict_vars[ref_name].add(var.start) 465 | return dict_set_conflict_vars, dict_ref_alts 466 | 467 | 468 | 469 | 470 | 471 | if __name__ == "__main__": 472 | parser = argparse.ArgumentParser() 473 | parser.add_argument('-v', '--vcf', help='vcf file') 474 | parser.add_argument('-s', '--sam', help='sam file') 475 | parser.add_argument('-f', '--fasta', help='reference fasta file') 476 | parser.add_argument('-r', '--real_data', help='turn off hap_information warning for real data', action='store_true') 477 | parser.add_argument('-p', '--golden_pickle', help='the pickle file contain the golden information for report reference') 478 | parser.add_argument('-i', '--run_id', help='the tag for run_id, can be used to indicate for example chromosome number') 479 | parser.add_argument('-t', '--thread', help='Number of threads, not supported', type=int, default=8) 480 | parser.add_argument('-o', '--out', help='output file') 481 | args = parser.parse_args() 482 | 483 | fn_vcf = args.vcf 484 | fn_sam = args.sam 485 | fn_fasta = args.fasta 486 | flag_real = args.real_data 487 | fn_golden = args.golden_pickle 488 | fn_output = args.out 489 | run_id = args.run_id 490 | 491 | f_vcf = pysam.VariantFile(fn_vcf) 492 | f_sam = pysam.AlignmentFile(fn_sam) 493 | f_fasta = pysam.FastaFile(fn_fasta) 494 | var_chain = 25 495 | print("Start building the variant maps...") 496 | dict_set_conflict_vars, dict_ref_alts = variant_seq( 497 | f_vcf=f_vcf, 498 | f_fasta=f_fasta 499 | ) 500 | # extend conflict set 501 | for ref_name in dict_set_conflict_vars.keys(): 502 | for pos in list(dict_set_conflict_vars[ref_name]): 503 | for extend in range(pos-var_chain, pos+var_chain): 504 | dict_set_conflict_vars[ref_name].add(extend) 505 | 506 | print("Start comparing reads to the variant map...") 507 | dict_ref_bias = compare_sam_to_haps( 508 | f_vcf=f_vcf, 509 | f_sam=f_sam, 510 | dict_ref_alts=dict_ref_alts, 511 | dict_set_conflict_vars=dict_set_conflict_vars, 512 | flag_real=flag_real, 513 | fn_golden=fn_golden, 514 | run_id=run_id 515 | ) 516 | f_vcf = pysam.VariantFile(fn_vcf) 517 | print("Start output report...") 518 | output_report( 519 | f_vcf=f_vcf, 520 | dict_ref_bias=dict_ref_bias, 521 | dict_set_conflict_vars=dict_set_conflict_vars, 
522 | flag_real=flag_real, 523 | fn_golden=fn_golden, 524 | fn_output=fn_output 525 | ) 526 | 527 | 528 | -------------------------------------------------------------------------------- /biastools/consensus_vcf_map_adaptive.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | import pysam 4 | import numpy as np 5 | 6 | 7 | 8 | def len_var_seq( 9 | var :pysam.VariantRecord, 10 | genotype:int 11 | )-> tuple : 12 | """ 13 | Switch the ref sequence according to the haplotype information 14 | """ 15 | if genotype == 0: 16 | return 0, var.ref 17 | else: 18 | alt = var.alts[genotype - 1] 19 | return len(var.ref) - len(alt), alt 20 | 21 | 22 | def variant_seq( 23 | f_vcf :pysam.VariantFile, 24 | f_fasta :pysam.FastaFile 25 | )-> tuple: # dict_set_conflict_vars, dict_var_haps, dict_cohort 26 | """ 27 | Output 28 | - dict_set_conflict_vars: the dictionary marking the overlaping variants 29 | - dict_ref_alts: 30 | in each contig: 31 | - key: var.start 32 | - values: [varseq_hap0, varseq_hap1] 33 | # not only store the varseq but also indicating the variant length 34 | """ 35 | dict_ref_alts = {} 36 | dict_set_conflict_vars = {} 37 | for ref_name in f_fasta.references: 38 | dict_ref_alts[ref_name] = {} 39 | dict_set_conflict_vars[ref_name] = set() 40 | 41 | old_ref_name = "" 42 | for var in f_vcf: 43 | ref_name = var.contig 44 | if old_ref_name != ref_name: # changing the contig 45 | # Reset the parameters 46 | overlap0, overlap1 = 0, 0 47 | prev_start0, prev_start1 = -1, -1 48 | old_ref_name = ref_name 49 | 50 | hap_0, hap_1 = var.samples[0]['GT'] 51 | diff_hap0, var_seq0 = len_var_seq(var, hap_0) 52 | diff_hap1, var_seq1 = len_var_seq(var, hap_1) 53 | if var.start > prev_start0 + overlap0 and var.start > prev_start1 + overlap1: # checking if there are overlaps 54 | dict_ref_alts[ref_name][var.start] = [var_seq0, var_seq1, hap_0, hap_1] 55 | # hap0 56 | prev_start0 = var.start 57 | overlap0 = len(var_seq0) - 1 if (diff_hap0 == 0) else diff_hap0 58 | # hap1 59 | prev_start1 = var.start 60 | overlap1 = len(var_seq1) - 1 if (diff_hap1 == 0) else diff_hap1 61 | else: # overlapping variants are consider conflicts 62 | dict_set_conflict_vars[ref_name].add(prev_start1) 63 | dict_set_conflict_vars[ref_name].add(var.start) 64 | return dict_set_conflict_vars, dict_ref_alts 65 | 66 | 67 | def hap_seq( 68 | var :pysam.VariantRecord, 69 | genotype:int 70 | )-> str : 71 | """ 72 | return variant sequence according to haplotype information 73 | """ 74 | if genotype == 0: 75 | return var.ref 76 | else: 77 | return var.alts[genotype - 1] 78 | 79 | 80 | def left_right_check(seq_hap0, seq_hap1): 81 | """ 82 | Check the extension direction of the repetitiveness 83 | return: 84 | - 0: right side extension 85 | - 1: left side extension 86 | - 2: both sides are extensible 87 | """ 88 | assert(seq_hap0 != seq_hap1) 89 | assert((seq_hap0 in seq_hap1) or (seq_hap1 in seq_hap0)) 90 | len_0 = len(seq_hap0) 91 | len_1 = len(seq_hap1) 92 | if len_0 > len_1: 93 | if seq_hap0[:len_1] == seq_hap1: 94 | return 0 # right side repetitive 95 | elif seq_hap0[-len_1:] == seq_hap1: 96 | return 1 # left side repetitive 97 | else: 98 | if seq_hap1[:len_0] == seq_hap0: 99 | return 0 # right side repetitive 100 | elif seq_hap1[-len_0:] == seq_hap0: 101 | return 1 # left side repetitive 102 | return 2 # in the middle 103 | 104 | 105 | def extend_ref_seq( 106 | seq_hap0, 107 | seq_hap1, 108 | ref_extend_0, 109 | ref_extend_1, 110 | flag_right=True 111 | )-> tuple: 112 | 
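# Illustrative walk-through with hypothetical sequences: for seq_hap0 = 'A' and
# seq_hap1 = 'AT' inside a T-run where both consensus haplotypes continue with
# 'TTG...', extending to the right appends one base at a time:
#     'AT'   vs 'ATT'    -> one still contains the other, keep extending
#     'ATT'  vs 'ATTT'   -> still indistinguishable
#     'ATTG' vs 'ATTTG'  -> distinguishable, so ('ATTG', 'ATTTG', 3) is returned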
""" 113 | Extend the seq_hap0 and seq_hap1 till they makes a difference 114 | """ 115 | seq_hap0_extend = seq_hap0 116 | seq_hap1_extend = seq_hap1 117 | assert((seq_hap0_extend in seq_hap1_extend) or (seq_hap1_extend in seq_hap0_extend)) 118 | len_iterate = min(len(ref_extend_0), len(ref_extend_1)) 119 | if flag_right: # extend to the right 120 | for idx in range(len_iterate): 121 | seq_hap0_extend += ref_extend_0[idx] 122 | seq_hap1_extend += ref_extend_1[idx] 123 | if (seq_hap0_extend in seq_hap1_extend) or (seq_hap1_extend in seq_hap0_extend): # still indistinguishable 124 | continue 125 | else: 126 | return seq_hap0_extend, seq_hap1_extend, idx+1 127 | else: # extend to the left 128 | for idx in range(len_iterate): 129 | seq_hap0_extend = ref_extend_0[-idx-1] + seq_hap0_extend 130 | seq_hap1_extend = ref_extend_1[-idx-1] + seq_hap1_extend 131 | if (seq_hap0_extend in seq_hap1_extend) or (seq_hap1_extend in seq_hap0_extend): # still indistinguishable 132 | continue 133 | else: 134 | return seq_hap0_extend, seq_hap1_extend, idx+1 135 | return seq_hap0_extend, seq_hap1_extend, False 136 | 137 | 138 | def extend_ref_seq_padding( 139 | seq_hap0, 140 | seq_hap1, 141 | ref_extend_0, 142 | ref_extend_1, 143 | flag_right=True, 144 | padding=5 145 | ): 146 | """ 147 | Call the extend_ref_seq and add padding in the end 148 | """ 149 | if flag_right: 150 | seq_hap0_extend, seq_hap1_extend, len_extend = extend_ref_seq(seq_hap0, seq_hap1, ref_extend_0[:-padding], ref_extend_1[:-padding], flag_right) 151 | if len_extend: 152 | return seq_hap0_extend + ref_extend_0[len_extend:len_extend+padding], seq_hap1_extend + ref_extend_1[len_extend:len_extend+padding], len_extend+padding 153 | else: 154 | return seq_hap0, seq_hap1, False 155 | else: 156 | seq_hap0_extend, seq_hap1_extend, len_extend = extend_ref_seq(seq_hap0, seq_hap1, ref_extend_0[padding:], ref_extend_1[padding:], flag_right) 157 | if len_extend: 158 | return ref_extend_0[-len_extend-padding:-len_extend] + seq_hap0_extend, ref_extend_1[-len_extend-padding:-len_extend] + seq_hap1_extend, len_extend+padding 159 | else: 160 | return seq_hap0, seq_hap1, False 161 | 162 | 163 | 164 | def nearest_left_right_var( 165 | left_0, 166 | right_0, 167 | f_hap0_fasta, 168 | left_1, 169 | right_1, 170 | f_hap1_fasta, 171 | ref_name, 172 | left_extend=40, 173 | right_extend=40 174 | ) -> tuple: 175 | left_seq_0 = f_hap0_fasta.fetch(reference=ref_name, start=left_0 - left_extend, end=left_0) 176 | left_seq_1 = f_hap1_fasta.fetch(reference=ref_name, start=left_1 - left_extend, end=left_1) 177 | left_var = -1 178 | for idx in range(left_extend-1, 0, -1): 179 | if left_seq_0[idx] != left_seq_1[idx]: 180 | left_var = left_extend-idx 181 | break 182 | right_seq_0 = f_hap0_fasta.fetch(reference=ref_name, start=right_0, end=right_0 + right_extend) 183 | right_seq_1 = f_hap1_fasta.fetch(reference=ref_name, start=right_1, end=right_1 + right_extend) 184 | right_var = -1 185 | for idx in range(right_extend): 186 | if right_seq_0[idx] != right_seq_1[idx]: 187 | right_var = idx 188 | break 189 | return left_var, right_var 190 | 191 | 192 | 193 | def check_coordinate( 194 | dict_ref_alts :dict, 195 | f_hap0_fasta :pysam.FastaFile, 196 | f_hap1_fasta :pysam.FastaFile, 197 | dict_ref_consensus_map0: dict, 198 | dict_ref_consensus_map1: dict, 199 | dict_set_conflict_vars: dict, 200 | extend_limit :int=100, 201 | padding :int=5 202 | ) -> dict: 203 | """ 204 | Make sure the mapping point result in the same sequence as shown in the vcf file 205 | dict_effective_variant {} 206 
| - key: var_start (at reference coordinate) 207 | - values: [flag_side, len_extend] # len_extend can be either right or left 208 | """ 209 | dict_effective_variant = {} 210 | count_discrepency = 0 211 | for ref_name, dict_var_seq in dict_ref_alts.items(): 212 | set_conflict = dict_set_conflict_vars[ref_name] 213 | for var_start, pair_var_seq in dict_var_seq.items(): 214 | if var_start in set_conflict: 215 | continue 216 | 217 | seq_hap0 = pair_var_seq[0] 218 | seq_hap1 = pair_var_seq[1] 219 | pos_map0 = dict_ref_consensus_map0[ref_name][var_start] 220 | pos_map1 = dict_ref_consensus_map1[ref_name][var_start] 221 | long_hap0 = f_hap0_fasta.fetch(reference=ref_name, start=pos_map0-padding, end=pos_map0 + len(seq_hap0)+padding) 222 | long_hap1 = f_hap1_fasta.fetch(reference=ref_name, start=pos_map1-padding, end=pos_map1 + len(seq_hap1)+padding) 223 | 224 | if long_hap0 != long_hap1: 225 | if (long_hap0 in long_hap1) or (long_hap1 in long_hap0): 226 | flag_side = left_right_check(long_hap0, long_hap1) # check which side the repetitive be 227 | if flag_side == 0: # right side 228 | # get additional extend_limit (default 100) bp from the reference 229 | extend_hap0 = f_hap0_fasta.fetch(reference=ref_name, start=pos_map0+len(seq_hap0)+padding, end=pos_map0+len(seq_hap0)+extend_limit) 230 | extend_hap1 = f_hap1_fasta.fetch(reference=ref_name, start=pos_map1+len(seq_hap1)+padding, end=pos_map1+len(seq_hap1)+extend_limit) 231 | effect_hap0, effect_hap1, len_extend = extend_ref_seq_padding(long_hap0, long_hap1, extend_hap0, extend_hap1, True, padding) 232 | if len_extend: 233 | left_var, right_var = nearest_left_right_var(pos_map0, pos_map0+len(seq_hap0), f_hap0_fasta, \ 234 | pos_map1, pos_map1+len(seq_hap1), f_hap1_fasta, ref_name, 40, 40+len_extend) 235 | dict_effective_variant[var_start] = (0, len_extend, left_var, right_var) 236 | else: 237 | print("--- 0 EFFECTIVE VARIANT too long at", var_start, seq_hap0, seq_hap1) 238 | elif flag_side == 1: # left side 239 | extend_hap0 = f_hap0_fasta.fetch(reference=ref_name, start=pos_map0-extend_limit-padding, end=pos_map0-padding) 240 | extend_hap1 = f_hap1_fasta.fetch(reference=ref_name, start=pos_map1-extend_limit-padding, end=pos_map1-padding) 241 | effect_hap0, effect_hap1, len_extend = extend_ref_seq_padding(long_hap0, long_hap1, extend_hap0, extend_hap1, False, padding) 242 | if len_extend: 243 | left_var, right_var = nearest_left_right_var(pos_map0, pos_map0+len(seq_hap0), f_hap0_fasta, \ 244 | pos_map1, pos_map1+len(seq_hap1), f_hap1_fasta, ref_name, 40+len_extend, 40) 245 | dict_effective_variant[var_start] = (1, len_extend, left_var, right_var) 246 | else: 247 | print("--- 1 EFFECTIVE VARIANT too long at", var_start, seq_hap0, seq_hap1) 248 | else: # both sides are extensible 249 | extend_hap0 = f_hap0_fasta.fetch(reference=ref_name, start=pos_map0+len(seq_hap0)+padding, end=pos_map0+len(seq_hap0)+extend_limit) 250 | extend_hap1 = f_hap1_fasta.fetch(reference=ref_name, start=pos_map1+len(seq_hap1)+padding, end=pos_map1+len(seq_hap1)+extend_limit) 251 | r_effect_hap0, r_effect_hap1, r_len_extend = extend_ref_seq_padding(long_hap0, long_hap1, extend_hap0, extend_hap1, True, padding) 252 | 253 | extend_hap0 = f_hap0_fasta.fetch(reference=ref_name, start=pos_map0-extend_limit-padding, end=pos_map0-padding) 254 | extend_hap1 = f_hap1_fasta.fetch(reference=ref_name, start=pos_map1-extend_limit-padding, end=pos_map1-padding) 255 | l_effect_hap0, l_effect_hap1, l_len_extend = extend_ref_seq_padding(long_hap0, long_hap1, extend_hap0, extend_hap1, False, 
padding) 256 | flag_extend = -1 257 | if l_len_extend == 0: # right anyway 258 | if r_len_extend == 0: 259 | print("--- 2 EFFECTIVE VARIANT ENCOUNTER at", var_start, seq_hap0, seq_hap1, "L", l_len_extend) 260 | else: 261 | flag_extend=0 262 | elif r_len_extend == 0: # left anyway 263 | flag_extend=1 264 | elif r_len_extend < l_len_extend: # right is better 265 | flag_extend=0 266 | else: # left is better 267 | flag_extend=1 268 | 269 | if flag_extend == 0: 270 | left_var, right_var = nearest_left_right_var(pos_map0, pos_map0+len(seq_hap0), f_hap0_fasta, \ 271 | pos_map1, pos_map1+len(seq_hap1), f_hap1_fasta, ref_name, 40, 40+r_len_extend) 272 | dict_effective_variant[var_start] = (0, r_len_extend, left_var, right_var) 273 | elif flag_extend == 1: 274 | left_var, right_var = nearest_left_right_var(pos_map0, pos_map0+len(seq_hap0), f_hap0_fasta, \ 275 | pos_map1, pos_map1+len(seq_hap1), f_hap1_fasta, ref_name, 40+l_len_extend, 40) 276 | dict_effective_variant[var_start] = (1, l_len_extend, left_var, right_var) 277 | 278 | fetch_hap_0 = long_hap0[padding:-padding] 279 | fetch_hap_1 = long_hap1[padding:-padding] 280 | if seq_hap0.upper() != fetch_hap_0.upper() and seq_hap0 != '*': 281 | print("Discrepency at", ref_name, str(var_start), str(pos_map0), "haplotype 0! Expect", seq_hap0, ", get", fetch_hap_0, "...") 282 | count_discrepency += 1 283 | if seq_hap1.upper() != fetch_hap_1.upper() and seq_hap1 != '*': 284 | print("Discrepency at", ref_name, str(var_start), str(pos_map1), "haplotype 1! Expect", seq_hap1, ", get", fetch_hap_1, "...") 285 | count_discrepency += 1 286 | print("Total Discrepency:", count_discrepency) 287 | return dict_effective_variant 288 | 289 | 290 | def variant_map( 291 | fn_chain :str, 292 | dict_ref_alts :dict, 293 | dict_set_conflict_vars :dict 294 | ) -> tuple: 295 | """ 296 | Using chain file to build the variant map 297 | mapping from reference to target genome coordinate 298 | """ 299 | dict_ref_consensus_map = {} 300 | for ref_name in dict_ref_alts.keys(): 301 | dict_ref_consensus_map[ref_name] = {} 302 | 303 | # Read and parse the chain file 304 | dict_chain_info = {} 305 | key_tuple = None 306 | fc = open(fn_chain, 'r') 307 | for line in fc: 308 | fields = line.strip().split() 309 | if len(fields) > 0 and fields[0] == "chain": 310 | key_tuple = tuple(fields) 311 | dict_chain_info[key_tuple] = [] 312 | else: 313 | dict_chain_info[key_tuple].append(fields) 314 | fc.close() 315 | 316 | for key_tuple, list_info in dict_chain_info.items(): 317 | assert(key_tuple[4] == key_tuple[9]) 318 | t_start = int(key_tuple[5]) 319 | assert(t_start == 0) # in this version, we only support one whole genome 320 | t_stop = int(key_tuple[6]) 321 | h_start = int(key_tuple[10]) 322 | ref_name = key_tuple[2] 323 | 324 | list_var_start = sorted(dict_ref_alts[ref_name].keys()) 325 | set_conflict = dict_set_conflict_vars[ref_name] 326 | idx_chain = 0 327 | pos_chain = t_start + int(list_info[idx_chain][0]) 328 | offset = 0 329 | for var_start in list_var_start: 330 | if var_start in set_conflict: 331 | continue 332 | elif var_start < t_start: 333 | continue 334 | elif var_start > t_stop: 335 | break 336 | 337 | if var_start < pos_chain: 338 | dict_ref_consensus_map[ref_name][var_start] = var_start + offset 339 | else: 340 | while pos_chain <= var_start: 341 | pos_chain += int(list_info[idx_chain][1]) 342 | offset -= int(list_info[idx_chain][1]) 343 | offset += int(list_info[idx_chain][2]) 344 | idx_chain += 1 345 | pos_chain += int(list_info[idx_chain][0]) 346 | 
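# Hypothetical example of the chain arithmetic: each data line of the chain
# file is read here as (aligned_block_size, unaligned_bases_in_reference,
# unaligned_bases_in_consensus), so the while-loop above accumulates
# offset = sum(consensus_gap - reference_gap). A data line '5000 0 2', for
# instance, shifts every downstream reference position 2 bp to the right in
# the consensus, and the variant is then mapped to var_start + offset below.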
dict_ref_consensus_map[ref_name][var_start] = var_start + offset 347 | return dict_ref_consensus_map 348 | 349 | 350 | def count_haps( 351 | dict_ref_alts :dict, 352 | f_sam0 :pysam.AlignmentFile, 353 | f_sam1 :pysam.AlignmentFile, 354 | dict_ref_consensus_map0 :dict, 355 | dict_ref_consensus_map1 :dict, 356 | dict_set_conflict_vars :dict, 357 | debug :bool=False 358 | ) -> dict: 359 | """ 360 | Count the number of reads in each golden haplotype sam covering the variants 361 | """ 362 | dict_ref_var_count = {} 363 | for ref_name, dict_vars in dict_ref_alts.items(): 364 | dict_ref_var_count[ref_name] = {} 365 | set_conflict = dict_set_conflict_vars[ref_name] 366 | for var_start, hap_seqs in dict_vars.items(): 367 | if var_start in set_conflict: 368 | continue 369 | if hap_seqs[2] == hap_seqs[3]: # if the var is homozygous 370 | continue 371 | hap0_start = dict_ref_consensus_map0[ref_name][var_start] 372 | hap0_stop = hap0_start + len(hap_seqs[0]) 373 | hap1_start = dict_ref_consensus_map1[ref_name][var_start] 374 | hap1_stop = hap1_start + len(hap_seqs[1]) 375 | 376 | # read numbers overlapping the variants 377 | count0 = f_sam0.count(contig=ref_name, start=hap0_start, stop=hap0_stop) 378 | count1 = f_sam1.count(contig=ref_name, start=hap1_start, stop=hap1_stop) 379 | if debug: 380 | print(ref_name, var_start, ':\n\thapA (' + str(count0) + "): ", end="") 381 | for read in f_sam0.fetch(contig=ref_name, start=hap0_start, stop=hap0_stop): 382 | print(read.query_name, end=", ") 383 | print("\n\thapB (" + str(count1) + "): ", end="") 384 | for read in f_sam1.fetch(contig=ref_name, start=hap1_start, stop=hap1_stop): 385 | print(read.query_name, end=", ") 386 | print("\n", end="") 387 | 388 | dict_ref_var_count[ref_name][var_start] = (count0,count1) 389 | return dict_ref_var_count 390 | 391 | 392 | def get_bound( 393 | hap0_start, 394 | hap0_stop, 395 | hap1_start, 396 | hap1_stop, 397 | len_hap0, 398 | len_hap1, 399 | flag_side, 400 | len_extend 401 | ) -> tuple: 402 | """ 403 | return: (eff_lbound0, eff_rbound0, eff_lbound1, eff_rbound1) 404 | """ 405 | min_len = min(len_hap0, len_hap1) 406 | if flag_side == 0: # extend to the right 407 | if len_hap0 < len_hap1: 408 | return (hap0_stop+len_extend, \ 409 | hap0_start, \ 410 | hap1_start+min_len+len_extend, \ 411 | hap1_stop) #min(hap1_stop, hap1_start+min_len+len_extend)) 412 | else: 413 | return (hap0_start+min_len+len_extend, \ 414 | hap0_stop, \ 415 | hap1_stop+len_extend, \ 416 | hap1_start) 417 | #min(hap0_stop, hap0_start+min_len+len_extend), \ 418 | else: # extend to the left 419 | if len_hap0 < len_hap1: 420 | return (hap0_stop, \ 421 | hap0_start-len_extend, \ 422 | hap1_start, \ 423 | hap1_stop-min_len-len_extend) 424 | #max(hap1_start, hap1_stop-min_len-len_extend), \ 425 | else: 426 | return (hap0_start, \ 427 | hap0_stop-min_len-len_extend, \ 428 | hap1_stop, \ 429 | hap1_start-len_extend) 430 | #max(hap0_start, hap0_stop-min_len-len_extend), \ 431 | 432 | 433 | 434 | def count_haps_n_report_name( 435 | dict_ref_alts :dict, 436 | f_sam0 :pysam.AlignmentFile, 437 | f_sam1 :pysam.AlignmentFile, 438 | dict_ref_consensus_map0 :dict, 439 | dict_ref_consensus_map1 :dict, 440 | dict_set_conflict_vars :dict, 441 | dict_effective_var :dict, 442 | padding :int=5, 443 | debug :bool=False 444 | ) -> dict: 445 | """ 446 | Count the number of reads in each golden haplotype sam covering the variants 447 | """ 448 | dict_ref_var_count = {} 449 | dict_ref_var_name = {} 450 | for ref_name, dict_vars in dict_ref_alts.items(): 451 | 
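# Summary of what this loop records (structure taken from the code below): for
# every heterozygous, non-conflicting site, dict_ref_var_count receives
# (count_hap0, count_hap1), while dict_ref_var_name receives either
# (name_set0, name_set1, count0, count1) or, when an effective-variant
# extension is used, (set_expand0, set_expand1, count0, count1, set_inside0,
# set_inside1); every set holds (query_name, is_read2) pairs.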
dict_ref_var_count[ref_name] = {} 452 | dict_ref_var_name [ref_name] = {} 453 | set_conflict = dict_set_conflict_vars[ref_name] 454 | 455 | len_dict_vars = len(dict_vars) 456 | for idx, (var_start, hap_seqs) in enumerate(dict_vars.items()): 457 | if var_start in set_conflict: 458 | continue 459 | if hap_seqs[2] == hap_seqs[3]: # if the var is homozygous 460 | continue 461 | hap0_start = dict_ref_consensus_map0[ref_name][var_start] 462 | hap0_stop = hap0_start + len(hap_seqs[0]) 463 | hap1_start = dict_ref_consensus_map1[ref_name][var_start] 464 | hap1_stop = hap1_start + len(hap_seqs[1]) 465 | 466 | #if var_start < 6611800: 467 | # continue 468 | # read numbers overlapping the variants 469 | if dict_effective_var.get(var_start): # if the site has larger effective var size 470 | flag_side, len_extend, left_var, right_var = dict_effective_var[var_start] 471 | min_len = min(len(hap_seqs[0]), len(hap_seqs[1])) 472 | if flag_side == 0: # right extend 473 | eff_start0 = hap0_start 474 | eff_stop0 = hap0_start + len(hap_seqs[0]) + len_extend 475 | eff_start1 = hap1_start 476 | eff_stop1 = hap1_start + len(hap_seqs[1]) + len_extend 477 | else: 478 | eff_start0 = hap0_stop - len(hap_seqs[0]) - len_extend 479 | eff_stop0 = hap0_stop 480 | eff_start1 = hap1_stop - len(hap_seqs[1]) - len_extend 481 | eff_stop1 = hap1_stop 482 | eff_lbound0, eff_rbound0, eff_lbound1, eff_rbound1 = get_bound(hap0_start, hap0_stop, hap1_start, hap1_stop, \ 483 | len(hap_seqs[0]), len(hap_seqs[1]), flag_side, len_extend) 484 | # compensate for the nearby variants 485 | if left_var != -1: 486 | eff_lbound0 = min(eff_lbound0, hap0_start-left_var) 487 | eff_lbound1 = min(eff_lbound1, hap1_start-left_var) 488 | if right_var != -1: 489 | eff_rbound0 = max(eff_rbound0, hap0_stop+right_var) 490 | eff_rbound1 = max(eff_rbound1, hap1_stop+right_var) 491 | 492 | read_segment0 = f_sam0.fetch(contig=ref_name, start=eff_start0, stop=eff_stop0) 493 | set_expand0 = set() 494 | set_inside0 = set() 495 | for read in read_segment0: 496 | if read.reference_end >= eff_lbound0 and read.reference_start <= eff_rbound0: 497 | set_expand0.add((read.query_name, read.is_read2)) 498 | else: 499 | set_inside0.add((read.query_name, read.is_read2)) 500 | 501 | read_segment1 = f_sam1.fetch(contig=ref_name, start=eff_start1, stop=eff_stop1) 502 | set_expand1 = set() 503 | set_inside1 = set() 504 | #print(hap1_start, hap1_start+len(hap_seqs[0]), len_extend, left_var, right_var, flag_side) 505 | #print(eff_lbound1, eff_rbound1) 506 | for read in read_segment1: 507 | if read.reference_end >= eff_lbound1 and read.reference_start <= eff_rbound1: 508 | set_expand1.add((read.query_name, read.is_read2)) 509 | else: 510 | set_inside1.add((read.query_name, read.is_read2)) 511 | 512 | """ 513 | if var_start == 6611841: 514 | print(flag_side, len_extend) 515 | print(hap_seqs[0], hap_seqs[1]) 516 | print(hap0_start, hap0_stop) 517 | print(hap1_start, hap1_stop) 518 | 519 | print(set_expand1) 520 | print(set_inside1) 521 | print(hap0_start, eff_start0, hap0_stop, eff_stop0) 522 | print(hap1_start, eff_start1, hap1_stop, eff_stop1) 523 | print(len(set_expand0) + len(set_expand1)) 524 | 525 | read_segment0_start = f_sam0.fetch(contig=ref_name, start=eff_start0) 526 | read_segment0_stop = f_sam0.fetch(contig=ref_name, start=eff_stop0) 527 | read_segment1_start = f_sam1.fetch(contig=ref_name, start=eff_start1) 528 | read_segment1_stop = f_sam1.fetch(contig=ref_name, start=eff_stop1) 529 | name_set0_start = set([read.query_name for read in read_segment0_start]) 530 | 
name_set0_stop = set([read.query_name for read in read_segment0_stop]) 531 | name_set1_start = set([read.query_name for read in read_segment1_start]) 532 | name_set1_stop = set([read.query_name for read in read_segment1_stop]) 533 | 534 | name_set0 = name_set0_start.intersection(name_set0_stop) 535 | name_set1 = name_set1_start.intersection(name_set1_stop) 536 | count0 = len(name_set0) 537 | count1 = len(name_set1) 538 | #symmetric_difference 539 | print(var_start) 540 | print(name_set0_start.symmetric_difference(name_set0_stop)) 541 | print(name_set1_start.symmetric_difference(name_set1_stop))""" 542 | 543 | count0 = len(set_expand0) 544 | count1 = len(set_expand1) 545 | dict_ref_var_count[ref_name][var_start] = (count0, count1) 546 | dict_ref_var_name [ref_name][var_start] = (set_expand0, set_expand1, count0, count1, set_inside0, set_inside1) 547 | else: 548 | read_segment0 = f_sam0.fetch(contig=ref_name, start=hap0_start, stop=hap0_stop) 549 | name_set0 = set([(read.query_name, read.is_read2) for read in read_segment0]) 550 | count0 = len(name_set0) 551 | read_segment1 = f_sam1.fetch(contig=ref_name, start=hap1_start, stop=hap1_stop) 552 | name_set1 = set([(read.query_name, read.is_read2) for read in read_segment1]) 553 | count1 = len(name_set1) 554 | 555 | dict_ref_var_count[ref_name][var_start] = (count0, count1) 556 | dict_ref_var_name [ref_name][var_start] = (name_set0, name_set1, count0, count1) 557 | 558 | if debug: 559 | print(ref_name, var_start, ':\n\thapA (' + str(count0) + "): ", end="") 560 | for read in f_sam0.fetch(contig=ref_name, start=hap0_start, stop=hap0_stop): 561 | print(read.query_name, end=", ") 562 | print("\n\thapB (" + str(count1) + "): ", end="") 563 | for read in f_sam1.fetch(contig=ref_name, start=hap1_start, stop=hap1_stop): 564 | print(read.query_name, end=", ") 565 | print("\n", end="") 566 | return dict_ref_var_count, dict_ref_var_name 567 | 568 | 569 | def output_report( 570 | f_vcf :pysam.VariantFile, 571 | dict_ref_var_count :dict, 572 | fn_output :str 573 | ) -> None: 574 | """ 575 | ourput report 576 | """ 577 | f_all = open(fn_output, 'w') 578 | f_gap = open(fn_output + '.gap', 'w') 579 | f_SNP = open(fn_output + '.SNP', 'w') 580 | f_all.write("CHR\tHET_SITE\tGOLDEN_DISTRIBUTION\tREF_COUNT\tALT_COUNT\tGAP\n") 581 | f_gap.write("CHR\tHET_SITE\tGOLDEN_DISTRIBUTION\tREF_COUNT\tALT_COUNT\n") 582 | f_SNP.write("CHR\tHET_SITE\tGOLDEN_DISTRIBUTION\tREF_COUNT\tALT_COUNT\n") 583 | for var in f_vcf: 584 | hap_0, hap_1 = var.samples[0]['GT'] 585 | if hap_0 != 0 and hap_1 != 0: 586 | continue 587 | ref_name = var.contig 588 | if dict_ref_var_count[ref_name].get(var.start): # Exist legal variant 589 | count0, count1 = dict_ref_var_count[ref_name][var.start] 590 | len_var = 0 591 | if hap_0 == 0: 592 | read_distribution = count0/max((count0+count1),0.001) 593 | distring = format(read_distribution, '.8f') + '\t' + str(count0) + '\t' + str(count1) 594 | len_var = len(var.alts[hap_1-1]) 595 | else: 596 | read_distribution = count1/max((count0+count1),0.001) 597 | distring = format(read_distribution, '.8f') + '\t' + str(count1) + '\t' + str(count0) 598 | len_var = len(var.alts[hap_0-1]) 599 | f_all.write(ref_name + '\t' + str(var.start+1) + '\t' + distring + '\t') 600 | if len(var.ref) != len_var: 601 | f_gap.write(ref_name + '\t' + str(var.start+1) + '\t' + distring + '\n') 602 | f_all.write('.\n') 603 | else: 604 | f_SNP.write(ref_name + '\t' + str(var.start+1) + '\t' + distring + '\n') 605 | f_all.write('\n') 606 | 607 | f_all.close() 608 | f_gap.close() 609 | 
f_SNP.close() 610 | 611 | 612 | 613 | if __name__ == "__main__": 614 | parser = argparse.ArgumentParser() 615 | parser.add_argument('-v', '--vcf', help='vcf file') 616 | parser.add_argument('-c0', '--hap0_chain', help='hap0 chain file') 617 | parser.add_argument('-c1', '--hap1_chain', help='hap1 chain file') 618 | parser.add_argument('-f0', '--hap0_fasta', help='hap0 consensus fasta file') 619 | parser.add_argument('-f1', '--hap1_fasta', help='hap1 consensus fasta file') 620 | parser.add_argument('-s0', '--hap0_sam', help='hap0 sam file') 621 | parser.add_argument('-s1', '--hap1_sam', help='hap1 sam file') 622 | parser.add_argument('-o', '--out', help='output file') 623 | args = parser.parse_args() 624 | 625 | fn_vcf = args.vcf 626 | fn_chain0 = args.hap0_chain 627 | fn_chain1 = args.hap1_chain 628 | fn_hap0_fasta = args.hap0_fasta 629 | fn_hap1_fasta = args.hap1_fasta 630 | fn_sam0 = args.hap0_sam 631 | fn_sam1 = args.hap1_sam 632 | fn_output = args.out 633 | var_chain = 25 634 | 635 | f_vcf = pysam.VariantFile(fn_vcf) 636 | f_hap0_fasta = pysam.FastaFile(fn_hap0_fasta) 637 | f_hap1_fasta = pysam.FastaFile(fn_hap1_fasta) 638 | print("Start locating variants and the conflicting variants...") 639 | dict_set_conflict_vars, dict_ref_alts = variant_seq( 640 | f_vcf=f_vcf, 641 | f_fasta=f_hap0_fasta 642 | ) 643 | # extend conflict set 644 | for ref_name in dict_set_conflict_vars.keys(): 645 | for pos in list(dict_set_conflict_vars[ref_name]): 646 | for extend in range(pos-var_chain, pos+var_chain): 647 | dict_set_conflict_vars[ref_name].add(extend) 648 | print("Start building the mapping consensus coordinate...") 649 | dict_ref_consensus_map0 = variant_map( 650 | fn_chain=fn_chain0, 651 | dict_ref_alts=dict_ref_alts, 652 | dict_set_conflict_vars=dict_set_conflict_vars 653 | ) 654 | dict_ref_consensus_map1 = variant_map( 655 | fn_chain=fn_chain1, 656 | dict_ref_alts=dict_ref_alts, 657 | dict_set_conflict_vars=dict_set_conflict_vars 658 | ) 659 | # obsolete if you are confident 660 | print("Checking if the coordinate is correct...") 661 | dict_effective_var = check_coordinate( 662 | dict_ref_alts=dict_ref_alts, 663 | f_hap0_fasta=f_hap0_fasta, 664 | f_hap1_fasta=f_hap1_fasta, 665 | dict_ref_consensus_map0=dict_ref_consensus_map0, 666 | dict_ref_consensus_map1=dict_ref_consensus_map1, 667 | dict_set_conflict_vars=dict_set_conflict_vars, 668 | padding=10 669 | ) 670 | print("Checking the simulation sam file covering of the variants") 671 | f_sam0 = pysam.AlignmentFile(fn_sam0) 672 | f_sam1 = pysam.AlignmentFile(fn_sam1) 673 | dict_ref_var_count, dict_ref_var_name = count_haps_n_report_name( 674 | dict_ref_alts=dict_ref_alts, 675 | f_sam0=f_sam0, 676 | f_sam1=f_sam1, 677 | dict_ref_consensus_map0=dict_ref_consensus_map0, 678 | dict_ref_consensus_map1=dict_ref_consensus_map1, 679 | dict_set_conflict_vars=dict_set_conflict_vars, 680 | dict_effective_var=dict_effective_var, 681 | padding=10, 682 | debug=False 683 | ) 684 | f_vcf = pysam.VariantFile(fn_vcf) 685 | print("Start output report...") 686 | output_report( 687 | f_vcf=f_vcf, 688 | dict_ref_var_count=dict_ref_var_count, 689 | fn_output=fn_output) 690 | print("Dump golden read names pickle file...") 691 | with open(fn_output + '.pickle', 'wb') as f: 692 | pickle.dump(dict_ref_var_name, f) 693 | 694 | --------------------------------------------------------------------------------
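The pickle dumped at the end of `consensus_vcf_map_adaptive.py` is the same structure that `ref_bi_naive.py` consumes through its `--golden_pickle` option. Below is a minimal sketch of inspecting it, assuming a hypothetical output prefix `golden` (so the file is `golden.pickle`); the field layout follows the tuples built in `count_haps_n_report_name`.

```python
import pickle

# Load the golden read-name dictionary written by consensus_vcf_map_adaptive.py
# ("golden" is a placeholder output prefix).
with open("golden.pickle", "rb") as f:
    dict_ref_var_name = pickle.load(f)

# One entry per contig and per heterozygous, non-conflicting variant start
# (0-based, as returned by pysam).
for ref_name, dict_vars in dict_ref_var_name.items():
    for var_start, info in dict_vars.items():
        # info[0] / info[1] are sets of (query_name, is_read2) pairs drawn from
        # the hapA / hapB simulation BAMs; this is the same membership test
        # ref_bi_naive.py performs in compare_sam_to_haps().
        reads_hap0, reads_hap1 = info[0], info[1]
        print(ref_name, var_start + 1, len(reads_hap0), len(reads_hap1))
```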