├── biastools ├── __init__.py ├── biastools_compare.sh ├── vcf_to_bed.py ├── biastools_predict.sh ├── filter_het_VCF.py ├── biastools_align.sh ├── predict_model.py ├── biastools_simulation.sh ├── biastools_analysis.sh ├── compare_bias_with_RD.py ├── sample_baseline.py ├── merge_baseline.py ├── biastools_scan.py ├── golden_graph_report.py ├── biastools.py ├── predict_experiment.py ├── indel_balance_plot.py ├── golden_graph.py ├── scanning_bias.py ├── ref_bi_naive.py └── consensus_vcf_map_adaptive.py ├── tutorial ├── HG002.chr20.part.vcf.gz ├── run.sh └── README.md ├── figures ├── context-aware-assignment.png ├── HG002.GIAB.4.2.1.demo.indel_balance.png └── context_aware.md ├── .gitignore ├── LICENSE ├── setup.py └── README.md /biastools/__init__.py: -------------------------------------------------------------------------------- 1 | #biastools 2 | import biastools.biastools 3 | import biastools.biastools_scan 4 | -------------------------------------------------------------------------------- /tutorial/HG002.chr20.part.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maojanlin/biastools/HEAD/tutorial/HG002.chr20.part.vcf.gz -------------------------------------------------------------------------------- /figures/context-aware-assignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maojanlin/biastools/HEAD/figures/context-aware-assignment.png -------------------------------------------------------------------------------- /figures/HG002.GIAB.4.2.1.demo.indel_balance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maojanlin/biastools/HEAD/figures/HG002.GIAB.4.2.1.demo.indel_balance.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.vcf 2 | *.fna 3 | *.fa 4 | *.vcf.* 5 | *.?am 6 | *.f* 7 | find_reads_given_HET.py 8 | region_spec_refbi.py 9 | old_biastools.sh 10 | -------------------------------------------------------------------------------- /tutorial/run.sh: -------------------------------------------------------------------------------- 1 | biastools --simulate -g grch38_chr20_part.fa -v HG002.chr20.part.vcf.gz -s HG002_part -r tutorial 2 | biastools --align -a bowtie2 -g grch38_chr20_part.fa -v HG002.chr20.part.vcf.gz -s HG002_part -r tutorial 3 | biastools --analyze -g grch38_chr20_part.fa -v HG002.chr20.part.vcf.gz -s HG002_part -r tutorial 4 | biastools_scan --scan -g grch38_chr20_part.fa -s HG002_part -r tutorial -i out_dir/HG002_part.tutorial.sorted.bam 5 | -------------------------------------------------------------------------------- /biastools/biastools_compare.sh: -------------------------------------------------------------------------------- 1 | path_out=$1 2 | sample_id=$2 3 | run_id=$3 4 | target_bed=$4 5 | improve_bed=$5 6 | improve_lowRd=$6 7 | path_module=$7 8 | prefix=${path_out}/${sample_id} 9 | 10 | bedtools subtract -a ${improve_bed} -b ${improve_lowRd} > ${prefix}.improve.goodRd.bias.bed 11 | bedtools intersect -a ${target_bed} -b ${improve_lowRd} > ${prefix}.improve.skipped.bias.bed 12 | 13 | python3 ${path_module}compare_bias_with_RD.py -lt ${target_bed} -li ${prefix}.improve.goodRd.bias.bed -lrd ${prefix}.improve.skipped.bias.bed -out ${prefix}.${run_id}.improve.bias.bed 14 | #python3 check_inside_centromere.py -lr1 
centromere_extend.bed -lr2 ${prefix}.${run_id}.improve.bias.bed 15 | -------------------------------------------------------------------------------- /biastools/vcf_to_bed.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pysam 3 | 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser(description="Generate a BED file from a VCF file.") 7 | parser.add_argument('-v', '--vcf', help='list of vcf files for input', required=True) 8 | parser.add_argument('-o', '--out', help='output bed file', required=True) 9 | args = parser.parse_args() 10 | 11 | vcf_path = args.vcf 12 | vcf = pysam.VariantFile(vcf_path) 13 | fo = open(args.out, 'w') 14 | for record in vcf: 15 | chrom = record.chrom 16 | start = record.start 17 | end = record.stop 18 | fo.write(f"{chrom}\t{start}\t{end}\n") 19 | fo.close() 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /biastools/biastools_predict.sh: -------------------------------------------------------------------------------- 1 | path_out=$1 2 | sample_id=$2 3 | run_id=$3 4 | flag_real=$4 5 | report_real=$5 6 | report_simulation=$6 7 | path_module=$7 8 | prefix=${path_out}/${sample_id} 9 | 10 | 11 | if [[ ${report_real} == 'none' ]]; then 12 | report_real=${prefix}.real.${run_id}.bias 13 | fi 14 | 15 | if [[ ${flag_real} == 1 || ${report_simulation} == 'none' ]]; then 16 | echo "[Biastools] Real report bias prediction." 17 | python3 ${path_module}predict_model.py -rr ${report_real} -out ${prefix}.real.${run_id} 18 | else 19 | echo "[Biastools] Bias prediction based on simulation report!" 20 | python3 ${path_module}predict_experiment.py -sr ${report_simulation} \ 21 | -rr ${report_real} \ 22 | -out ${prefix}.sim.${run_id} 23 | fi 24 | -------------------------------------------------------------------------------- /tutorial/README.md: -------------------------------------------------------------------------------- 1 | # Tutorial: running biastools 2 | 3 | In the tutorial, there are two initial files: 4 | - ```grch38_chr20_part.fa```, which contains the first 506,000 bases of chromosome 20 of GRCh38 5 | - ```HG002.chr20.part.vcf.gz```, which is the VCF file containing the first 1612 variants of HG002's chr20 called by the Q100 project 6 | 7 | After installation, the user can run the ```run.sh``` script, which simulates reads from the reference genome and VCF file, aligns the simulated reads with Bowtie 2, 8 | and analyzes the alignment with the context-aware assignment method. Finally, the biastools scan mode is used to scan the whole alignment BAM file. 9 | 10 | If biastools is not installed, users can also directly call 11 | ``` 12 | python3 biastools/biastools.py 13 | ``` 14 | and 15 | ``` 16 | python3 biastools/biastools_scan.py 17 | ``` 18 | to run the procedure.
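For concreteness, below is a minimal sketch of the non-installed invocation, mirroring the commands in ```run.sh``` via Python's `subprocess` (the same way the biastools wrappers drive external tools). It assumes it is run from the repository root and that the wrapper scripts accept the same arguments as the installed `biastools`/`biastools_scan` entry points.

```python
# Sketch only: mirrors run.sh, but calls the wrapper scripts directly without installation.
# Assumes the working directory is the repository root with the tutorial files present.
import subprocess

common = ["-g", "grch38_chr20_part.fa", "-v", "HG002.chr20.part.vcf.gz",
          "-s", "HG002_part", "-r", "tutorial"]

subprocess.run(["python3", "biastools/biastools.py", "--simulate"] + common, check=True)
subprocess.run(["python3", "biastools/biastools.py", "--align", "-a", "bowtie2"] + common, check=True)
subprocess.run(["python3", "biastools/biastools.py", "--analyze"] + common, check=True)
subprocess.run(["python3", "biastools/biastools_scan.py", "--scan",
                "-g", "grch38_chr20_part.fa", "-s", "HG002_part", "-r", "tutorial",
                "-i", "out_dir/HG002_part.tutorial.sorted.bam"], check=True)
```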
19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Mao-Jan Lin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /biastools/filter_het_VCF.py: -------------------------------------------------------------------------------- 1 | #program to find HET sites from VCF 2 | import argparse 3 | import pysam 4 | 5 | def parse_het_site(fn_vcf, fn_output): 6 | in_vcf_file = pysam.VariantFile(fn_vcf, 'r') 7 | out_vcf_file = pysam.VariantFile(fn_output, 'w', header=in_vcf_file.header) 8 | for segment in in_vcf_file: 9 | #hap_info = str(segment).split()[9].split('|') # "0|0", "1|0", "0|1" tag 10 | #if hap_info[0] != hap_info[1]: 11 | phase_info = segment.samples[0]['GT'] 12 | if len(phase_info) != 2: 13 | print("WARNING! non diploid haplotype.") 14 | continue 15 | hap_0, hap_1 = phase_info 16 | if hap_0 == None or hap_1 == None: 17 | print("WARNING! one haplotype information is missing.") 18 | continue 19 | if hap_0 + hap_1 != 0: 20 | out_vcf_file.write(segment) 21 | in_vcf_file.close() 22 | out_vcf_file.close() 23 | 24 | if __name__ == "__main__": 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('-v', '--vcf', help='vcf/vcf.gz file for chromosomes') 27 | parser.add_argument('-o', '--out', help='output vcf.gz file with HET sites') 28 | args = parser.parse_args() 29 | 30 | fn_vcf = args.vcf 31 | fn_output = args.out 32 | 33 | parse_het_site(fn_vcf, fn_output) 34 | -------------------------------------------------------------------------------- /biastools/biastools_align.sh: -------------------------------------------------------------------------------- 1 | path_ref=$1 2 | path_vcf=$2 3 | path_out=$3 4 | sample_id=$4 5 | THR=$5 6 | ALN=$6 7 | ALN_IDX=$7 8 | run_id=$8 9 | path_module=$9 10 | prefix=${path_out}/${sample_id} 11 | 12 | echo "[Biastools] Align sequences to the original reference" 13 | if [[ ${ALN_IDX} == 'none' ]]; then 14 | ALN_IDX=${path_ref} 15 | fi 16 | 17 | if [[ ${ALN} == "bowtie2" ]]; then 18 | echo "[Biastools] Align with bowtie2" 19 | if [ ! 
-f ${ALN_IDX}.1.bt2 ]; then 20 | bowtie2-build ${path_ref} ${ALN_IDX} 21 | fi 22 | bowtie2 -p ${THR} -x ${ALN_IDX} --rg-id ${run_id}_hapA --rg SM:${sample_id} -1 ${prefix}.hapA_1.fq.gz -2 ${prefix}.hapA_2.fq.gz |\ 23 | samtools sort -o ${prefix}.hapA.${run_id}.sorted.bam 24 | bowtie2 -p ${THR} -x ${ALN_IDX} --rg-id ${run_id}_hapB --rg SM:${sample_id} -1 ${prefix}.hapB_1.fq.gz -2 ${prefix}.hapB_2.fq.gz |\ 25 | samtools sort -o ${prefix}.hapB.${run_id}.sorted.bam 26 | elif [[ ${ALN} == "bwamem" ]]; then 27 | echo "[Biastools] Align with BWA MEM" 28 | if [ ! -f ${ALN_IDX}.bwt ]; then 29 | bwa index ${path_ref} -p ${ALN_IDX} 30 | fi 31 | bwa mem -t ${THR} ${ALN_IDX} ${prefix}.hapA_1.fq.gz ${prefix}.hapA_2.fq.gz -R "@RG\tID:${run_id}_hapA\tSM:${sample_id}" |\ 32 | samtools sort -@ ${THR} -o ${prefix}.hapA.${run_id}.sorted.bam - 33 | bwa mem -t ${THR} ${ALN_IDX} ${prefix}.hapB_1.fq.gz ${prefix}.hapB_2.fq.gz -R "@RG\tID:${run_id}_hapB\tSM:${sample_id}" |\ 34 | samtools sort -@ ${THR} -o ${prefix}.hapB.${run_id}.sorted.bam - 35 | fi 36 | samtools merge -f ${prefix}.${run_id}.sorted.bam ${prefix}.hapA.${run_id}.sorted.bam ${prefix}.hapB.${run_id}.sorted.bam 37 | 38 | 39 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | setup( 3 | name = 'biastools', 4 | packages = ['biastools'], 5 | version = '0.3.1', 6 | license='MIT', 7 | description = 'The toolkits to analyze reference bias of short DNA read alignment.', 8 | author = 'Mao-Jan Lin', 9 | author_email = 'mj.maojanlin@gmail.com', 10 | url = 'https://github.com/maojanlin/biastools', 11 | download_url = 'https://github.com/maojanlin/biastools/tarball/master', 12 | keywords = ['biastools', 'reference bias', 'alignment'], 13 | install_requires=[ 14 | 'numpy', 15 | 'pysam', 16 | 'pandas', 17 | 'matplotlib', 18 | 'seaborn', 19 | 'scikit-learn', 20 | 'scipy' 21 | ], 22 | include_package_data=True, 23 | data_files=[('biastools', ['biastools/biastools_align.sh', 24 | 'biastools/biastools_compare.sh', 25 | 'biastools/biastools_simulation.sh', 26 | 'biastools/biastools_analysis.sh', 27 | 'biastools/biastools_predict.sh'])], 28 | zip_safe = False, 29 | classifiers=[ 30 | 'Development Status :: 3 - Alpha', 31 | 'Intended Audience :: Developers', 32 | 'Topic :: Software Development :: Build Tools', 33 | 'License :: OSI Approved :: MIT License', 34 | 'Programming Language :: Python :: 3', 35 | 'Programming Language :: Python :: 3.4', 36 | 'Programming Language :: Python :: 3.5', 37 | 'Programming Language :: Python :: 3.6', 38 | ], 39 | entry_points={"console_scripts": ["biastools = biastools.biastools:main","biastools_scan = biastools.biastools_scan:main"],}, 40 | ) 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /biastools/predict_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | 7 | def predict_bias(real_feature, miss_info, best_threshold, out_prefix): 8 | """ 9 | quality score * balance score 10 | """ 11 | real_feature['z_MAPQ'] = ((real_feature['AVG_MAPQ'] - 45) * -1).clip(lower=0) 12 | real_feature['combine_score'] = (real_feature['z_MAPQ']) * (real_feature['BALANCE']) #* (real_feature['BALANCE']) 13 | real_feature['plus_score'] = (real_feature['z_MAPQ']/45) + 1.5*real_feature['BALANCE'] 14 | 15 | 
print(real_feature[real_feature['plus_score'] > best_threshold]) 16 | real_feature[real_feature['plus_score'] > best_threshold].to_csv(out_prefix + "_bias.tsv", index=False, sep = "\t") 17 | 18 | 19 | 20 | 21 | if __name__ == '__main__': 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('-rr', '--real_report', help='the real data bias report') 24 | parser.add_argument('-thr', '--threshold', help='the threshold for prediction model [1.5]', type=int, default=1.5) 25 | parser.add_argument('-out', '--out_prefix', help='the prefix for reports [predict]', type=str, default='predict') 26 | args = parser.parse_args() 27 | 28 | fn_real = args.real_report 29 | best_th = args.threshold 30 | out_prefix = args.out_prefix 31 | 32 | df_real = pd.read_csv(fn_real, sep='\t') 33 | 34 | # filter out the sites suspicious of imcomplete vcf information 35 | miss_info = (df_real['OTHER'] > df_real['NUM_READS'] * 0.9) + (df_real['OTHER'] > df_real['NUM_READS'] * 0.4) * \ 36 | ( (df_real['REF'] == 0) + (df_real['ALT'] == 0 )) 37 | 38 | df_real[miss_info].to_csv(out_prefix + "_suspicious.tsv", index=False, sep = "\t") 39 | print("filtered number:", sum(miss_info)) 40 | 41 | df_real_test = df_real[~miss_info] 42 | predict_bias(df_real_test, miss_info, best_th, out_prefix) 43 | 44 | 45 | -------------------------------------------------------------------------------- /biastools/biastools_simulation.sh: -------------------------------------------------------------------------------- 1 | path_ref=$1 2 | path_vcf=$2 3 | path_out=$3 4 | sample_id=$4 5 | THR=$5 6 | coverage=$6 7 | path_module=$7 8 | prefix=${path_out}/${sample_id} 9 | 10 | if [ ! -f "${path_ref}.fai" ]; then 11 | samtools faidx ${path_ref} 12 | fi 13 | 14 | bcftools norm -f ${path_ref} ${path_vcf} -m +any -Oz -o ${prefix}.normalized.vcf.gz 15 | bcftools index ${prefix}.normalized.vcf.gz 16 | 17 | echo "[Biastools] Generate haplotype consensus reference sequence" 18 | bcftools consensus -f ${path_ref} -o ${prefix}.hapA.fa -H 1 ${prefix}.normalized.vcf.gz -c ${prefix}.ref2hapA.chain 19 | bcftools consensus -f ${path_ref} -o ${prefix}.hapB.fa -H 2 ${prefix}.normalized.vcf.gz -c ${prefix}.ref2hapB.chain 20 | samtools faidx ${prefix}.hapA.fa 21 | samtools faidx ${prefix}.hapB.fa 22 | 23 | echo "[Biastools] Calculate how many reads should be generated" 24 | total_base=$(( $( cut -f2 ${path_ref}.fai | paste -s -d+ ) )) 25 | read_num=$(expr ${total_base} / 151 / 4 \* ${coverage}) 26 | echo "generating ${read_num} 2x151 reads in each haplotype" 27 | 28 | echo "[Biastools] Simulate sequences" 29 | mason_simulator --illumina-read-length 151 --num-threads ${THR} -ir ${prefix}.hapA.fa -o ${prefix}.hapA_1.fq -or ${prefix}.hapA_2.fq -oa ${prefix}.gt.hapA.sam -n ${read_num} 30 | mason_simulator --illumina-read-length 151 --num-threads ${THR} -ir ${prefix}.hapB.fa -o ${prefix}.hapB_1.fq -or ${prefix}.hapB_2.fq -oa ${prefix}.gt.hapB.sam -n ${read_num} --seed 9388 31 | samtools sort -@ ${THR} ${prefix}.gt.hapA.sam > ${prefix}.gt.hapA.sorted.bam 32 | samtools sort -@ ${THR} ${prefix}.gt.hapB.sam > ${prefix}.gt.hapB.sorted.bam 33 | samtools index ${prefix}.gt.hapA.sorted.bam 34 | samtools index ${prefix}.gt.hapB.sorted.bam 35 | rm ${prefix}.gt.hapA.sam 36 | rm ${prefix}.gt.hapB.sam 37 | 38 | gzip -f ${prefix}.hapA_1.fq 39 | gzip -f ${prefix}.hapA_2.fq 40 | gzip -f ${prefix}.hapB_1.fq 41 | gzip -f ${prefix}.hapB_2.fq 42 | 43 | echo "[Biastools] Filter the heterozygous site in vcf file" 44 | python3 ${path_module}filter_het_VCF.py -v ${prefix}.normalized.vcf.gz 
-o ${prefix}.het.vcf.gz 45 | tabix -p vcf ${prefix}.het.vcf.gz 46 | 47 | echo "[Biastools] Generate golden distribution report" 48 | python3 ${path_module}consensus_vcf_map_adaptive.py -v ${prefix}.het.vcf.gz \ 49 | -c0 ${prefix}.ref2hapA.chain \ 50 | -c1 ${prefix}.ref2hapB.chain \ 51 | -f0 ${prefix}.hapA.fa \ 52 | -f1 ${prefix}.hapB.fa \ 53 | -s0 ${prefix}.gt.hapA.sorted.bam \ 54 | -s1 ${prefix}.gt.hapB.sorted.bam \ 55 | -o ${prefix}.golden.rpt 56 | 57 | 58 | -------------------------------------------------------------------------------- /figures/context_aware.md: -------------------------------------------------------------------------------- 1 | ## Context-aware assignment algorithm 2 | This method works by searching for the REF and ALT alleles, together with some of their flanking sequence, within the sequences of all the reads whose alignments overlap the variant. 3 | Details are given in the [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-024-03240-8). 4 | ![diagram](./context-aware-assignment.png) 5 | 6 | ### Cohort assignment 7 | Variants appearing within a short distance of each other (default: 25 bp) are grouped together into a “cohort.” The cohorts are compared in the same style as the local assignment. 8 | 9 | ### Local assignment 10 | The read sequence is compared to the `hap1` and `hap2` alleles of each variant. To account for discrepancies between the read alignment and the VCF representation, multiple anchor points are tested (see Fig. 7 in the [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-024-03240-8)). If a read perfectly matches one of `hap1` or `hap2`, it is assigned accordingly. 11 | If the read matches both `hap1` and `hap2`, a situation commonly observed in short tandem repeats, the read is assigned as `both`. 12 | Reads that do not perfectly match either `hap1` or `hap2` under any anchor point are categorized as `others`. 13 | 14 | As shown in the pipeline figure example, the variant is an insertion. The `hap1` sequence includes the inserted segment along with 5 bp of flanking sequence on both sides, while `hap2` consists of only the flanking sequences without the insertion. 15 | Read1 and read2 can be successfully assigned, whereas read3 and read4 are categorized as `others` due to sequencing errors. 16 | 17 | ### Edit-distance assignment 18 | Starting from biastools v0.3.0, an additional edit-distance assignment step is introduced to mitigate the impact of sequencing errors on reads previously categorized as ``others`` in the first two stages. 19 | As shown in the figure, the edit distance between read3 and `hap1` is computed (value: 1) as well as the distance to `hap2` (value: 3). The edit distances are then normalized 20 | by dividing by the lengths of `hap1` (longer) and `hap2` (shorter), respectively. Since the normalized edit distance to `hap1` is smaller, read3 is assigned to `hap1`. 21 | 22 | #### Conditions 23 | By default, the edit-distance assignment is applied only when the following conditions are met: 24 | 1. The edit distance between the read and the finally assigned haplotype is less than or equal to 5. 25 | 2. The length of the gap is shorter than 20 bp. 26 | 27 | These two conditions are selected based on empirical observations.
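As a concrete illustration, here is a minimal, self-contained sketch of the decision rule described above. It is not the actual biastools implementation: the function names are illustrative, the thresholds mirror the defaults stated here, and the "gap" is taken to be the length difference between the two alleles.

```python
def edit_distance(a: str, b: str) -> int:
    """Plain Levenshtein edit distance via dynamic programming."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i] + [0] * len(b)
        for j, cb in enumerate(b, 1):
            curr[j] = min(prev[j] + 1,                  # deletion
                          curr[j - 1] + 1,              # insertion
                          prev[j - 1] + (ca != cb))     # match / substitution
        prev = curr
    return prev[-1]


def assign_by_edit_distance(read_seq, hap1_seq, hap2_seq, max_ed=5, max_gap=20):
    """Assign a read labeled 'others' by the first two stages to hap1 or hap2."""
    gap_len = abs(len(hap1_seq) - len(hap2_seq))
    if gap_len >= max_gap:                              # condition 2: gap shorter than 20 bp
        return 'others'
    ed1, ed2 = edit_distance(read_seq, hap1_seq), edit_distance(read_seq, hap2_seq)
    norm1, norm2 = ed1 / len(hap1_seq), ed2 / len(hap2_seq)   # normalize by each allele's length
    if norm1 == norm2:
        return 'others'
    winner, winner_ed = ('hap1', ed1) if norm1 < norm2 else ('hap2', ed2)
    if winner_ed > max_ed:                              # condition 1: edit distance <= 5
        return 'others'
    return winner


# Toy example: the read is one edit away from hap1 but far from hap2,
# so it is rescued from 'others' and assigned to hap1.
print(assign_by_edit_distance("ACGTTACGA", "ACGTTACGT", "ACGT"))   # -> 'hap1'
```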
28 | -------------------------------------------------------------------------------- /biastools/biastools_analysis.sh: -------------------------------------------------------------------------------- 1 | path_ref=$1 2 | path_vcf=$2 3 | path_out=$3 4 | sample_id=$4 5 | THR=$5 6 | run_id=$6 7 | flag_real=$7 8 | flag_naive=$8 9 | boundary=$9 10 | path_module=${10} 11 | prefix=${path_out}/${sample_id} 12 | bam_file=${11} 13 | 14 | 15 | echo "[Biastools] Intersect the bam file and vcf file" 16 | if [ ! -f ${prefix}.het.vcf.gz ]; then 17 | bcftools norm -f ${path_ref} ${path_vcf} -m +any -Oz -o ${prefix}.normalized.vcf.gz 18 | bcftools index ${prefix}.normalized.vcf.gz 19 | python3 ${path_module}filter_het_VCF.py -v ${prefix}.normalized.vcf.gz -o ${prefix}.het.vcf.gz 20 | tabix -p vcf ${prefix}.het.vcf.gz 21 | fi 22 | if [ ! -f ${prefix}.${run_id}.sorted.het.bam ]; then 23 | python3 ${path_module}vcf_to_bed.py -v ${prefix}.het.vcf.gz -o ${prefix}.het.bed 24 | samtools view -h -L ${prefix}.het.bed ${bam_file} -@ ${THR} | samtools sort -@ ${THR} > ${prefix}.${run_id}.sorted.het.bam 25 | samtools index ${prefix}.${run_id}.sorted.het.bam 26 | fi 27 | 28 | 29 | echo "[Biastools] Reference bias analysis" 30 | if [[ ${flag_naive} == 1 ]]; then 31 | assign_method=${path_module}"ref_bi_naive.py" 32 | else 33 | assign_method=${path_module}"ref_bi_context.py" 34 | fi 35 | 36 | mkdir -p ${path_out}/${run_id}"_report" 37 | r_prefix=${path_out}/${run_id}"_report"/${sample_id} 38 | if [[ ${flag_real} == 1 ]]; then 39 | python3 ${assign_method} -s ${prefix}.${run_id}.sorted.het.bam \ 40 | -v ${prefix}.het.vcf.gz \ 41 | -f ${path_ref} \ 42 | -p ${prefix}.golden.rpt.pickle \ 43 | -o ${r_prefix}.${run_id}.real.bias \ 44 | --real 45 | # indel balance plot 46 | python3 ${path_module}indel_balance_plot.py -lr ${r_prefix}.${run_id}.real.bias.all \ 47 | -ln ${run_id} \ 48 | -vcf ${prefix}.het.vcf.gz \ 49 | -bd ${boundary} \ 50 | -map \ 51 | -out ${r_prefix}.${run_id}.real \ 52 | -real 53 | else 54 | python3 ${assign_method} -s ${prefix}.${run_id}.sorted.het.bam \ 55 | -v ${prefix}.het.vcf.gz \ 56 | -f ${path_ref} \ 57 | -p ${prefix}.golden.rpt.pickle \ 58 | -o ${r_prefix}.${run_id}.sim.bias 59 | 60 | # report the bias categories and report 61 | python3 ${path_module}golden_graph_report.py -mb ${r_prefix}.${run_id}.sim.bias.snp -out ${r_prefix}.${run_id}.snp 62 | python3 ${path_module}golden_graph_report.py -mb ${r_prefix}.${run_id}.sim.bias.gap -out ${r_prefix}.${run_id}.gap 63 | # plot the measures with NMB and NAB 64 | python3 ${path_module}golden_graph.py -mb ${r_prefix}.${run_id}.sim.bias.snp -out ${r_prefix}.${run_id}.snp 65 | python3 ${path_module}golden_graph.py -mb ${r_prefix}.${run_id}.sim.bias.gap -out ${r_prefix}.${run_id}.gap 66 | # indel balance plot 67 | python3 ${path_module}indel_balance_plot.py -lr ${r_prefix}.${run_id}.sim.bias.all \ 68 | -ln ${run_id} \ 69 | -vcf ${prefix}.het.vcf.gz \ 70 | -bd ${boundary} \ 71 | -map \ 72 | -out ${r_prefix}.${run_id}.sim 73 | fi 74 | -------------------------------------------------------------------------------- /biastools/compare_bias_with_RD.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | 4 | 5 | def read_list_bed(list_bed): 6 | dict_chr_bias = {} 7 | for fn_bed in list_bed: 8 | f = open(fn_bed) 9 | f.readline() 10 | for line in f: 11 | fields = line.split() 12 | contig = fields[0] 13 | start = int(fields[1]) 14 | stop = int(fields[2]) 15 | if dict_chr_bias.get(contig): 16 | 
dict_chr_bias[contig].append((start, stop)) 17 | else: 18 | dict_chr_bias[contig] = [(start, stop)] 19 | return dict_chr_bias 20 | 21 | 22 | def compare_bias_regions(dict_target, dict_improve, dict_lowRd, out_file): 23 | assert sorted(dict_target.keys()) == sorted(dict_improve.keys()), "discrepancy on the reference of the two lists" 24 | f_o = open(out_file, 'w') 25 | f_o.write('#chrom\tchromStart\tchromEnd\tname(%;initial;improve;lowRd)\n') 26 | 27 | total_100 = [] 28 | total_75 = [] 29 | total_50 = [] 30 | total_25 = [] 31 | total_under_25 = [] 32 | for contig in sorted(dict_target.keys()): 33 | local_100 = [] 34 | local_75 = [] 35 | local_50 = [] 36 | local_25 = [] 37 | local_under_25 = [] 38 | 39 | region_target = dict_target [contig] 40 | region_improve = dict_improve[contig] 41 | region_lowRd = dict_lowRd [contig] 42 | idx_2 = 0 43 | idx_3 = 0 44 | for region in region_target: 45 | start_1, stop_1 = region 46 | #if stop_1 - start_1 < 1000: 47 | # continue 48 | contain_region_2 = [] 49 | contain_lowRd = [] 50 | for idx in range(idx_2, len(region_improve)): 51 | start_2, stop_2 = region_improve[idx] 52 | if stop_2 < start_1: 53 | continue 54 | elif start_2 < stop_1: 55 | contain_region_2.append(region_improve[idx]) 56 | else: 57 | idx_2 = idx-1 58 | break 59 | for idx in range(idx_3, len(region_lowRd)): 60 | start_3, stop_3 = region_lowRd[idx] 61 | if stop_3 < start_1: 62 | continue 63 | elif start_3 < stop_1: 64 | contain_lowRd.append(region_lowRd[idx]) 65 | else: 66 | idx_3 = idx-1 67 | break 68 | 69 | len_region_1 = stop_1 - start_1 70 | len_region_2 = sum([ele[1]-ele[0] for ele in contain_region_2]) 71 | len_region_3 = sum([ele[1]-ele[0] for ele in contain_lowRd]) 72 | improve_len = len_region_1 - len_region_2 - len_region_3 73 | if improve_len == len_region_1: 74 | local_100.append(region) 75 | f_o.write(contig + '\t' + str(start_1) + '\t' + str(stop_1) + '\t' + '100;' + str(len_region_1) + ';' + str(len_region_2) + ';' + str(len_region_3) + '\n') 76 | elif improve_len >= len_region_1*0.75: 77 | local_75.append(region) 78 | f_o.write(contig + '\t' + str(start_1) + '\t' + str(stop_1) + '\t' + '75;' + str(len_region_1) + ';' + str(len_region_2) + ';' + str(len_region_3) + '\n') 79 | elif improve_len >= len_region_1*0.5: 80 | local_50.append(region) 81 | f_o.write(contig + '\t' + str(start_1) + '\t' + str(stop_1) + '\t' + '50;' + str(len_region_1) + ';' + str(len_region_2) + ';' + str(len_region_3) + '\n') 82 | elif improve_len >= len_region_1*0.25: 83 | local_25.append(region) 84 | f_o.write(contig + '\t' + str(start_1) + '\t' + str(stop_1) + '\t' + '25;' + str(len_region_1) + ';' + str(len_region_2) + ';' + str(len_region_3) + '\n') 85 | else: 86 | local_under_25.append(region) 87 | total_100 += local_100 88 | total_75 += local_75 89 | total_50 += local_50 90 | total_25 += local_25 91 | total_under_25 += local_under_25 92 | print(contig, len(local_100), len(local_75), len(local_50), len(local_25), len(local_under_25)) 93 | f_o.close() 94 | len_total = sum([len(total_100), len(total_75), len(total_50), len(local_25), len(total_under_25)]) 95 | print(len(total_100), round(len(total_100)/len_total,3), \ 96 | len(total_75), round(len(total_75)/len_total,3), \ 97 | len(total_50), round(len(total_50)/len_total,3), \ 98 | len(total_25), round(len(total_25)/len_total,3), \ 99 | len(total_under_25), round(len(total_under_25)/len_total,3)) 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | if __name__ == "__main__": 108 | parser = argparse.ArgumentParser() 109 | parser.add_argument('-lt', 
'--list_target', nargs='+', required=True, help='the first list of scanning bias bed report') 110 | parser.add_argument('-li', '--list_improve', nargs='+', required=True, help='the second list of scanning bias bed report, the region should contain in list 1') 111 | parser.add_argument('-lrd', '--list_lowRd', nargs='+', required=True, help='the second list of scanning bias bed report, the region should contain in list 1') 112 | parser.add_argument('-out', '--output_improve', help="output the improve regions") 113 | args = parser.parse_args() 114 | 115 | list_target = args.list_target 116 | list_improve = args.list_improve 117 | list_lowRd = args.list_lowRd 118 | out_file = args.output_improve 119 | 120 | dict_target = read_list_bed(list_target) 121 | dict_improve = read_list_bed(list_improve) 122 | dict_lowRd = read_list_bed(list_lowRd) 123 | 124 | compare_bias_regions(dict_target, dict_improve, dict_lowRd, out_file) 125 | 126 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | _Updated: Apr 17, 2025_ 3 | # Biastools: Measuring, visualizing and diagnosing reference bias 4 | 5 | This github is originally forked from https://github.com/sheila12345/biastools 6 | 7 | ## Prerequisite programs 8 | - samtools=v1.11 9 | - bcftools=v1.9 10 | - bedtools=v2.30.0 11 | - gzip=v1.9 12 | - tabix=v1.9 13 | - bowtie2=v2.4.2 14 | - bwa=v0.7.17 15 | - mason_simulator=v2.0.9 (only for biastools --simulate) 16 | - SeqAn=v2.4.0 (only for biastools --simulate) 17 | 18 | 19 | ## Installation 20 | - [pip](https://pypi.org/project/biastools/) 21 | ``` 22 | pip install biastools 23 | ``` 24 | - [Github](https://github.com/maojanlin/biastools.git) 25 | ``` 26 | git clone https://github.com/maojanlin/biastools.git 27 | cd biastools 28 | ``` 29 | Though optional, it is a good practice to install a virtual environment to manage the dependancies: 30 | 31 | ``` 32 | python -m venv venv 33 | source venv/bin/activate 34 | ``` 35 | Now a virtual environment (named venv) is activated. Install biastools: 36 | 37 | ``` 38 | python setup.py install 39 | ``` 40 | 41 | 42 | ## Usage 43 | 44 | ### Simulation, plotting, and analysis 45 | ``` 46 | $ biastools --simulate --align --analyze -o -g -v -s -r 47 | ``` 48 | 49 | With the example command, biastools 50 | 1. Simulates reads based on `` and ``, generating pair-end `.fq.gz` files for both haplotypes (`work_dir/sample_name.hap{A,B}_{1,2}.fq.gz`). 51 | 2. Aligns the reads to the reference ``, generating a BAM file with phasing information (`work_dir/sample_name.run_id.sorted.bam`). 52 | 3. Analyzes the BAM file with the context-aware assignment method, generating bias reports and plots. 53 | 54 | #### Other aligners 55 | Biastools supports [Bowtie 2](https://github.com/BenLangmead/bowtie2) and [bwa mem](https://github.com/lh3/bwa) aligners. BAM files from other aligners (named with `` and tagged with haplotype information) can be analyzed with 56 | 57 | ``` 58 | $ biastools --analyze -o -g -v -s -r 59 | ``` 60 | 61 | #### Direct Analysis on Real sequence data 62 | Biastools can also analyze real sequence data with the `--real` option using the context-aware assignment algorithm. The resulting plot does not include simulation information (`sample_id.real.indel_balance.pdf`). 
63 | ``` 64 | $ biastools --analyze --real -t -o -g -v -s -r \ 65 | --bam 66 | ``` 67 | Biastools first fetches the relevant alignments from the target BAM file, focusing only on heterozygous variant sites specified in the VCF file. These sites are then analyzed using a [context-aware algorithm](figures/context_aware.md). Finally, Biastools generates a bias report along with a bias-by-allele-length plot, both included in the output folder. 68 | 69 | 70 | #### Combined Bias-by-allele-length plot 71 | Multiple analysis results can be combined into a single Bias-by-allele-length plot. In biastools version 0.3.1, the default plotting module displays the 25th percentile, mean, and 75th percentile of the fraction of ALT alleles for variants stratified by allele length, using ticks to indicate the interquartile range and a central dot to mark the mean. 72 | 73 | ``` 74 | $ biastools --analyze -o -g -v -s -r \ 75 | -lr file1.bias.all file2.bias.all file3.bias.all... \ 76 | -ld run_id1 run_id2 run_id3... 77 | ``` 78 | 79 | The output file `sample_name.combine.sim.indel_balance.pdf` plots the fraction of ALT alleles merged from the bias reports specified after the `-lr` option. Users can use the `-ld` option to specify the tool names, which will appear in the legend. To generate a combined plot using only real data bias reports (excluding simulation information), use the `--real` option. 80 | 81 | An example of a combined bias-by-allele-length plot: 82 | ![multiple_indel_plot](figures/HG002.GIAB.4.2.1.demo.indel_balance.png?raw=true "multiple_indel_plot") 83 | 84 | 85 | ### Bias prediction from bias report 86 | #### Real data 87 | Biastools can predict whether a variant is biased or not by running: 88 | 89 | ``` 90 | $ biastools --predict -o -g -v -s -r -pr 91 | ``` 92 | 93 | With the example command, biastools 94 | 4. Generates two files: `sample_name.real.pd_id_bias.tsv` and `sample_name.real.pd_id_suspicious.tsv`. The `bias.tsv` report contains all sites predicted to be biased by the model. The `suspicious.tsv` file contains the sites suspected of lacking sufficient information in the VCF file; in other words, the reads aligned to the site show a pattern different from the haplotype indicated by the VCF file. 95 | 96 | #### Simulation-guided prediction 97 | 98 | ``` 99 | $ biastools --predict -o -g -v -s -r \ 100 | -pr \ 101 | -ps 102 | ``` 103 | 104 | If a bias report for the sample based on simulated data is provided, biastools can generate a cross-prediction experiment result. In the experiment, the ground truth bias sites are based on simulation data. 105 | 106 | ### Scanning bias without VCF information 107 | #### Scanning 108 | ``` 109 | $ biastools_scan --scan -o -g -s -r -i 110 | ``` 111 | 112 | Biastools transforms the `` into the mpileup format and generates biased and suspicious regions (`sample_name.run_id.bias.bed` and `sample_name.run_id.suspicious.bed`). 113 | 114 | 115 | #### Compare two bam files with common baseline 116 | ``` 117 | $ biastools_scan --compare_bam -o -g -s -r \ 118 | -i \ 119 | -i2 \ 120 | -m \ 121 | -m2 122 | ``` 123 | Biastools generates a common baseline from `path_to_target.bam` and `path_to_second.bam`, and uses the new common baseline to recalculate the bias regions based on the two mpileup files. The mpileup files can be generated by running **scanning** first, or directly with **bcftools mpileup**.
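If the mpileup files have not been produced by a previous **scanning** run, they can be generated with the same `bcftools mpileup` settings biastools uses internally (see `sample_baseline.py`). The sketch below is illustrative only; the file names are placeholders and `bcftools` is assumed to be on the PATH.

```python
# Sketch (placeholder file names): build one mpileup per BAM with the same
# bcftools settings biastools uses internally, so the two runs stay comparable.
import subprocess

ref = "grch38.fa"   # the reference both BAMs were aligned to (placeholder)
for bam, out in [("target.bam", "target.mpileup"), ("second.bam", "second.mpileup")]:
    subprocess.run(
        ["bcftools", "mpileup",
         "--count-orphans",
         "--annotate", "FORMAT/AD,FORMAT/DP",
         "-f", ref,
         "--min-BQ", "0",
         "--min-MQ", "0",
         bam, "-o", out],
        check=True)
```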
124 | 125 | 126 | 127 | #### Directly compare two bias reports 128 | User can also generate the comparison of the bias reports without a common baseline (not recommended): 129 | ``` 130 | $ biastools_scan --compare_rpt -o -s -r \ 131 | -b1 \ 132 | -b2 \ 133 | -l2 134 | ``` 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /biastools/sample_baseline.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import argparse 3 | import random 4 | import numpy as np 5 | import os 6 | from subprocess import call 7 | 8 | from scanning_bias import scanning_bias, calculate_measures 9 | 10 | 11 | def baseline( 12 | f_mpileup :pysam.VariantRecord, 13 | fn_sample :str, 14 | window_size :int 15 | ) -> tuple: 16 | """ 17 | Take in the sample mpileup, and output the average read_depth/variant Density/non Diploid portion 18 | """ 19 | dict_ref_info = scanning_bias(f_gvcf=f_mpileup) 20 | dict_3D_measures = calculate_measures( 21 | dict_ref_info=dict_ref_info, 22 | window_size=window_size 23 | ) 24 | 25 | total_read_depth = np.array([]) 26 | total_var_density = np.array([]) 27 | total_dip_density = np.array([]) 28 | 29 | fo = open(fn_sample + '.baseline', 'w') 30 | fo.write('#chr pos segment_len RD_mean RD_std VD_mean VD_std ND_mean ND_std\n') 31 | for ref_name, dict_array in dict_3D_measures.items(): 32 | for start_pos, array_info in dict_array.items(): 33 | array_read_depth, array_var_density, array_dip_density = array_info 34 | 35 | avg_read_depth = np.mean(array_read_depth) 36 | std_read_depth = np.std(array_read_depth) 37 | #positive_avg_var = np.mean(array_var_density) 38 | #positive_std_var = np.std(array_var_density) 39 | #positive_avg_dip = np.mean(array_dip_density) 40 | #positive_std_dip = np.std(array_dip_density) 41 | 42 | fo.write(ref_name + ' ' + str(start_pos) + ' ' + str(len(array_read_depth)) + ' ') 43 | fo.write(str(round(avg_read_depth,2)) + ' ' + str(round(std_read_depth,2)) + ' ') 44 | #positive_var = array_var_density[array_var_density != 0] 45 | positive_var = array_var_density 46 | if len(positive_var) > 0: 47 | positive_avg_var = np.mean(positive_var) 48 | positive_std_var = np.std(positive_var) 49 | fo.write(str(round(positive_avg_var,2)) + ' ' + str(round(positive_std_var,2)) + ' ') 50 | 51 | #positive_dip = array_dip_density[array_var_density != 0] 52 | positive_dip = array_dip_density 53 | if len(positive_dip) > 0: 54 | positive_avg_dip = np.mean(positive_dip) 55 | positive_std_dip = np.std(positive_dip) 56 | fo.write(str(round(positive_avg_dip,2)) + ' ' + str( round(positive_std_dip, 2)) + '\n') 57 | else: 58 | fo.write('\n') 59 | 60 | total_read_depth = np.concatenate((total_read_depth , array_read_depth)) 61 | total_var_density = np.concatenate((total_var_density, positive_var)) 62 | total_dip_density = np.concatenate((total_dip_density, positive_dip)) 63 | #total_var_density = np.concatenate((total_var_density, array_var_density)) 64 | #total_dip_density = np.concatenate((total_dip_density, array_dip_density)) 65 | 66 | fo.write('#total sample len: ' + str(len(total_read_depth)) + '\n') 67 | fo.write('#total_statistics:\n') 68 | fo.write('#chr pos segment_len RD_mean RD_std VD_mean VD_std ND_mean ND_std\n# ') 69 | fo.write(str(round(np.mean(total_read_depth),5)) + ' ' + str(round(np.std(total_read_depth),5)) + ' ') 70 | fo.write(str(round(np.mean(total_var_density),5)) + ' ' + str(round(np.std(total_var_density),5)) + ' ') 71 | 
fo.write(str(round(np.mean(total_dip_density),5)) + ' ' + str(round(np.std(total_dip_density),5))) 72 | fo.close() 73 | return np.mean(total_read_depth), np.mean(total_var_density), np.mean(total_dip_density) 74 | 75 | 76 | def sample_select( 77 | fn_sample :str, 78 | seed :int, 79 | min_len :int, 80 | f_bam :pysam.AlignmentFile 81 | ): 82 | """ 83 | Take out the contig length greater than min_len (threshold_contig) 84 | For each contig, takes 100 segments totally equal to 1/1000 of the contig length 85 | """ 86 | random.seed(seed) 87 | fo = open(fn_sample + '.bed', 'w') 88 | write_flag = False 89 | for idx, name in enumerate(f_bam.header.references): 90 | contig_len = f_bam.header.lengths[idx] 91 | if contig_len > min_len: 92 | write_flag = True 93 | thousandth = int(contig_len / 100000) 94 | list_sample_start = random.sample(range(100000 - 1), 100) 95 | for sample_start in sorted(list_sample_start): 96 | fo.write(name + ' ' + str(sample_start*thousandth) + ' ' + str(sample_start*thousandth+thousandth) + '\n') 97 | elif write_flag == False: 98 | fo.write(name + ' 1 ' + str(contig_len) + '\n') 99 | write_flag = True 100 | fo.close() 101 | 102 | 103 | 104 | 105 | 106 | if __name__ == "__main__": 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument('-b', '--bam_file', help='the bam file we want to sample') 109 | parser.add_argument('-f', '--reference_fasta', help='the reference fasta file for mpileup building') 110 | parser.add_argument('-o', '--sample_bed', help='the sampled 1/1000 bed file') 111 | parser.add_argument('-w', '--window_size', help='window size for average depth, density analysis', type=int, default=400) 112 | parser.add_argument('-th', '--threshold_contig', help='the minimum contig length for sampling', type=int, default=10000000) 113 | parser.add_argument('--seed', help='seed for random sampling', type=int, default=0) 114 | parser.add_argument('-k', '--kill', help='kill all storage files', action='store_true') 115 | args = parser.parse_args() 116 | 117 | fn_bam = args.bam_file 118 | fn_ref = args.reference_fasta 119 | fn_sample = args.sample_bed 120 | min_len = args.threshold_contig 121 | window_size = args.window_size 122 | seed = args.seed 123 | kill_flag = args.kill 124 | 125 | f_bam = pysam.AlignmentFile(fn_bam) 126 | 127 | # sample bed file according to the bam file information 128 | sample_select(fn_sample, seed, min_len, f_bam) 129 | 130 | 131 | # SAMTOOLS command for extract the sample region bam file 132 | if os.path.exists(fn_sample + '.bam') and not kill_flag: 133 | print(fn_sample + '.bam already exist.') 134 | else: 135 | command = ('samtools view -h ' + fn_bam + ' -L ' + fn_sample + '.bed -o ' + fn_sample + '.bam') 136 | print(command) 137 | call(command, shell=True) 138 | 139 | # BCFTOOLS command for mpileup the bam file 140 | if os.path.exists(fn_sample + '.mpileup') and not kill_flag: 141 | print(fn_sample + '.mpileup already exist.') 142 | else: 143 | command = ('bcftools mpileup --count-orphans --annotate FORMAT/AD,FORMAT/DP -f ' + fn_ref + ' --min-BQ 0 --min-MQ 0 ' \ 144 | + fn_sample + '.bam -o ' + fn_sample + '.mpileup') 145 | print(command) 146 | call(command, shell=True) 147 | 148 | f_mpileup = pysam.VariantFile(fn_sample + '.mpileup') 149 | baseline(f_mpileup, fn_sample, window_size) 150 | 151 | 152 | -------------------------------------------------------------------------------- /biastools/merge_baseline.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import argparse 3 | import 
random 4 | import numpy as np 5 | import os 6 | from subprocess import call 7 | 8 | from scanning_bias import scanning_bias, calculate_measures 9 | from sample_baseline import sample_select 10 | import pickle 11 | 12 | 13 | def baseline( 14 | f_mpileup_1 :pysam.VariantRecord, 15 | f_mpileup_2 :pysam.VariantRecord, 16 | fn_sample :str, 17 | window_size :int 18 | ) -> tuple: 19 | """ 20 | Take in the sample mpileup, and output the average read_depth/variant Density/non Diploid portion 21 | """ 22 | dict_ref_info_1 = scanning_bias(f_gvcf=f_mpileup_1) 23 | dict_3D_measures_1 = calculate_measures( 24 | dict_ref_info=dict_ref_info_1, 25 | window_size=window_size 26 | ) 27 | dict_ref_info_2 = scanning_bias(f_gvcf=f_mpileup_2) 28 | dict_3D_measures_2 = calculate_measures( 29 | dict_ref_info=dict_ref_info_2, 30 | window_size=window_size 31 | ) 32 | 33 | total_read_depth = np.array([]) 34 | total_var_density = np.array([]) 35 | total_dip_density = np.array([]) 36 | 37 | fo = open(fn_sample + '.baseline', 'w') 38 | fo.write('#chr pos segment_len RD_mean RD_std VD_mean VD_std ND_mean ND_std\n') 39 | for ref_name, dict_array in dict_3D_measures_1.items(): 40 | for start_pos, array_info in dict_array.items(): 41 | array_read_depth, array_var_density, array_dip_density = array_info 42 | 43 | avg_read_depth = np.mean(array_read_depth) 44 | std_read_depth = np.std(array_read_depth) 45 | 46 | fo.write(ref_name + ' ' + str(start_pos) + ' ' + str(len(array_read_depth)) + ' ') 47 | fo.write(str(round(avg_read_depth,2)) + ' ' + str(round(std_read_depth,2)) + ' ') 48 | #positive_var = array_var_density[array_var_density != 0] 49 | positive_var = array_var_density 50 | if len(positive_var) > 0: 51 | positive_avg_var = np.mean(positive_var) 52 | positive_std_var = np.std(positive_var) 53 | fo.write(str(round(positive_avg_var,2)) + ' ' + str(round(positive_std_var,2)) + ' ') 54 | 55 | #positive_dip = array_dip_density[array_var_density != 0] 56 | positive_dip = array_dip_density 57 | if len(positive_dip) > 0: 58 | positive_avg_dip = np.mean(positive_dip) 59 | positive_std_dip = np.std(positive_dip) 60 | fo.write(str(round(positive_avg_dip,2)) + ' ' + str( round(positive_std_dip, 2)) + '\n') 61 | else: 62 | fo.write('\n') 63 | 64 | total_read_depth = np.concatenate((total_read_depth , array_read_depth)) 65 | total_var_density = np.concatenate((total_var_density, positive_var)) 66 | total_dip_density = np.concatenate((total_dip_density, positive_dip)) 67 | for ref_name, dict_array in dict_3D_measures_2.items(): 68 | for start_pos, array_info in dict_array.items(): 69 | array_read_depth, array_var_density, array_dip_density = array_info 70 | 71 | avg_read_depth = np.mean(array_read_depth) 72 | std_read_depth = np.std(array_read_depth) 73 | 74 | fo.write(ref_name + ' ' + str(start_pos) + ' ' + str(len(array_read_depth)) + ' ') 75 | fo.write(str(round(avg_read_depth,2)) + ' ' + str(round(std_read_depth,2)) + ' ') 76 | #positive_var = array_var_density[array_var_density != 0] 77 | positive_var = array_var_density 78 | if len(positive_var) > 0: 79 | positive_avg_var = np.mean(positive_var) 80 | positive_std_var = np.std(positive_var) 81 | fo.write(str(round(positive_avg_var,2)) + ' ' + str(round(positive_std_var,2)) + ' ') 82 | 83 | #positive_dip = array_dip_density[array_var_density != 0] 84 | positive_dip = array_dip_density 85 | if len(positive_dip) > 0: 86 | positive_avg_dip = np.mean(positive_dip) 87 | positive_std_dip = np.std(positive_dip) 88 | fo.write(str(round(positive_avg_dip,2)) + ' ' + str( 
round(positive_std_dip, 2)) + '\n') 89 | else: 90 | fo.write('\n') 91 | 92 | total_read_depth = np.concatenate((total_read_depth , array_read_depth)) 93 | total_var_density = np.concatenate((total_var_density, positive_var)) 94 | total_dip_density = np.concatenate((total_dip_density, positive_dip)) 95 | 96 | fo.write('#total sample len: ' + str(len(total_read_depth)) + '\n') 97 | fo.write('#total_statistics:\n') 98 | fo.write('#chr pos segment_len RD_mean RD_std VD_mean VD_std ND_mean ND_std\n# ') 99 | fo.write(str(round(np.mean(total_read_depth),5)) + ' ' + str(round(np.std(total_read_depth),5)) + ' ') 100 | fo.write(str(round(np.mean(total_var_density),5)) + ' ' + str(round(np.std(total_var_density),5)) + ' ') 101 | fo.write(str(round(np.mean(total_dip_density),5)) + ' ' + str(round(np.std(total_dip_density),5))) 102 | fo.close() 103 | print("[Biastools] Generate " + fn_sample + '.baseline') 104 | return np.mean(total_read_depth), np.mean(total_var_density), np.mean(total_dip_density) 105 | 106 | 107 | 108 | 109 | 110 | if __name__ == "__main__": 111 | parser = argparse.ArgumentParser() 112 | parser.add_argument('-b1', '--bam_file_1', help='the bam file we want to sample') 113 | parser.add_argument('-b2', '--bam_file_2', help='the bam file we want to sample') 114 | parser.add_argument('-f', '--reference_fasta', help='the reference fasta file for mpileup building') 115 | parser.add_argument('-o', '--sample_bed', help='the sampled 1/1000 bed file') 116 | parser.add_argument('-w', '--window_size', help='window size for average depth, density analysis', type=int, default=400) 117 | parser.add_argument('-th', '--threshold_contig', help='the minimum contig length for sampling', type=int, default=10000000) 118 | parser.add_argument('--seed', help='seed for random sampling', type=int, default=0) 119 | args = parser.parse_args() 120 | 121 | fn_bam_1 = args.bam_file_1 122 | fn_bam_2 = args.bam_file_2 123 | fn_ref = args.reference_fasta 124 | fn_sample = args.sample_bed 125 | min_len = args.threshold_contig 126 | window_size = args.window_size 127 | seed = args.seed 128 | 129 | # sample bed file according to the bam file information 130 | f_bam = pysam.AlignmentFile(fn_bam_1) 131 | sample_select(fn_sample, seed, min_len, f_bam) 132 | 133 | 134 | # SAMTOOLS command for extract the sample region bam file 135 | command = ('samtools view -h ' + fn_bam_1 + ' -L ' + fn_sample + '.bed -o ' + fn_bam_1 + '.sample.bam') 136 | print(command) 137 | call(command, shell=True) 138 | command = ('samtools view -h ' + fn_bam_2 + ' -L ' + fn_sample + '.bed -o ' + fn_bam_2 + '.sample.bam') 139 | print(command) 140 | call(command, shell=True) 141 | 142 | # BCFTOOLS command for mpileup the bam file 143 | command = ('bcftools mpileup --count-orphans --annotate FORMAT/AD,FORMAT/DP -f ' + fn_ref + ' --min-BQ 0 --min-MQ 0 ' \ 144 | + fn_bam_1 + '.sample.bam -o ' + fn_bam_1 + '.sample.mpileup') 145 | print(command) 146 | call(command, shell=True) 147 | command = ('bcftools mpileup --count-orphans --annotate FORMAT/AD,FORMAT/DP -f ' + fn_ref + ' --min-BQ 0 --min-MQ 0 ' \ 148 | + fn_bam_2 + '.sample.bam -o ' + fn_bam_2 + '.sample.mpileup') 149 | print(command) 150 | call(command, shell=True) 151 | 152 | f_mpileup_1 = pysam.VariantFile(fn_bam_1 + '.sample.mpileup') 153 | f_mpileup_2 = pysam.VariantFile(fn_bam_2 + '.sample.mpileup') 154 | 155 | print('[Biastools] Generate sample baseline') 156 | baseline(f_mpileup_1, f_mpileup_2, fn_sample, window_size) 157 | 158 | 159 | 160 | 
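For reference, a small sketch of driving the module above programmatically, mirroring its own `__main__` block. The file names are placeholders, and it assumes the code is run from the `biastools/` source directory so the sibling modules (`scanning_bias`, `sample_baseline`) resolve.

```python
# Illustrative sketch (placeholder file names): compute a common baseline from two
# sampled mpileups, mirroring the __main__ block of merge_baseline.py above.
import pysam
from merge_baseline import baseline

f_mpileup_1 = pysam.VariantFile("first.sample.mpileup")
f_mpileup_2 = pysam.VariantFile("second.sample.mpileup")

# Writes "common.baseline" and returns the overall means of read depth,
# variant density, and non-diploid density across both samples.
mean_rd, mean_vd, mean_nd = baseline(f_mpileup_1, f_mpileup_2, "common", window_size=400)
print(mean_rd, mean_vd, mean_nd)
```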
-------------------------------------------------------------------------------- /biastools/biastools_scan.py: -------------------------------------------------------------------------------- 1 | # Wrap up python file for the biastools 3rd module 2 | import subprocess 3 | import sys 4 | import os 5 | import argparse 6 | from biastools.biastools import check_program_install, catch_assert 7 | 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-o', '--out', help="Path to output directory ['out_dir'].", default="out_dir") 12 | parser.add_argument('-g', '--genome', help="Path to the reference genome.") 13 | parser.add_argument('-i', '--bam', help="Path to the alignment bam file, should be SORTED.") 14 | parser.add_argument('-s', '--sample_id', help="Sample ID ['sample'].", default="sample") 15 | parser.add_argument('-r', '--run_id', help="Run ID ['run'].", default="run") 16 | # Process options 17 | parser.add_argument('--scan', help='[1] Option to scan and report bias region.', action='store_true') 18 | parser.add_argument('--compare_bam', help='[2] Option to generate common baseline and compare.', action='store_true') 19 | parser.add_argument('--compare_rpt', help='[3] Option to directly compare two bias report.', action='store_true') 20 | 21 | parser.add_argument('-t', '--thread', help="Number of threads to use [max].", type=int) 22 | parser.add_argument('--force', help="running the program without checking prerequisite programs.", action='store_true') 23 | # [1] 24 | parser.add_argument('-w', '--wig', help="Generate the wig files for the three measures, VERY SLOW [False]", action='store_true') 25 | parser.add_argument('-R', '--range', help="The range in the bam file targeted for analysis.") 26 | # [2] 27 | parser.add_argument('-i2', '--bam2', help="Path to the second alignment bam file want to compare, should be SORTED.") 28 | parser.add_argument('-m', '--mpileup', help="Path to the mpileup file of the first bam file.") 29 | parser.add_argument('-m2', '--mpileup2', help="Path to the mpileup file of the second bam file.") 30 | # [3] 31 | parser.add_argument('-b1', '--bed1', help="Path to the first bed file for comparison.") 32 | parser.add_argument('-b2', '--bed2', help="Path to the second bed file for comparison.") 33 | parser.add_argument('-l2', '--lowRd2', help="Path to the .lowRd.bed report of the second file.") 34 | args = parser.parse_args() 35 | 36 | path_output = args.out 37 | path_ref = args.genome 38 | bam_file = args.bam 39 | sample_id = args.sample_id 40 | run_id = args.run_id 41 | 42 | flag_scan = args.scan 43 | flag_compare_bam = args.compare_bam 44 | flag_compare_rpt = args.compare_rpt 45 | try: 46 | assert flag_scan + flag_compare_bam + flag_compare_rpt >= 1 47 | except AssertionError: 48 | catch_assert(parser, "At least one of the --scan/compare_bam/compare_rpt option should be specified.") 49 | 50 | flag_force = args.force 51 | thread = args.thread 52 | if thread == None: 53 | if sys.platform == "darwin": 54 | result = subprocess.run(["sysctl -n hw.ncpu"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True) 55 | else: 56 | result = subprocess.run(["nproc"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True) 57 | thread = int(result.stdout.strip()) 58 | flag_wig = args.wig 59 | Range = args.range 60 | bam_file2 = args.bam2 61 | mpileup_file = args.mpileup 62 | mpileup_file2 = args.mpileup2 63 | bed_file1 = args.bed1 64 | bed_file2 = args.bed2 65 | lowRd_file2 = args.lowRd2 66 | 67 | 68 | # Checking 
prerequisite programs are installed 69 | if flag_force != True: 70 | check_program_install(["bedtools", \ 71 | "samtools", \ 72 | "bcftools"]) 73 | 74 | # Start running 75 | command = "mkdir -p " + path_output 76 | subprocess.call(command, shell=True) 77 | prefix = path_output + '/' + sample_id 78 | path_module = os.path.dirname(__file__) + '/' 79 | if flag_scan: 80 | print("[Biastools] Scanning...") 81 | if os.path.exists(bam_file+'.bai'): 82 | pass 83 | else: 84 | command = ["samtools", "index", bam_file] 85 | subprocess.call(command) 86 | 87 | print("[BIASTOOLS] SAMPLE", bam_file, " as ", sample_id + ".baseline ...") 88 | command = ["python3", path_module+"sample_baseline.py", "-b", bam_file, "-f", path_ref, "-o", prefix+".sample"] 89 | print(' '.join(command)) 90 | subprocess.call(command) 91 | 92 | if Range == None: 93 | print("[BIASTOOLS] Process the whole bam file...") 94 | target_bam = bam_file 95 | else: 96 | print("[BIASTOOLS] Extract reads from " + Range + "...") 97 | target_bam = prefix + '.range.bam' 98 | command = ["samtools", "view", " -h", bam_file, Range, "-o", target_bam, "-@", thread] 99 | print(' '.join(command)) 100 | subprocess.call(command) 101 | 102 | print("[BIASTOOLS] Format the mpileup...") 103 | if os.path.exists(prefix+'.'+run_id+'.mpileup'): 104 | print(prefix+'.'+run_id+'.mpileup already exist!') 105 | else: 106 | command = ["bcftools", "mpileup", "--count-orphans", "--annotate", "FORMAT/AD,FORMAT/DP", \ 107 | "-f", path_ref, \ 108 | "--min-BQ", "0", \ 109 | "--min-MQ", "0", \ 110 | "--threads", str(thread), target_bam, "-o", prefix+'.'+run_id+'.mpileup'] 111 | print(' '.join(command)) 112 | subprocess.call(command) 113 | print("[BIASTOOLS] Scanning bias...") 114 | if flag_wig: 115 | command = ["python3", path_module+"scanning_bias.py", "-g", prefix+'.'+run_id+'.mpileup', "--sample", "-b", prefix+".sample.baseline", \ 116 | "-wig", "-o", prefix+'.'+run_id+'.scanning'] 117 | else: 118 | command = ["python3", path_module+"scanning_bias.py", "-g", prefix+'.'+run_id+'.mpileup', "--sample", "-b", prefix+".sample.baseline", \ 119 | "-o", prefix+'.'+run_id+'.scanning'] 120 | print(' '.join(command)) 121 | subprocess.call(command) 122 | 123 | if flag_compare_bam: 124 | if os.path.exists(bam_file+'.bai'): 125 | pass 126 | else: 127 | command = ["samtools", "index", bam_file] 128 | subprocess.call(command) 129 | if os.path.exists(bam_file2+'.bai'): 130 | pass 131 | else: 132 | command = ["samtools", "index", bam_file2] 133 | subprocess.call(command) 134 | 135 | print("[Biastools] Generate common baseline...") 136 | baseline = prefix+"."+run_id+".combine" 137 | command = ["python3", path_module+"merge_baseline.py", "-b1", bam_file, "-b2", bam_file2, "-f", path_ref, "-o", baseline] 138 | #print(' '.join(command)) 139 | subprocess.call(command) 140 | command = ' '.join(["python3", path_module+"scanning_bias.py", "-g", mpileup_file, "-b", baseline+".baseline", "-o", baseline+".1.scanning", ">", prefix+"."+run_id+".log"]) 141 | #print(command) 142 | subprocess.call(command, shell=True) 143 | command = ' '.join(["python3", path_module+"scanning_bias.py", "-g", mpileup_file2, "-b", baseline+".baseline", "-o", baseline+".2.scanning", ">", prefix+"."+run_id+".log"]) 144 | #print(command) 145 | subprocess.call(command, shell=True) 146 | 147 | print("[Biastools] Compare two bam files with common baseline...") 148 | command = ' '.join(["bash", path_module+"biastools_compare.sh", path_output, sample_id, run_id, \ 149 | baseline+".1.scanning.bias.bed", \ 150 | 
baseline+".2.scanning.bias.bed", \ 151 | baseline+".2.scanning.lowRd.bed", \ 152 | path_module]) 153 | print(command) 154 | subprocess.call(command, shell=True) 155 | if flag_compare_rpt: 156 | print("[Biastools] Compare two bed files...") 157 | command = ' '.join(["bash", path_module+"biastools_compare.sh", path_output, sample_id, run_id, bed_file1, bed_file2, lowRd_file2, path_module]) 158 | print(command) 159 | subprocess.call(command, shell=True) 160 | 161 | 162 | 163 | 164 | if __name__ == "__main__": 165 | main() 166 | -------------------------------------------------------------------------------- /biastools/golden_graph_report.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | 6 | import math 7 | import random 8 | import numpy as np 9 | 10 | 11 | colors = ["#bce4ff", "#8bd0fe", "#59bcfc", "#0099fc", "#0086dd", "#006bb1", "#004a7a", "#002740"] 12 | colors = ["#f2dad5", "#e8bfc1", "#d9a4b2", "#c78ba6", "#aa719a", "#8b5b89", "#634271", "#3c2a4f"] 13 | 14 | def map_mapq_to_size(mapq): 15 | if mapq >= 40: 16 | return 0 17 | elif mapq >= 30: 18 | return 1 19 | elif mapq >= 20: 20 | return 2 21 | elif mapq >= 10: 22 | return 3 23 | elif mapq >= 5: 24 | return 4 25 | elif mapq >= 3: 26 | return 5 27 | elif mapq >= 1: 28 | return 6 29 | return 7 30 | 31 | labels = ['>40', '30~40', '20~30', '10~20', '5~10', '3~5', '1~3', '<1'] 32 | 33 | def map_color( 34 | var:float 35 | )-> int: 36 | """ 37 | color_code = int(var/2) 38 | if color_code > 20: 39 | color_code = 20 40 | return color_code 41 | """ 42 | if var > 0.5: 43 | return 0 44 | elif var > 0.3: 45 | return 1 46 | elif var > 0.1: 47 | return 2 48 | elif var > 0.05: 49 | return 3 50 | elif var > 0.01: 51 | return 4 52 | else: 53 | return 5 54 | 55 | p_labels = ['>0.5', '0.3~0.5', '0.1~0.3', '0.05~0.1', '0.01~0.05', '<0.01'] 56 | 57 | def map_num_to_size(num): 58 | if num == 0: 59 | return 0 60 | elif num <= 3: 61 | return 1 62 | elif num <= 5: 63 | return 2 64 | elif num <= 10: 65 | return 3 66 | elif num <= 15: 67 | return 4 68 | elif num <= 20: 69 | return 5 70 | elif num <= 30: 71 | return 6 72 | return 7 73 | 74 | n_labels = ['0', '1~3', '4~6', '7~10', '11~15', '16~20', '21~30', '>30'] 75 | 76 | 77 | def dist_origin(a, b): 78 | return math.dist((0,a) + (b,0)) 79 | 80 | 81 | 82 | def plot_golden(out_prefix, df_use): 83 | # Add columns 84 | mapQ = list(df_use['AVG_MAPQ']) 85 | pValue = list(df_use['EVEN_P_VALUE']) 86 | 87 | sp = pd.DataFrame() 88 | sp['ALLELIC BALANCE'] = list(df_use['BALANCE']) 89 | sp['MAPPING BALANCE'] = list(df_use['MAP_BALANCE']) 90 | sp['SIMULATION BALANCE'] = list(df_use['SIM_BALANCE']) 91 | sp.head() 92 | 93 | mapped_mapQ = [map_mapq_to_size(q) for q in mapQ] 94 | mapped_p = [map_color(p) for p in pValue] 95 | sp['Avg_MapQ_code'] = mapped_mapQ 96 | sp['Even_p_value'] = mapped_p 97 | sp['Assign_other'] = [map_num_to_size(n) for n in list(df_use['OTHER']) ] 98 | sp['Map_other'] = [map_num_to_size(n) for n in list(df_use['MIS_MAP']) ] 99 | sp['MapQ'] = list(mapQ) 100 | 101 | #================== color map ==================== 102 | set_mapQ_value = set(sp['Avg_MapQ_code']) 103 | color_mapQ = [] 104 | for idx in sorted(set_mapQ_value): 105 | color_mapQ.append(colors[idx]) 106 | 107 | set_misMap_value = set(sp['Map_other']) 108 | color_misMap = [] 109 | for idx in sorted(set_misMap_value): 110 | color_misMap.append(colors[idx]) 111 | 112 | #=========================== all merged 
plot ============================ 113 | print("Ploting the Merged golden distribution Plot!") 114 | sp['Normalized Assignment Balance'] = list(df_use['BALANCE']-df_use['SIM_BALANCE']) # the average map_q score 115 | sp['Normalized Mapping Balance'] = list(df_use['MAP_BALANCE']-df_use['SIM_BALANCE']) # the average map_q score 116 | #ax = sns.jointplot(x="Normalized Mapping Balance", y="Normalized Assignment Balance", hue = "Avg_MapQ_code", data = sp, \ 117 | # xlim=(-0.6,0.6), ylim=(-0.6,0.6), palette=sns.color_palette(color_mapQ)) 118 | ax = sns.jointplot(x="Normalized Mapping Balance", y="Normalized Assignment Balance", hue = "Map_other", data = sp, \ 119 | xlim=(-0.8,0.8), ylim=(-0.8,0.8), palette=sns.color_palette(color_misMap)) 120 | ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.2) 121 | ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.2) 122 | ax.ax_joint.get_legend().remove() 123 | h, l = ax.ax_joint.get_legend_handles_labels() 124 | #plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(0, 0), loc='lower right', borderaxespad=0.2) 125 | plt.legend(h, n_labels, title="Mismapped Gain#", bbox_to_anchor=(0, 0), loc='lower right', borderaxespad=0.2) 126 | #plt.savefig(out_prefix + '.mismap.pdf') 127 | 128 | #print(df_use[sp['Normalized Assignment Balance']**2 + sp['Normalized Mapping Balance']**2 > 0.01]) 129 | biased = (sp['Normalized Assignment Balance']**2 + sp['Normalized Mapping Balance']**2 > 0.01) 130 | b_loss = ((sp['Normalized Assignment Balance'] < sp['Normalized Mapping Balance']*2 + 0.1)*(sp['Normalized Assignment Balance']*2 + \ 131 | 0.1 > sp['Normalized Mapping Balance'])) + \ 132 | ((sp['Normalized Assignment Balance'] + 0.1 > sp['Normalized Mapping Balance']*2)*(sp['Normalized Assignment Balance']*2 \ 133 | < sp['Normalized Mapping Balance'] + 0.1)) 134 | b_flux = (sp['Normalized Assignment Balance'] > 0.1)*(sp['Map_other'] >= 3) + \ 135 | (sp['Normalized Assignment Balance'] < -0.1)*(sp['Map_other'] >= 3) 136 | b_artifact = (sp['Normalized Assignment Balance'] > 0.1)*(sp['Map_other'] < 3) + \ 137 | (sp['Normalized Assignment Balance'] < -0.1)*(sp['Map_other'] < 3) 138 | 139 | sp['Category'] = biased*4 140 | sp['Category'] -= (biased * b_loss)*3 141 | sp['Category'] -= (biased * ~b_loss * b_flux)*2 142 | sp['Category'] -= (biased * ~b_loss * b_artifact)*1 143 | labels = ['Balanced', 'Bias (Loss)', 'Bias (Flux)', 'Bias (Local)', 'Outliers'] 144 | 145 | custom_palette = sns.color_palette('Set2') 146 | custom_palette = custom_palette[:4] + custom_palette[-1:] 147 | ax = sns.jointplot(x="Normalized Mapping Balance", y="Normalized Assignment Balance", hue = "Category", data = sp, \ 148 | xlim=(-0.8,0.8), ylim=(-0.8,0.8), palette=custom_palette) 149 | ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.2) 150 | ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.2) 151 | ax.ax_joint.get_legend().remove() 152 | h, l = ax.ax_joint.get_legend_handles_labels() 153 | plt.legend(h, labels, title="Category#", bbox_to_anchor=(1, 0), loc='lower right', borderaxespad=0.2) 154 | plt.savefig(out_prefix + '.category.pdf') 155 | 156 | print("-------------------------------------------") 157 | print("Number of balanced:", sum(sp['Category'] == 0)) 158 | print("Number of bias_loss:", sum(sp['Category'] == 1)) 159 | print("Number of bias_flux:", sum(sp['Category'] == 2)) 160 | print("Number of bias_local:", sum(sp['Category'] == 3)) 161 | print("Number of outliers:", sum(sp['Category'] == 4)) 162 | 
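# --- Descriptive comment added for clarity; the worked numbers below are illustrative, not taken from any report ---
# A het site is flagged as biased when its squared distance from the origin in the
# (Normalized Mapping Balance, Normalized Assignment Balance) plane exceeds 0.01, i.e. the combined shift is > 0.1.
# Biased sites are then encoded as 1 = Bias (Loss) when the assignment shift tracks the mapping shift
# (the factor-of-two band tested above), 2 = Bias (Flux) when the assignment shift exceeds 0.1 together with a
# substantial mismapped-read gain (Map_other code >= 3), 3 = Bias (Local) when it exceeds 0.1 without such a gain,
# and 4 = Outliers otherwise; unbiased sites stay at 0 = Balanced.
# Example: assignment shift 0.2, mapping shift 0.05, Map_other code 1 -> 0.2**2 + 0.05**2 = 0.0425 > 0.01 (biased),
# the loss and flux tests both fail, so the site lands in Category 3, 'Bias (Local)'.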
print("-------------------------------------------") 163 | 164 | df_use.loc[(sp['Category'] == 0).values, :].to_csv(out_prefix + '.balanced.tsv', index=False, sep="\t") 165 | df_use.loc[((sp['Category'] == 1)*(sp['Normalized Assignment Balance'] > 0)).values, :].to_csv(out_prefix + '.bias-loss.1.tsv', index=False, sep="\t") 166 | df_use.loc[((sp['Category'] == 1)*(sp['Normalized Assignment Balance'] < 0)).values, :].to_csv(out_prefix + '.bias-loss.2.tsv', index=False, sep="\t") 167 | df_use.loc[((sp['Category'] == 2)*(sp['Normalized Assignment Balance'] > 0)).values, :].to_csv(out_prefix + '.bias-flux.1.tsv', index=False, sep="\t") 168 | df_use.loc[((sp['Category'] == 2)*(sp['Normalized Assignment Balance'] < 0)).values, :].to_csv(out_prefix + '.bias-flux.2.tsv', index=False, sep="\t") 169 | df_use.loc[((sp['Category'] == 3)*(sp['Normalized Assignment Balance'] > 0)).values, :].to_csv(out_prefix + '.bias-local.1.tsv', index=False, sep="\t") 170 | df_use.loc[((sp['Category'] == 3)*(sp['Normalized Assignment Balance'] < 0)).values, :].to_csv(out_prefix + '.bias-local.2.tsv', index=False, sep="\t") 171 | df_use.loc[(sp['Category'] == 4).values, :].to_csv(out_prefix + '.bias-outlier.tsv', index=False, sep="\t") 172 | df_use.loc[(sp['Map_other'] > 4).values, :].to_csv(out_prefix + '.bias-mismap_gain.tsv', index=False, sep="\t") 173 | 174 | 175 | 176 | 177 | if __name__ == "__main__": 178 | parser = argparse.ArgumentParser() 179 | parser.add_argument('-mb', '--bias_report', help='bias report, must contain the golden information') 180 | parser.add_argument('-qt', '--quality_threshold', help='threshold that filtered the sites with avg_mapQ below the threshold', type=int) 181 | parser.add_argument('-out', '--output_prefix', help='the prefix for the output plots and report') 182 | args = parser.parse_args() 183 | 184 | fn_bias = args.bias_report 185 | mapQ_th = args.quality_threshold 186 | output_prefix = args.output_prefix 187 | if output_prefix == None: 188 | output_prefix = fn_bias 189 | 190 | df_use = pd.read_csv(fn_bias, sep='\t') 191 | if mapQ_th: 192 | df_use = df_use[df_use['AVG_MAPQ'] >= mapQ_th] 193 | df_use.head() 194 | 195 | plot_golden(output_prefix, df_use) 196 | 197 | -------------------------------------------------------------------------------- /biastools/biastools.py: -------------------------------------------------------------------------------- 1 | # Wrap up python file for the biastools 1st and 2nd module 2 | import subprocess 3 | import sys 4 | import os 5 | import argparse 6 | from shutil import which 7 | 8 | def is_tool(name): 9 | """Check whether `name` is on PATH and marked as executable.""" 10 | return which(name) is not None 11 | 12 | 13 | def check_program_install(list_names): 14 | flag_violate = False 15 | for name in list_names: 16 | if is_tool(name) == False: 17 | print(name, "is a prerequisite program, please install it before running biastools") 18 | flag_violate = True 19 | if flag_violate: 20 | print("Use --force option if you want to disable the prerequisite program check.") 21 | exit(1) 22 | 23 | 24 | def bool2str(flag): 25 | if flag: 26 | return "1" 27 | else: 28 | return "0" 29 | 30 | 31 | def catch_assert(parser, message): 32 | print('\n', message, '\n') 33 | parser.print_usage() 34 | exit(1) 35 | 36 | 37 | 38 | 39 | def main(): 40 | parser = argparse.ArgumentParser(description="Simulation/Alignment/Analyzing/Prediction module of the Biastools v0.3.1") 41 | parser.add_argument('--version', action='version', version='%(prog)s 0.3.1') 42 | 
parser.add_argument('-o', '--out', help="Path to output directory ['out_dir'].", default="out_dir") 43 | parser.add_argument('-g', '--genome', help="Path to the reference genome.") 44 | parser.add_argument('-v', '--vcf', help="Path to the personal vcf file.") 45 | parser.add_argument('-s', '--sample_id', help="Sample ID ['sample'].", default="sample") 46 | parser.add_argument('-r', '--run_id', help="Run ID ['run'].", default="run") 47 | # Process options 48 | parser.add_argument('--simulate', help='[1] Option to run biastools simulation.', action='store_true') 49 | parser.add_argument('--align', help='[2] Option to run biastools align.', action='store_true') 50 | parser.add_argument('--analyze', help='[3] Option to run biastools analyze.', action='store_true') 51 | parser.add_argument('--predict', help='[4] Option to predict bias from analysis report.', action='store_true') 52 | 53 | parser.add_argument('-t', '--thread', help="Number of threads to use [max].", type=int) 54 | parser.add_argument('--force', help="running the program without checking prerequisite programs.", action='store_true') 55 | # [1] 56 | parser.add_argument('-x', '--coverage', help="Read coverage to simulate [30].", type=int, default=30) 57 | # [2] 58 | parser.add_argument('-a', '--aligner', help="Aligner to use (bowtie2|bwamem) [bowtie2]", default="bowtie2") 59 | parser.add_argument('-b', '--align_index', help="Path to the aligner index (target reference)") 60 | # [3] 61 | parser.add_argument('-i', '--bam', help="Path to the alignment bam file, should be sorted [out_dir/sample.run_id.sorted.bam].") 62 | parser.add_argument('-n', '--naive', help= "Option to run the naive assignment method [False].", action='store_true') 63 | parser.add_argument('-R', '--real', help= "Option for performing analysis on real data [False].", action='store_true') 64 | parser.add_argument('-d', '--boundary', help= "Boundary to plot the indel balance plot [20]", type=int, default=20) 65 | parser.add_argument('-lr', '--list_report', help= "List of bias report to plot the indel balance plot", nargs='+') 66 | parser.add_argument('-ld', '--list_run_id', help= "List of run ID for namings in the indel balance plot", nargs='+') 67 | # [4] 68 | parser.add_argument('-ps', '--sim_report', help= "Path to the simulation report.") 69 | parser.add_argument('-pr', '--real_report', help= "Path to the real read report [out_dir/sample.real.run.bias].") 70 | args = parser.parse_args() 71 | 72 | ##### Parameters for biastool_analysis 73 | path_output = args.out 74 | path_ref = args.genome 75 | path_vcf = args.vcf 76 | sample_id = args.sample_id 77 | run_id = args.run_id 78 | bam_file = args.bam 79 | if bam_file == None: 80 | bam_file = path_output + '/' + sample_id + '.' 
+ run_id + '.sorted.bam' 81 | 82 | flag_simulate = args.simulate 83 | flag_align = args.align 84 | flag_analyze = args.analyze 85 | flag_predict = args.predict 86 | 87 | path_module = os.path.dirname(__file__) + '/' 88 | try: 89 | assert flag_simulate + flag_align + flag_analyze + flag_predict >= 1 90 | except AssertionError: 91 | catch_assert(parser, "At least one of the --simulate/--align/--analyze/--predict options should be specified.") 92 | 93 | flag_force = args.force 94 | thread = args.thread 95 | if thread == None: 96 | if sys.platform == "darwin": 97 | result = subprocess.run(["sysctl -n hw.ncpu"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True) 98 | else: 99 | result = subprocess.run(["nproc"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True) 100 | thread = int(result.stdout.strip()) 101 | 102 | coverage = args.coverage 103 | aligner = args.aligner 104 | align_index = args.align_index 105 | try: 106 | assert aligner=="bowtie2" or aligner=="bwamem" 107 | except AssertionError: 108 | catch_assert(parser, "Only bowtie2 and bwamem are supported.") 109 | 110 | flag_naive = args.naive 111 | flag_real = args.real 112 | boundary = args.boundary 113 | list_report = args.list_report 114 | list_run_id = args.list_run_id 115 | if list_report: 116 | try: 117 | assert len(list_report) == len(list_run_id) 118 | except AssertionError: 119 | catch_assert(parser, "The numbers of --list_report and --list_run_id entries are inconsistent.") 120 | 121 | sim_report = args.sim_report 122 | real_report = args.real_report 123 | if flag_predict: 124 | try: 125 | assert real_report != None 126 | except AssertionError: 127 | catch_assert(parser, "--real_report (-pr) should be specified when using --predict") 128 | 129 | 130 | 131 | # Check that the prerequisite programs are installed 132 | if flag_force != True: 133 | list_program = ["bedtools", \ 134 | "samtools", \ 135 | "bcftools", \ 136 | "gzip", \ 137 | "tabix"] 138 | if flag_align: 139 | list_program += ["bwa", "bowtie2"] 140 | if flag_simulate: 141 | list_program.append("mason_simulator") 142 | check_program_install( list_program ) 143 | 144 | # Start running 145 | command = "mkdir -p " + path_output 146 | subprocess.call(command, shell=True) 147 | 148 | if flag_simulate: 149 | try: 150 | assert path_ref != None 151 | assert path_vcf != None 152 | except AssertionError: 153 | catch_assert(parser, "--genome (-g) and --vcf (-v) should be specified when using --simulate") 154 | print("[Biastools] Simulate...") 155 | command = ' '.join(["bash", path_module+"biastools_simulation.sh", path_ref, path_vcf, path_output, sample_id, str(thread), str(coverage), path_module]) 156 | #print(command) 157 | subprocess.call(command, shell=True) 158 | if flag_align: 159 | try: 160 | assert path_ref != None 161 | assert path_vcf != None 162 | except AssertionError: 163 | catch_assert(parser, "--genome (-g) and --vcf (-v) should be specified when using --align") 164 | if align_index == None: 165 | align_index = path_ref 166 | print("[Biastools] Align...") 167 | command = ' '.join(["bash", path_module+"biastools_align.sh", path_ref, path_vcf, path_output, sample_id, str(thread), aligner, align_index, run_id, path_module]) 168 | #print(command) 169 | subprocess.call(command, shell=True) 170 | if flag_analyze: 171 | if list_report != None: 172 | print("[Biastools] Plot the indel balance plot for multiple bias reports...") 173 | if flag_real: 174 | subprocess.call(['python3', path_module+'indel_balance_plot.py', "-lr"] + list_report + ["-ln"] + list_run_id + [ \ 175 | "-vcf", 
path_output+"/"+sample_id+".het.vcf.gz", "-bd", str(boundary), "-map", \ 176 | "-out", path_output+"/"+sample_id+"."+run_id+".real", "-real"]) 177 | else: 178 | subprocess.call(['python3', path_module+'indel_balance_plot.py', "-lr"] + list_report + ["-ln"] + list_run_id + [ \ 179 | "-vcf", path_output+"/"+sample_id+".het.vcf.gz", "-bd", str(boundary), "-map", \ 180 | "-out", path_output+"/"+sample_id+"."+run_id+".sim"]) 181 | else: 182 | try: 183 | assert path_ref != None 184 | assert path_vcf != None 185 | except AssertionError: 186 | catch_assert(parser, " and should be specified when using --analyze") 187 | print("[Biastools] Analyze and plot...") 188 | command = ' '.join(["bash", path_module+"biastools_analysis.sh", path_ref, path_vcf, path_output, sample_id, str(thread), run_id, bool2str(flag_real), \ 189 | bool2str(flag_naive), str(boundary), path_module, bam_file]) 190 | #print(command) 191 | subprocess.call(command, shell=True) 192 | if flag_predict: 193 | print("[Biastools] Predict bias...") 194 | command = ' '.join(["bash", path_module+"biastools_predict.sh", path_output, sample_id, run_id, bool2str(flag_real), real_report, sim_report, path_module]) 195 | #print(command) 196 | subprocess.call(command, shell=True) 197 | 198 | 199 | 200 | 201 | 202 | if __name__ == "__main__": 203 | main() 204 | 205 | 206 | 207 | 208 | -------------------------------------------------------------------------------- /biastools/predict_experiment.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | import numpy as np 5 | from matplotlib.colors import ListedColormap 6 | import pandas as pd 7 | import random 8 | 9 | from sklearn import datasets, metrics 10 | from sklearn.metrics import roc_curve, precision_recall_curve, auc 11 | 12 | 13 | 14 | def get_label(df_simulation): 15 | """ 16 | sort and label the simulated data, real data 17 | """ 18 | sp = pd.DataFrame() 19 | sp['Map_other'] = list(df_simulation['MIS_MAP']) 20 | sp['Normalized Allelic Balance'] = list(df_simulation['BALANCE']-df_simulation['SIM_BALANCE']) # the average map_q score 21 | sp['Normalized Mapping Balance'] = list(df_simulation['MAP_BALANCE']-df_simulation['SIM_BALANCE']) # the average map_q score 22 | 23 | biased = (sp['Normalized Allelic Balance']**2 + sp['Normalized Mapping Balance']**2 > 0.01) 24 | b_loss = ((sp['Normalized Allelic Balance'] < sp['Normalized Mapping Balance']*2 + 0.1) * \ 25 | (sp['Normalized Allelic Balance']*2 + 0.1 > sp['Normalized Mapping Balance'])) 26 | b_flux = (sp['Normalized Allelic Balance'] > 0.1)*(sp['Map_other'] > 4) 27 | b_artifact = (sp['Normalized Allelic Balance'] > 0.1)*(sp['Map_other'] <= 4) 28 | 29 | sp['Category'] = biased*4 30 | sp['Category'] -= (biased * b_loss)*3 31 | sp['Category'] -= (biased * ~b_loss * b_flux)*2 32 | sp['Category'] -= (biased * ~b_loss * b_artifact)*1 33 | 34 | sp['binary_category'] = (sp['Category'] > 0) 35 | return sp 36 | 37 | 38 | def print_accuracy(predict, label): 39 | print("Correct Num:", np.sum(predict == label)) 40 | TP = np.sum((predict == label) * (predict != 0)) 41 | FP = np.sum((predict != label) * (predict != 0)) 42 | FN = np.sum((predict != label) * (predict == 0)) 43 | print("True Positive:", TP) 44 | print("False Positive:", FP) 45 | print("False Negative:", FN) 46 | print("Precision:", TP/(TP+FP)) 47 | print("Recall:", TP/(TP+FN)) 48 | 49 | 50 | def combine_score(sim_feature, sim_label, real_feature, real_label, miss_info, 
best_threshold, out_prefix): 51 | """ 52 | quality score * balance score 53 | """ 54 | sim_feature['label'] = sim_label 55 | sim_feature['z_MAPQ'] = ((sim_feature['AVG_MAPQ'] - 45) * -1).clip(lower=0) 56 | sim_feature['combine_score'] = (sim_feature['z_MAPQ']) * (sim_feature['BALANCE']) #* (sim_feature['BALANCE']) 57 | sim_feature['plus_score'] = (sim_feature['z_MAPQ']/45) + 1.5*sim_feature['BALANCE'] 58 | sim_feature['mix_score'] = sim_feature['plus_score'] + sim_feature['combine_score'] / 20 59 | 60 | fpr_m, tpr_m, thresholds = metrics.roc_curve(sim_feature['label'], sim_feature['combine_score'], pos_label=True) 61 | fpr_p, tpr_p, thresholds = metrics.roc_curve(sim_feature['label'], sim_feature['plus_score'], pos_label=True) 62 | plt.plot(fpr_m, tpr_m, label="simulation_mul, auc="+str(round(auc(fpr_m,tpr_m),2))) 63 | plt.plot(fpr_p, tpr_p, label="simulation_add, auc="+str(round(auc(fpr_p,tpr_p),2))) 64 | 65 | real_feature['label'] = real_label 66 | real_feature['z_MAPQ'] = ((real_feature['AVG_MAPQ'] - 45) * -1).clip(lower=0) 67 | real_feature['combine_score'] = (real_feature['z_MAPQ']) * (real_feature['BALANCE']) #* (real_feature['BALANCE']) 68 | real_feature['plus_score'] = (real_feature['z_MAPQ']/45) + 1.5*real_feature['BALANCE'] 69 | r_fpr_m, r_tpr_m, thresholds = metrics.roc_curve(real_feature['label'], real_feature['combine_score'], pos_label=True) 70 | r_fpr_p, r_tpr_p, thresholds = metrics.roc_curve(real_feature['label'], real_feature['plus_score'], pos_label=True) 71 | plt.plot(r_fpr_m, r_tpr_m, label="real_mul, auc="+str(round(auc(r_fpr_m, r_tpr_m),2))) 72 | plt.plot(r_fpr_p, r_tpr_p, label="real_add, auc="+str(round(auc(r_fpr_p, r_tpr_p),2))) 73 | 74 | plt.xlabel('False Positive Rate') 75 | plt.ylabel('True Positive Rate') 76 | plt.legend() 77 | plt.savefig(out_prefix + "_ROC.pdf") 78 | plt.clf() 79 | 80 | 81 | precision, recall, thresholds = precision_recall_curve(sim_feature['label'], sim_feature['combine_score']) 82 | precision_p, recall_p, thresholds = precision_recall_curve(sim_feature['label'], sim_feature['plus_score']) 83 | r_precision, r_recall, thresholds = precision_recall_curve(real_feature['label'], real_feature['combine_score']) 84 | r_precision_p, r_recall_p, thresholds = precision_recall_curve(real_feature['label'], real_feature['plus_score']) 85 | 86 | plt.plot(recall, precision, label="simulation_mul, auc="+str(round(auc(recall, precision),2))) 87 | plt.plot(recall_p, precision_p, label="simulation_add, auc="+str(round(auc(recall_p, precision_p),2))) 88 | plt.plot(r_recall, r_precision, label="real_mul, auc="+str(round(auc(r_recall, r_precision),2))) 89 | plt.plot(r_recall_p, r_precision_p, label="real_add, auc="+str(round(auc(r_recall_p, r_precision_p),2))) 90 | 91 | plt.xlabel('Recall') 92 | plt.ylabel('Precision') 93 | plt.legend() 94 | plt.savefig(out_prefix + "_PRC.pdf") 95 | plt.clf() 96 | 97 | print("====== sim featue ========") 98 | print_accuracy(sim_feature['plus_score'] > best_threshold, sim_feature['label']) 99 | print("====== real featue ========") 100 | print_accuracy(real_feature['plus_score'] > best_threshold, real_feature['label']) 101 | print("======= overlap =========") 102 | print_accuracy(sim_feature[~miss_info]['plus_score'] > best_threshold, real_feature['plus_score'] > 1.5) 103 | print("sim label True", np.sum(sim_feature['label'])) 104 | print("sim feature", np.sum(sim_feature['plus_score'] > best_threshold)) 105 | print("real feature", np.sum(real_feature['plus_score'] > best_threshold)) 106 | FP = (sim_feature['plus_score'] > 
best_threshold)* ~(sim_feature['label']) 107 | FN = (sim_feature['plus_score'] <= best_threshold)* (sim_feature['label']) 108 | return FP, FN 109 | 110 | 111 | 112 | 113 | if __name__ == '__main__': 114 | parser = argparse.ArgumentParser() 115 | parser.add_argument('-sr', '--simulation_report', help='the simulation bias report') 116 | parser.add_argument('-rr', '--real_report', help='the real data bias report') 117 | parser.add_argument('-thr', '--threshold', help='the threshold for prediction model [1.5]', type=float, default=1.5) 118 | parser.add_argument('-out', '--out_prefix', help='the prefix for the output plots [predict]', type=str, default='predict') 119 | args = parser.parse_args() 120 | 121 | fn_simulation = args.simulation_report 122 | fn_real = args.real_report 123 | best_th = args.threshold 124 | out_prefix = args.out_prefix 125 | 126 | df_simulation = pd.read_csv(fn_simulation, sep='\t') 127 | df_real = pd.read_csv(fn_real, sep='\t') 128 | 129 | sp_label = get_label(df_simulation) 130 | 131 | # filter out sites suspected of having incomplete vcf information 132 | miss_info = (df_real['OTHER'] > df_real['NUM_READS'] * 0.9) + (df_real['OTHER'] > df_real['NUM_READS'] * 0.4) * \ 133 | ( (df_real['REF'] == 0) + (df_real['ALT'] == 0 )) 134 | no_info = df_simulation['AVG_MAPQ'].isnull() 135 | no_info += df_simulation['MAP_BALANCE'].isnull() 136 | no_info += df_simulation['BALANCE'].isnull() 137 | miss_info += no_info 138 | df_simulation = df_simulation[~no_info] 139 | sp_label = sp_label[~no_info] 140 | print("filtered number:", sum(miss_info)) 141 | 142 | df_real_test = df_real[~miss_info] 143 | sp_real_label = sp_label[~miss_info] 144 | FP, FN = combine_score(df_simulation, sp_label.iloc[:, 4].values, df_real_test, sp_real_label.iloc[:, 4].values, \ 145 | miss_info, best_th, out_prefix) 146 | 147 | # print data of FP and FN 148 | with pd.option_context('display.max_rows', None): # more options can be specified also 149 | print("False Positive:") 150 | print(df_simulation[FP]) 151 | print("==================================================") 152 | print("False Negative:") 153 | print(df_simulation[FN]) 154 | 155 | # plot the false positive and false negative sites 156 | labels = ['Balanced', 'Bias (Loss)', 'Bias (Flux)', 'Bias (Local)', 'Outliers'] 157 | idx_cat = set(sp_label[FP+FN]["Category"]) 158 | labels = [labels[idx] for idx in sorted(idx_cat)] 159 | ax = sns.jointplot(x="Normalized Mapping Balance", y="Normalized Allelic Balance", hue = "Category", data = sp_label[FP+FN], \ 160 | xlim=(-0.8,0.8), ylim=(-0.8,0.8), palette='Set2') 161 | ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.2) 162 | ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.2) 163 | ax.fig.suptitle("False Positive and False Negative") 164 | ax.fig.tight_layout() 165 | ax.ax_joint.get_legend().remove() 166 | h, l = ax.ax_joint.get_legend_handles_labels() 167 | plt.legend(h, labels, title="Category#", bbox_to_anchor=(0, 0), loc='lower left', borderaxespad=0.2) 168 | plt.savefig(out_prefix + '_FP_and_FN.pdf') 169 | plt.clf() 170 | 171 | labels = ['Balanced', 'Bias (Loss)', 'Bias (Flux)', 'Bias (Local)', 'Outliers'] 172 | idx_cat = set(sp_label[~(FP+FN)]["Category"]) 173 | labels = [labels[idx] for idx in sorted(idx_cat)] 174 | ax = sns.jointplot(x="Normalized Mapping Balance", y="Normalized Allelic Balance", hue = "Category", data = sp_label[~(FP+FN)], \ 175 | xlim=(-0.8,0.8), ylim=(-0.8,0.8), palette='Set2') 176 | ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.2) 177 | 
ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.2) 178 | ax.fig.suptitle("True Positive and True Negative") 179 | ax.fig.tight_layout() 180 | ax.ax_joint.get_legend().remove() 181 | h, l = ax.ax_joint.get_legend_handles_labels() 182 | plt.legend(h, labels, title="Category#", bbox_to_anchor=(0, 0), loc='lower left', borderaxespad=0.2) 183 | plt.savefig(out_prefix + '_TP_and_TN.pdf') 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /biastools/indel_balance_plot.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | 5 | import math 6 | import numpy as np 7 | import pysam 8 | 9 | 10 | 11 | def read_bias_report(fn_bias_report): 12 | list_bias_SNP = [] 13 | list_bias_gap = [] 14 | f = open(fn_bias_report, 'r') 15 | header = f.readline() 16 | for line in f: 17 | fields = line.split() 18 | if fields[-1] == '.': 19 | list_bias_gap.append(fields) 20 | else: 21 | list_bias_SNP.append(fields) 22 | f.close() 23 | return list_bias_SNP, list_bias_gap 24 | 25 | 26 | def calculate_SNP_balance(assign_SNP, flag_real): 27 | """ 28 | Return for simulated read: 29 | [[simulate_balance], [map_balance], [assign_balance]] 30 | Return for real read: 31 | [assign_balance] 32 | """ 33 | if flag_real: 34 | record = [[float(fields[5]) for fields in assign_SNP]] 35 | else: 36 | record = [[],[],[]] 37 | for idx in range(len(assign_SNP)): 38 | record[0].append(float(assign_SNP[idx][14])) 39 | record[1].append(float(assign_SNP[idx][10])) 40 | record[2].append(float(assign_SNP[idx][5])) 41 | return record 42 | 43 | 44 | def calculate_gap_balance(assign_gap, f_vcf, len_bd, get_idx): 45 | list_insert = [ [] for _ in range(len_bd) ] 46 | list_delete = [ [] for _ in range(len_bd) ] 47 | for idx in range(len(assign_gap)): 48 | ref_name = assign_gap[idx][0] 49 | var_start = int(assign_gap[idx][1]) 50 | var_segment = f_vcf.fetch(contig=ref_name, start=var_start-1, stop=var_start+1) # get exactly the variant at the site 51 | for var in var_segment: 52 | if var.start+1 != var_start: 53 | continue 54 | len_ref = len(var.ref) 55 | if len(var.alts) == 1: 56 | len_alt = len(var.alts[0]) 57 | else: 58 | hap = var.samples[0]['GT'] 59 | if hap[0] != 0: 60 | len_alt = len(var.alts[hap[0]-1]) 61 | else: 62 | len_alt = len(var.alts[hap[1]-1]) 63 | 64 | if len_ref > len_alt: # deletion 65 | diff = min(len_ref - len_alt -1, len_bd-1) 66 | record = float(assign_gap[idx][get_idx]) 67 | list_delete[diff].append(record) 68 | else: # 0 and insertions 69 | diff = min(len_alt - len_ref -1, len_bd-1) 70 | record = float(assign_gap[idx][get_idx]) 71 | list_insert[diff].append(record) 72 | return list_insert, list_delete 73 | 74 | 75 | def addlabels(x, y, len_bd): 76 | for i in range(len(x)): 77 | # Format numbers: use 'k' for values ≥1000, no decimal points 78 | if y[i] >= 1000: 79 | label = f'{int(y[i]/1000)}k' 80 | else: 81 | label = str(int(y[i])) 82 | plt.text(i-len_bd, y[i], label, ha='center', va='bottom', fontsize=8) # Added 30 degree rotation 83 | 84 | 85 | def plot_balance(balance_delete, balance_SNP, balance_insert, output_name, len_bd, list_incidents, list_plot_name, use_median=False): 86 | len_plot = len(list_plot_name) 87 | balance_list = [np.zeros(2*len_bd+1) for idx in range(len_plot)] 88 | balance_25th = [np.zeros(2*len_bd+1) for idx in range(len_plot)] 89 | balance_75th = [np.zeros(2*len_bd+1) for idx in range(len_plot)] 90 | 91 | # Process 
deletions 92 | for idy, list_delete in enumerate(balance_delete): 93 | for idx in range(len_bd): 94 | list_balance = np.array(list_delete[idx]) 95 | if len(list_balance) > 1: 96 | valid_balance = list_balance[~np.isnan(list_balance)] 97 | # Calculate 1 - value for all statistics 98 | flipped_balance = 1 - valid_balance 99 | balance_list[idy][len_bd-1-idx] = np.median(flipped_balance) if use_median else np.mean(flipped_balance) 100 | # Note: when we flip values, 75th becomes 25th and vice versa 101 | balance_25th[idy][len_bd-1-idx] = np.quantile(flipped_balance, 0.25) # Was 0.75 102 | balance_75th[idy][len_bd-1-idx] = np.quantile(flipped_balance, 0.75) # Was 0.25 103 | else: 104 | balance_list[idy][len_bd-1-idx] = np.nan 105 | balance_25th[idy][len_bd-1-idx] = np.nan 106 | balance_75th[idy][len_bd-1-idx] = np.nan 107 | 108 | # Process SNPs 109 | for idy, list_balance in enumerate(np.array(balance_SNP)): 110 | valid_balance = list_balance[~np.isnan(list_balance)] 111 | flipped_balance = 1 - valid_balance 112 | balance_list[idy][len_bd] = np.median(flipped_balance) if use_median else np.mean(flipped_balance) 113 | balance_25th[idy][len_bd] = np.quantile(flipped_balance, 0.25) # Was 0.75 114 | balance_75th[idy][len_bd] = np.quantile(flipped_balance, 0.75) # Was 0.25 115 | 116 | # Process insertions 117 | for idy, list_insert in enumerate(balance_insert): 118 | for idx in range(len_bd): 119 | list_balance = np.array(list_insert[idx]) 120 | if len(list_balance) > 1: 121 | valid_balance = list_balance[~np.isnan(list_balance)] 122 | flipped_balance = 1 - valid_balance 123 | balance_list[idy][len_bd+1+idx] = np.median(flipped_balance) if use_median else np.mean(flipped_balance) 124 | balance_25th[idy][len_bd+1+idx] = np.quantile(flipped_balance, 0.25) # Was 0.75 125 | balance_75th[idy][len_bd+1+idx] = np.quantile(flipped_balance, 0.75) # Was 0.25 126 | else: 127 | balance_list[idy][idx+len_bd+1] = np.nan 128 | balance_25th[idy][idx+len_bd+1] = np.nan 129 | balance_75th[idy][idx+len_bd+1] = np.nan 130 | 131 | t = list(range(-len_bd, len_bd+1)) 132 | f, (a0, a1) = plt.subplots(2, 1, gridspec_kw={ 133 | 'height_ratios': [3, 1], 134 | 'hspace': 0.1 135 | }) 136 | f.set_size_inches(20, 10) # Slightly taller to accommodate labels 137 | 138 | prop_cycle = plt.rcParams['axes.prop_cycle'] 139 | colors = prop_cycle.by_key()['color'] 140 | 141 | # Adjust the subplot parameters to give specified padding 142 | f.subplots_adjust(right=0.85, hspace=0.1) # Make room for legend on right 143 | 144 | for idx, name in enumerate(list_plot_name): 145 | # Calculate error bar lengths 146 | yerr_minus = balance_list[idx] - balance_25th[idx] 147 | yerr_plus = balance_75th[idx] - balance_list[idx] 148 | # make sure the error bar is not negative 149 | yerr_minus = np.maximum(yerr_minus, 0) 150 | yerr_plus = np.maximum(yerr_plus, 0) 151 | yerr = np.vstack((yerr_minus, yerr_plus)) 152 | 153 | # Plot with asymmetric error bars 154 | a0.errorbar(t, balance_list[idx], # Removed (1-balance_list[idx]) since we already flipped 155 | yerr=yerr, 156 | capsize=3, fmt='-o', label=name, color=colors[idx], 157 | markersize=6, elinewidth=1, capthick=1) 158 | 159 | # Move legend inside the upper panel, near the bottom 160 | a0.legend(frameon=True, fancybox=True, framealpha=0.9, 161 | loc='lower center', # Place at bottom center 162 | bbox_to_anchor=(0.5, 0.05), # Position slightly above bottom 163 | ncol=2) # Two columns for better space usage 164 | 165 | a0.axhline(y=0.5, color='gray', linestyle='dashdot', linewidth=0.9) 166 | 
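# Descriptive comment (added for clarity): every balance value is flipped to 1 - balance in the processing loops
# above, so the curves show the fraction of the alternate allele (matching the y-label set just below) and an
# unbiased site sits on the 0.5 reference line drawn just above. The asymmetric error bars span the 25th to 75th
# percentile of the flipped values within each indel-length bin, with the marker at the mean, or at the median
# when --use_median is given.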
a0.set(ylabel='Fraction of alternate allele') 167 | a0.grid(True, linestyle='--', alpha=0.3) 168 | 169 | a1.set(xlabel='Insertion (+) or deletion (-) length') 170 | a1.set(ylabel='# of variants') 171 | 172 | # Increase bar width 173 | width = 0.65 # Changed from 0.5 to 0.8 for thicker bars 174 | bars = a1.bar(t, list_incidents, align='center', width=width, log=True, linewidth=1) 175 | a1.set_ylim([1, max(list_incidents)*5]) 176 | 177 | # Create x-ticks only for multiples of 5 and boundaries 178 | xticks = [] 179 | xticklabels = [] 180 | for x in range(-len_bd, len_bd + 1): 181 | if x == -len_bd or x == len_bd or x % 5 == 0: 182 | xticks.append(x) 183 | if x == -len_bd: 184 | xticklabels.append(f"≤-{len_bd}") 185 | elif x == len_bd: 186 | xticklabels.append(f"≥{len_bd}") 187 | else: 188 | xticklabels.append(str(x)) 189 | 190 | a1.set_xticks(xticks) 191 | a1.set_xticklabels(xticklabels) # Remove rotation 192 | 193 | # Use the same x-ticks for the upper plot 194 | a0.set_xticks(xticks) 195 | a0.set_xticklabels(xticklabels) # Remove rotation 196 | 197 | addlabels(t, list_incidents, len_bd) 198 | a1.grid(axis='y', linestyle='--', alpha=0.3) 199 | 200 | a0.set_xlim(a1.get_xlim()) 201 | 202 | # Adjust subplot spacing 203 | f.subplots_adjust(hspace=0.1) # Keep minimal space between plots 204 | 205 | plt.savefig(output_name + '.indel_balance.pdf', bbox_inches='tight', dpi=300) 206 | 207 | 208 | if __name__ == "__main__": 209 | parser = argparse.ArgumentParser() 210 | parser.add_argument('-lr', '--list_report', nargs='+', required=True, help='the list of assignment bias report') 211 | parser.add_argument('-ln', '--list_name', nargs='+', required=True, help='the second bias report') 212 | parser.add_argument('-vcf', '--vcf_report', help='the vcf report for the bias report regions') 213 | parser.add_argument('-bd', '--boundary', type=int, default=40, help='the boundary indel lengths extend from 0') 214 | parser.add_argument('-map', '--flag_mapping', action='store_true', help='show the mapping rather than local result') 215 | parser.add_argument('-real', '--flag_real', action='store_true', help='specify if the report contains no simulation information') 216 | parser.add_argument('-out', '--output_name', help="output file name") 217 | parser.add_argument('-median', '--use_median', action='store_true', 218 | help='Use median instead of mean for central tendency') 219 | args = parser.parse_args() 220 | 221 | list_report = args.list_report 222 | list_name = args.list_name 223 | fn_vcf = args.vcf_report 224 | boundary = args.boundary 225 | flag_map = args.flag_mapping 226 | flag_real = args.flag_real 227 | output_name = args.output_name 228 | if output_name == None: 229 | output_name = list_name[0] 230 | 231 | assert len(list_report) == len(list_name), "Number of bias_report and bias names are different." 
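# Example invocation (descriptive comment; the report and VCF names are placeholders):
#   python3 indel_balance_plot.py -lr runA.bias runB.bias -ln bowtie2 bwamem \
#       -vcf sample.het.vcf.gz -bd 20 -map -out sample.run1.sim
# With simulated reports the first curve is the simulated truth, and -map switches the remaining curves from the
# context-aware assignment balance to the mapping balance; add -real for reports generated from real reads,
# which carry no simulation columns.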
232 | 233 | f_vcf = pysam.VariantFile(fn_vcf) 234 | # read the bias report 235 | list_bias_report = [] 236 | for fn_assign_report in list_report: 237 | assign_report = read_bias_report(fn_assign_report) 238 | list_bias_report.append(assign_report) 239 | 240 | # fetch the SNP balance information 241 | list_balance_SNP = [] 242 | for assign_SNP, assign_gap in list_bias_report: 243 | balance_SNP = calculate_SNP_balance(assign_SNP, flag_real) 244 | list_balance_SNP.append(balance_SNP) 245 | 246 | if flag_real: # no simulation of mapping information provided 247 | list_plot_name = list_name #[name + '(real)' for name in list_name] 248 | 249 | # fetch the gap balance information 250 | list_balance_delete = [] 251 | list_balance_insert = [] 252 | for assign_SNP, assign_gap in list_bias_report: 253 | balance_insert, balance_delete = calculate_gap_balance(assign_gap, f_vcf, boundary, 5) 254 | list_balance_insert.append(balance_insert) 255 | list_balance_delete.append(balance_delete) 256 | 257 | balance_SNP = list_balance_SNP 258 | balance_delete = list_balance_delete 259 | balance_insert = list_balance_insert 260 | else: # to plot the simulated reads, the first entry is the simulated balance information, then we can choose map or local_assignment 261 | flag_choice = 2 262 | gap_choice = 5 263 | list_plot_name = ["simulated"] 264 | if flag_map: 265 | flag_choice = 1 266 | gap_choice = 10 267 | list_plot_name += [name + '(map)' for name in list_name] 268 | else: 269 | list_plot_name += [name + '(assign)' for name in list_name] 270 | 271 | # fetch the gap balance information 272 | balance_insert, balance_delete = calculate_gap_balance(list_bias_report[0][1], f_vcf, boundary, 14) # getting the simulated information 273 | list_balance_delete = [balance_delete] 274 | list_balance_insert = [balance_insert] 275 | for assign_SNP, assign_gap in list_bias_report: 276 | balance_insert, balance_delete = calculate_gap_balance(assign_gap, f_vcf, boundary, gap_choice) 277 | list_balance_insert.append(balance_insert) 278 | list_balance_delete.append(balance_delete) 279 | 280 | balance_SNP = [list_balance_SNP[0][0]] + [balance[flag_choice] for balance in list_balance_SNP] 281 | balance_delete = list_balance_delete 282 | balance_insert = list_balance_insert 283 | 284 | 285 | # get the incident numbers of the indels 286 | list_incidents = [len(balance) for balance in list_balance_delete[0]][::-1] + [len(list_balance_SNP[0][0])] + [len(balance) for balance in list_balance_insert[0]] 287 | 288 | plot_balance(balance_delete, balance_SNP, balance_insert, output_name, boundary, list_incidents, list_plot_name, args.use_median) 289 | 290 | -------------------------------------------------------------------------------- /biastools/golden_graph.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | 6 | import math 7 | import random 8 | import numpy as np 9 | 10 | 11 | colors = ["#bce4ff", "#8bd0fe", "#59bcfc", "#0099fc", "#0086dd", "#006bb1", "#004a7a", "#002740"] 12 | colors = ["#f2dad5", "#e8bfc1", "#d9a4b2", "#c78ba6", "#aa719a", "#8b5b89", "#634271", "#3c2a4f"] 13 | 14 | def map_mapq_to_size(mapq): 15 | if mapq >= 40: 16 | return 0 17 | elif mapq >= 30: 18 | return 1 19 | elif mapq >= 20: 20 | return 2 21 | elif mapq >= 10: 22 | return 3 23 | elif mapq >= 5: 24 | return 4 25 | elif mapq >= 3: 26 | return 5 27 | elif mapq >= 1: 28 | return 6 29 | return 7 30 | 31 | labels = ['>40', 
'30~40', '20~30', '10~20', '5~10', '3~5', '1~3', '<1'] 32 | 33 | def map_color( 34 | var:float 35 | )-> int: 36 | """ 37 | color_code = int(var/2) 38 | if color_code > 20: 39 | color_code = 20 40 | return color_code 41 | """ 42 | if var > 0.5: 43 | return 0 44 | elif var > 0.3: 45 | return 1 46 | elif var > 0.1: 47 | return 2 48 | elif var > 0.05: 49 | return 3 50 | elif var > 0.01: 51 | return 4 52 | else: 53 | return 5 54 | 55 | p_labels = ['>0.5', '0.3~0.5', '0.1~0.3', '0.05~0.1', '0.01~0.05', '<0.01'] 56 | 57 | def map_num_to_size(num): 58 | if num == 0: 59 | return 0 60 | elif num <= 3: 61 | return 1 62 | elif num <= 5: 63 | return 2 64 | elif num <= 10: 65 | return 3 66 | elif num <= 15: 67 | return 4 68 | elif num <= 20: 69 | return 5 70 | elif num <= 30: 71 | return 6 72 | return 7 73 | 74 | n_labels = ['0', '1~3', '4~6', '7~10', '11~15', '16~20', '21~30', '>30'] 75 | 76 | def map_waste_to_color(value): 77 | return int(math.ceil(value*8)) 78 | 79 | 80 | def plot_golden(out_prefix, df_use): 81 | 82 | # Add columns 83 | df_use['WASTE_INFO'] = (df_use['OTHER'])/(df_use['NUM_READS']+0.01) 84 | mapQ = list(df_use['AVG_MAPQ']) 85 | pValue = list(df_use['EVEN_P_VALUE']) 86 | 87 | sp = pd.DataFrame() 88 | sp['ASSIGNMENT BALANCE'] = list(df_use['BALANCE']) 89 | sp['MAPPING BALANCE'] = list(df_use['MAP_BALANCE']) 90 | sp['SIMULATION BALANCE'] = list(df_use['SIM_BALANCE']) 91 | sp.head() 92 | 93 | mapped_mapQ = [map_mapq_to_size(q) for q in mapQ] 94 | mapped_p = [map_color(p) for p in pValue] 95 | waste_value = [map_waste_to_color(q) for q in list(df_use['WASTE_INFO'])] 96 | sp['Avg_MapQ_code'] = mapped_mapQ 97 | sp['Even_p_value'] = mapped_p 98 | sp['Waste_value'] = waste_value 99 | sp['Assign_other'] = [map_num_to_size(n) for n in list(df_use['OTHER']) ] 100 | sp['Map_other'] = [map_num_to_size(n) for n in list(df_use['MIS_MAP']) ] 101 | sp['MapQ'] = list(mapQ) 102 | 103 | #================== color map ==================== 104 | set_mapQ_value = set(sp['Avg_MapQ_code']) 105 | color_mapQ = [] 106 | for idx in sorted(set_mapQ_value): 107 | color_mapQ.append(colors[idx]) 108 | 109 | set_misMap_value = set(sp['Map_other']) 110 | color_misMap = [] 111 | for idx in sorted(set_misMap_value): 112 | color_misMap.append(colors[idx]) 113 | 114 | #=========================== standard ref_bias to read_distribute plot ============================ 115 | print("Ploting the Standard Ref Bias Plot!") 116 | plt.clf() 117 | ax = sns.scatterplot(y="ASSIGNMENT BALANCE", x="MAPPING BALANCE", hue = "Avg_MapQ_code", data = sp, palette=sns.color_palette(color_mapQ)) 118 | #ax = sns.scatterplot(y="ASSIGNMENT BALANCE", x="MAPPING BALANCE", hue = "Even_p_value", data = sp)#hue="size", size="size", data=tips) 119 | #ax = sns.scatterplot(y="ASSIGNMENT BALANCE", x="MAPPING BALANCE", hue = "Waste_value", data = sp)#hue="size", size="size", data=tips) 120 | h, l = ax.get_legend_handles_labels() 121 | plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(0.92, 1), loc=2, borderaxespad=0., framealpha=1) 122 | plt.xlim([0,1]) 123 | plt.ylim([0,1]) 124 | 125 | FN_FIG = out_prefix + '.diff-assign2map_dot.pdf' 126 | plt.savefig(FN_FIG) 127 | 128 | #=========================== golden to read_distribute plot ============================ 129 | print("Ploting the Golden distribution Plot!") 130 | plt.clf() 131 | ax = sns.scatterplot(x="SIMULATION BALANCE", y="MAPPING BALANCE", hue = "Avg_MapQ_code", data = sp, palette=sns.color_palette(color_mapQ))#hue="size", size="size", data=tips) 132 | h, l = ax.get_legend_handles_labels() 133 
| plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(0.92, 1), loc=2, borderaxespad=0., framealpha=1) 134 | plt.xlim([0,1]) 135 | plt.ylim([0,1]) 136 | 137 | FN_FIG = out_prefix + '.diff-sim2map_dot.pdf' 138 | plt.savefig(FN_FIG) 139 | 140 | #=========================== golden to ref_bias plot ============================ 141 | plt.clf() 142 | ax = sns.scatterplot(x="SIMULATION BALANCE", y="ASSIGNMENT BALANCE", hue = "Avg_MapQ_code", data = sp, palette=sns.color_palette(color_mapQ))#hue="size", size="size", data=tips) 143 | h, l = ax.get_legend_handles_labels() 144 | plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(0.92, 1), loc=2, borderaxespad=0., framealpha=1) 145 | plt.xlim([0,1]) 146 | plt.ylim([0,1]) 147 | 148 | FN_FIG = out_prefix + '.diff-sim2assign_dot.pdf' 149 | plt.savefig(FN_FIG) 150 | 151 | #=========================== all merged plot ============================ 152 | print("Ploting the Merged golden distribution Plot!") 153 | plt.clf() 154 | sp['Normalized Assignment Balance'] = list(df_use['BALANCE']-df_use['SIM_BALANCE']) # the average map_q score 155 | sp['Normalized Mapping Balance'] = list(df_use['MAP_BALANCE']-df_use['SIM_BALANCE']) # the average map_q score 156 | ax = sns.jointplot(x="Normalized Mapping Balance", y="Normalized Assignment Balance", hue = "Avg_MapQ_code", data = sp, \ 157 | xlim=(-0.8,0.8), ylim=(-0.8,0.8), palette=sns.color_palette(color_mapQ)) 158 | #ax = sns.jointplot(x="Normalized Mapping Balance", y="Normalized Assignment Balance", hue = "Map_other", data = sp, \ 159 | # xlim=(-0.8,0.8), ylim=(-0.8,0.8), palette=sns.color_palette(color_misMap)) 160 | ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.2) 161 | ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.2) 162 | ax.ax_joint.get_legend().remove() 163 | h, l = ax.ax_joint.get_legend_handles_labels() 164 | plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(0, 0), loc='lower left', borderaxespad=0.2) 165 | #plt.legend(h, n_labels, title="Mismapped Gain#", bbox_to_anchor=(1,0), loc='lower right', borderaxespad=0.2) 166 | 167 | FN_FIG = out_prefix + '.category.MapQ.pdf' 168 | plt.savefig(FN_FIG) 169 | 170 | #======================= allelic difference plot ========================= 171 | plt.clf() 172 | list_ref_diff = list(df_use['REF']-df_use['SIM_REF']) 173 | list_alt_diff = list(df_use['ALT']-df_use['SIM_ALT']) 174 | for idx in range(len(list_ref_diff)): 175 | list_ref_diff[idx] += random.uniform(-0.3, 0.3) # scatter plot 176 | list_alt_diff[idx] += random.uniform(-0.3, 0.3) 177 | sp['Ref# - Simulation Ref#'] = list_ref_diff 178 | sp['Alt# - Simulation Alt#'] = list_alt_diff 179 | 180 | #ax = sns.jointplot(x="Ref# - Simulation Ref#", y="Alt# - Simulation Alt#", hue = "Even_p_value", data = sp, xlim=(-20,20), ylim=(-20,15)) 181 | ax = sns.jointplot(x="Ref# - Simulation Ref#", y="Alt# - Simulation Alt#", hue = "Avg_MapQ_code", data = sp, xlim=(-30,30), ylim=(-30,15), palette=sns.color_palette(color_mapQ)) 182 | ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.1) 183 | ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.1) 184 | ax.ax_joint.get_legend().remove() 185 | h, l = ax.ax_joint.get_legend_handles_labels() 186 | #plt.legend(h, p_labels, title="Even P Value", bbox_to_anchor=(0,1), loc='upper right') 187 | plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(1,1), loc='upper right') 188 | 189 | FN_FIG = out_prefix + '.diff2-assign2sim.pdf' 190 | plt.savefig(FN_FIG) 191 | 192 | 193 | 
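# Descriptive comment (added for clarity): the random.uniform(-0.3, 0.3) jitter applied above is for display only,
# so that the many sites sharing the same integer read-count difference do not collapse onto a single point in the
# joint plot; the underlying REF/ALT counts in df_use stay untouched.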
plt.clf() 194 | #ax = sns.jointplot(x="Ref# - Simulation Ref#", y="Alt# - Simulation Alt#", hue = "Map_other", data = sp, xlim=(-30,30), ylim=(-30,15), palette=sns.color_palette(color_misMap)) 195 | #ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.1) 196 | #ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.1) 197 | #ax.ax_joint.get_legend().remove() 198 | #h, l = ax.ax_joint.get_legend_handles_labels() 199 | #plt.legend(h, n_labels, title="Mismapped Gain#", bbox_to_anchor=(1,1), loc='upper right') 200 | # 201 | #FN_FIG = out_prefix + '-read_diff_allelic.mismap.pdf' 202 | #plt.savefig(FN_FIG) 203 | #====================== mapping difference plot ========================= 204 | plt.clf() 205 | list_m_ref_diff = list(df_use['MAP_REF']-df_use['SIM_REF']) 206 | list_m_alt_diff = list(df_use['MAP_ALT']-df_use['SIM_ALT']) 207 | for idx in range(len(list_m_ref_diff)): 208 | list_m_ref_diff[idx] += random.uniform(-0.3, 0.3) # scatter plot 209 | list_m_alt_diff[idx] += random.uniform(-0.3, 0.3) 210 | sp['Mapping Ref# - Simulation Ref#'] = list_m_ref_diff 211 | sp['Mapping Alt# - Simulation Alt#'] = list_m_alt_diff 212 | 213 | #ax = sns.jointplot(x="Mapping Ref# - Simulation Ref#", y="Mapping Alt# - Simulation Alt#", hue = "Even_p_value", data = sp, xlim=(-20,20), ylim=(-20,15)) 214 | ax = sns.jointplot(x="Mapping Ref# - Simulation Ref#", y="Mapping Alt# - Simulation Alt#", hue = "Avg_MapQ_code", data = sp, xlim=(-30,30), ylim=(-30,15), palette=sns.color_palette(color_mapQ)) 215 | ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.1) 216 | ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.1) 217 | ax.ax_joint.get_legend().remove() 218 | h, l = ax.ax_joint.get_legend_handles_labels() 219 | #plt.legend(h, p_labels, title="Even P Value", bbox_to_anchor=(0,1), loc='upper right') 220 | plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(1,1), loc='upper right') 221 | 222 | FN_FIG = out_prefix + '.diff2-map2sim.pdf' 223 | plt.savefig(FN_FIG) 224 | 225 | 226 | plt.clf() 227 | #ax = sns.jointplot(x="Mapping Ref# - Simulation Ref#", y="Mapping Alt# - Simulation Alt#", hue = "Map_other", data = sp, xlim=(-30,30), ylim=(-30,15), palette=sns.color_palette(color_misMap)) 228 | #ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.1) 229 | #ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.1) 230 | #ax.ax_joint.get_legend().remove() 231 | #h, l = ax.ax_joint.get_legend_handles_labels() 232 | #plt.legend(h, n_labels, title="Mismapped Gain#", bbox_to_anchor=(1,1), loc='upper right') 233 | # 234 | #FN_FIG = out_prefix + '-read_diff_mapping.mismap.pdf' 235 | #plt.savefig(FN_FIG) 236 | #======================== read loss-gain plot =========================== 237 | plt.clf() 238 | array_m_ref_diff = -np.array(df_use['MAP_REF']-df_use['SIM_REF']) 239 | array_m_alt_diff = -np.array(df_use['MAP_ALT']-df_use['SIM_ALT']) 240 | list_read_loss = list(np.where(array_m_ref_diff < 0, 0, array_m_ref_diff) + np.where(array_m_alt_diff < 0, 0, array_m_alt_diff)) 241 | list_read_gain = list(df_use["MIS_MAP"]) 242 | for idx in range(len(list_m_ref_diff)): 243 | list_read_loss[idx] += random.uniform(0,0.5) # scatter plot 244 | list_read_gain[idx] += random.uniform(0,0.5) 245 | sp["Loss of Read (Ref + Alt)"] = list_read_loss 246 | sp["Gain of Read"] = list_read_gain 247 | 248 | #ax = sns.jointplot(x="Loss of Read (Ref + Alt)", y="Gain of Read", hue = "Avg_MapQ_code", data = sp, xlim=(0,30), ylim=(0,30), 
palette=sns.color_palette(color_mapQ)) 249 | #ax.ax_joint.axhline(y=0, color='gray', linestyle='dashdot', linewidth=0.1) 250 | #ax.ax_joint.axvline(x=0, color='gray', linestyle='dashdot', linewidth=0.1) 251 | #ax.ax_joint.get_legend().remove() 252 | #h, l = ax.ax_joint.get_legend_handles_labels() 253 | #plt.legend(h, labels, title="Avg MapQ", bbox_to_anchor=(1,1), loc='upper right') 254 | # 255 | #FN_FIG = out_prefix + '-loss_gain.pdf' 256 | #plt.savefig(FN_FIG) 257 | 258 | plt.close("all") 259 | sns.color_palette() 260 | ref_loss = list(df_use['SIM_REF']-df_use['MAP_REF']) 261 | alt_loss = list(df_use['SIM_ALT']-df_use['MAP_ALT']) 262 | read_gain = list(df_use["MIS_MAP"]) 263 | hist_data = pd.DataFrame() 264 | hist_data['loss/gain in a variant'] = read_gain + ref_loss + alt_loss 265 | hist_data['category'] = ["MisMap gain"]*len(read_gain) + ["Ref loss"]*len(ref_loss) + ["Alt loss"]*len(alt_loss) 266 | 267 | bin_num = max(hist_data['loss/gain in a variant']) - min(hist_data['loss/gain in a variant']) 268 | plt.clf() 269 | ax = sns.displot(hist_data, x="loss/gain in a variant", bins=bin_num, hue="category", log_scale=(False,True), element="step") 270 | ax.set(ylabel="occurence") 271 | FN_FIG = out_prefix + '.loss_gain_occurence.pdf' 272 | plt.savefig(FN_FIG) 273 | 274 | #plt.clf() 275 | #ax = sns.displot(hist_data, x="loss/gain in a variant", bins=int(bin_num/3), hue="category", log_scale=(False,True), multiple="dodge") 276 | #ax.set(ylabel="occurence") 277 | #FN_FIG = out_prefix + '-loss_gain_occurence.dodge.pdf' 278 | #plt.savefig(FN_FIG) 279 | 280 | 281 | 282 | 283 | if __name__ == "__main__": 284 | parser = argparse.ArgumentParser() 285 | parser.add_argument('-mb', '--bias_report', help='bias report, must contain the golden information') 286 | parser.add_argument('-qt', '--quality_threshold', help='threshold that filtered the sites with avg_mapQ below the threshold', type=int, default=0) 287 | parser.add_argument('-out', '--output_prefix', help='the prefix for the output plots and report') 288 | args = parser.parse_args() 289 | 290 | fn_bias = args.bias_report 291 | mapQ_th = args.quality_threshold 292 | output_prefix = args.output_prefix 293 | if output_prefix == None: 294 | output_prefix = fn_bias 295 | 296 | df_use = pd.read_csv(fn_bias, sep='\t') 297 | df_use = df_use[df_use['AVG_MAPQ'] >= mapQ_th] 298 | df_use.head() 299 | 300 | plot_golden(output_prefix, df_use) 301 | 302 | -------------------------------------------------------------------------------- /biastools/scanning_bias.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import gzip 3 | 4 | import numpy as np 5 | import os 6 | import argparse 7 | import pickle 8 | 9 | 10 | def output_wig( 11 | output_name :str, 12 | data_name :str, 13 | list_data :list 14 | ) -> None: 15 | """ 16 | output single wig file 17 | """ 18 | f_o = gzip.open(output_name, 'wt') 19 | for array_info in list_data: 20 | ref_name, wig_start, array_wig = array_info 21 | wig_end = wig_start + len(array_wig) 22 | 23 | f_o.write("browser position " + ref_name + ":" + str(wig_start) + "-" + str(wig_end) + '\n') 24 | f_o.write("browser hide all\n") 25 | f_o.write("track type=wiggle_0 name=\"" + data_name + "\" description=\"variableStep format\" visibility=hide autoScale=on" + \ 26 | "color=50,150,255 graphType=points priority=10\n") 27 | f_o.write("variableStep chrom=" + ref_name + '\n') 28 | for idx, depth in enumerate(array_wig): 29 | f_o.write(str(wig_start+idx) + ' ' + str(round(depth, 2)) + '\n') 30 
| f_o.close() 31 | 32 | 33 | 34 | def report_wig( 35 | fn_output :str, 36 | dict_3D_measures :dict, 37 | ) -> None: 38 | """ 39 | output the wig format for read_depth, var_density, and dip_density 40 | this whole process take times 41 | """ 42 | # list_info is composed of array_RD, array_VD, array_ND, array_score 43 | # (ref_name, region_begin, array_info) 44 | list_info = [[],[],[],[]] 45 | #ref_name, region_begin, array_read_depth, array_var_density, array_dip_density, array_score, array_score_sum = wig_info 46 | for ref_name, dict_array in dict_3D_measures.items(): 47 | for start_pos, array_info in dict_array.items(): 48 | array_RD, array_VD, array_ND, array_score = array_info 49 | list_info[0].append((ref_name, start_pos, array_RD)) 50 | list_info[1].append((ref_name, start_pos, array_VD)) 51 | list_info[2].append((ref_name, start_pos, array_ND)) 52 | list_info[3].append((ref_name, start_pos, array_score)) 53 | 54 | output_wig( 55 | output_name = (fn_output + '.read_depth.wig.gz'), 56 | data_name = 'avg_read_depth', 57 | list_data = list_info[0] 58 | ) 59 | output_wig( 60 | output_name = (fn_output + '.var_density.wig.gz'), 61 | data_name = 'var_density', 62 | list_data = list_info[1] 63 | ) 64 | output_wig( 65 | output_name = (fn_output + '.dip_density.wig.gz'), 66 | data_name = 'non_diploid_density', 67 | list_data = list_info[2] 68 | ) 69 | output_wig( 70 | output_name = (fn_output + '.score_sum.wig.gz'), 71 | data_name = '3D_scoring_sum', 72 | list_data = list_info[3] 73 | ) 74 | 75 | 76 | def scanning_bias( 77 | f_gvcf :pysam.VariantRecord 78 | ) -> dict: 79 | """ 80 | Scanning the fn_gvcf to find the region with 81 | - high read depth, 82 | - high density of variants, or 83 | - non diploid evidence. 84 | return the raw numbers 85 | """ 86 | # Extract the read_depth and variant informations 87 | ref_name = None # record the reference name 88 | last_pos = -2 # record the last mpileup position 89 | start_pos = None # record the starting position of each region 90 | dict_ref_info = {} 91 | for var in f_gvcf: 92 | if ref_name != var.contig: # new chromosome 93 | ref_name = var.contig 94 | dict_ref_info[ref_name] = {} 95 | 96 | start_pos = var.start 97 | dict_ref_info[ref_name][start_pos] = {'depth':[], 'var':[]} 98 | elif var.start > last_pos + 1: # the same chromsome, new position 99 | start_pos = var.start 100 | dict_ref_info[ref_name][start_pos] = {'depth':[], 'var':[]} 101 | elif var.start == last_pos: # duplicate position, pop the last read depth info 102 | dict_ref_info[ref_name][start_pos]['depth'].pop() 103 | last_pos = var.start 104 | 105 | ref_name = var.contig 106 | total_depth = var.samples[0]['DP'] 107 | 108 | # store the read depth 109 | dict_ref_info[ref_name][start_pos]['depth'].append((var.start, total_depth)) 110 | 111 | alt_depth = None 112 | if var.samples[0].get('AD'): 113 | alt_depth = list(var.samples[0]['AD']) 114 | else: 115 | alt_depth = [0, total_depth] 116 | 117 | # calculate diploid score 118 | list_alleles = list(var.alleles) 119 | if sum(alt_depth) != total_depth: # often happens at indels 120 | alt_depth.append(total_depth - sum(alt_depth)) 121 | list_alleles.append('Others') 122 | list_alt_depth = sorted(alt_depth, reverse=True) 123 | #max_alt_depth = list_alt_depth[0] 124 | num_var = 0 125 | for idx, depth in enumerate(list_alt_depth): 126 | if depth > total_depth*15/100: # consider as variant, exclude the 0,0 case 127 | num_var = idx + 1 128 | else: 129 | break 130 | if num_var > 1: 131 | nonDip_flag = False 132 | if num_var > 2 or 
list_alt_depth[1]*2 < list_alt_depth[0]: 133 | nonDip_flag = True 134 | dict_ref_info[ref_name][start_pos]['var'].append([var.start, total_depth, list_alt_depth[:num_var], nonDip_flag, \ 135 | alt_depth, list_alleles]) 136 | # -> for debug purpose 137 | return dict_ref_info 138 | 139 | 140 | def boundary_compensate( 141 | target_array :np.array, 142 | window_size :int 143 | ) -> np.array: 144 | """ 145 | compensate for padding zeros 146 | """ 147 | if len(target_array) < window_size: 148 | return target_array 149 | 150 | half_window = int(window_size/2) 151 | # compensate left side 152 | for idx in range(half_window): 153 | target_array[idx] *= (window_size / (half_window+idx)) 154 | # compensate right side 155 | for idx in range(-1, -half_window-1, -1): 156 | target_array[idx] *= (window_size / (half_window-idx-1)) 157 | return target_array 158 | 159 | 160 | def calculate_measures( 161 | dict_ref_info :dict, 162 | window_size :int=400 163 | ) -> dict: 164 | """ 165 | Take the raw data and calculate 166 | - the moving average of read_depth 167 | - over window number of variants 168 | - over window number of non_diploid site 169 | """ 170 | # Two parameters we have: 171 | # list_depth 172 | # list_var_sites 173 | 174 | # Analyze the density of the variants 175 | dict_3D_measures = {} 176 | for ref_name, dict_start_pos in dict_ref_info.items(): 177 | dict_3D_measures[ref_name] = {} 178 | for start_pos, dict_var_info in dict_start_pos.items(): 179 | list_depth = dict_var_info['depth'] 180 | list_var_sites = dict_var_info['var'] 181 | 182 | half_window = round(window_size/2) 183 | # Counting average readepth over the window (moving average) 184 | region_begin = list_depth[0][0] 185 | region_end = list_depth[-1][0] + 1 186 | assert(start_pos == region_begin) 187 | array_read_depth = np.zeros(region_end - region_begin + window_size) 188 | for site_info in list_depth: 189 | index = site_info[0] - region_begin 190 | depth = site_info[1] 191 | array_read_depth[index:index+window_size] += depth 192 | array_read_depth /= window_size 193 | array_read_depth = array_read_depth[half_window:-half_window] 194 | array_read_depth = boundary_compensate(array_read_depth, window_size) 195 | 196 | # Calculate variant density over the window 197 | array_var_density = np.zeros(region_end - region_begin + window_size) 198 | array_dip_density = np.zeros(region_end - region_begin + window_size) 199 | for site_info in list_var_sites: 200 | index = site_info[0] - region_begin 201 | nonDip_flag = site_info[3] 202 | 203 | array_var_density[index:index+window_size] += 1 204 | if nonDip_flag: 205 | array_dip_density[index:index+window_size] += 1 206 | array_var_density = array_var_density[half_window:-half_window] 207 | array_dip_density = array_dip_density[half_window:-half_window] 208 | #array_var_density = boundary_compensate(array_var_density, window_size) 209 | #array_dip_density = boundary_compensate(array_dip_density, window_size) 210 | 211 | dict_3D_measures[ref_name][region_begin] = [array_read_depth, array_var_density, array_dip_density] 212 | return dict_3D_measures 213 | 214 | 215 | def link_bias_region_and_report( 216 | array_score :np.array, 217 | region_begin :int, 218 | ref_name :str, 219 | f_ob , 220 | f_os , 221 | threshold_1 :int=3, 222 | threshold_2 :int=5, 223 | link_dist :int=1000 224 | ) -> tuple: 225 | """ 226 | Find and link the bias region according to thresholds 227 | report files: 228 | - bed file: bias region 229 | - bed file: suspicious region 230 | - csv file: detailed report of bias and 
suspicious region 231 | """ 232 | list_region = [] 233 | pos_start = -1 234 | pos_stop = -link_dist -1 235 | for idx, score in enumerate(array_score): 236 | if score > threshold_1: 237 | if idx > pos_stop + link_dist: 238 | list_region.append((pos_start, pos_stop+1)) 239 | #print(idx, pos_start, pos_stop+1) 240 | pos_start = idx 241 | pos_stop = idx 242 | else: 243 | pos_stop = idx 244 | if len(list_region) == 0 or list_region[-1] != (pos_start, pos_stop+1): 245 | list_region.append((pos_start, pos_stop+1)) 246 | list_region = list_region[1:] # first region is decoy 247 | 248 | # report bias region and suspicious region 249 | list_bias = [] 250 | list_suspicious = [] 251 | for pos_start, pos_stop in list_region: 252 | max_score = max(array_score[pos_start:pos_stop]) 253 | avg_score = np.mean(array_score[pos_start:pos_stop]) 254 | if max_score > threshold_2: 255 | list_bias.append((pos_start + region_begin, pos_stop + region_begin, max_score, avg_score)) 256 | else: 257 | list_suspicious.append((pos_start + region_begin, pos_stop + region_begin, max_score, avg_score)) 258 | if f_ob: 259 | for segment in list_bias: 260 | f_ob.write(ref_name + '\t' + str(segment[0]) + '\t' + str(segment[1]) + '\tlen:' + str(segment[1]-segment[0]) + ',max:' + str(round(segment[2],2)) + ',avg:' + str(round(segment[3],2)) + '\n') 261 | if f_os: 262 | for segment in list_suspicious: 263 | f_os.write(ref_name + '\t' + str(segment[0]) + '\t' + str(segment[1]) + '\tlen:' + str(segment[1]-segment[0]) + ',max:' + str(round(segment[2],2)) + ',avg:' + str(round(segment[3],2)) + '\n') 264 | for segment in sorted(list_bias, key=lambda ele: (ele[1]-ele[0])*ele[2]*ele[3], reverse=True)[:5]: 265 | print(ref_name + ' ' + str(segment[0]) + ' ' + str(segment[1]) + ' len:' + str(segment[1]-segment[0]) + ',max:' + str(round(segment[2],2)) + ',avg:' + str(round(segment[3],2))) 266 | pass 267 | return list_bias, list_suspicious 268 | 269 | 270 | def calculate_3D_score( 271 | dict_3D_measures :dict, 272 | fn_out_report :str, 273 | list_statistics :list 274 | ) -> tuple: 275 | """ 276 | Take in the 3D measures and output the 3D score 277 | """ 278 | avg_RD, std_RD, avg_VD, std_VD, avg_ND, std_ND = list_statistics 279 | 280 | f_ob = open(fn_out_report + '.bias.bed', 'w') 281 | f_os = open(fn_out_report + '.suspicious.bed', 'w') 282 | f_ob.write('#chrom\tchromStart\tchromEnd\tname\n') 283 | f_os.write('#chrom\tchromStart\tchromEnd\tname\n') 284 | 285 | link_dist = 1000 286 | for ref_name, dict_region_begin in dict_3D_measures.items(): 287 | old_region_begin = -link_dist 288 | old_array = np.array([]) 289 | for region_begin, array_info in sorted(dict_region_begin.items()): 290 | array_read_depth, array_var_density, array_dip_density = array_info 291 | #print(region_begin, region_begin+len(array_info[0])) 292 | 293 | #array_score_product = np.round(array_read_depth/avg_RD) * (array_var_density/avg_VD+0.1) * (array_dip_density/avg_ND+0.1) 294 | #array_score_product = np.where(array_score_product > 30, 30, array_score_product) 295 | """ 296 | array_score_sum = np.round(array_read_depth/avg_RD) + (array_var_density/avg_VD) + (array_dip_density/avg_ND) 297 | array_score_sum = np.where(array_read_depth > avg_RD/2, array_score_sum, 0) 298 | """ 299 | array_Z_score_RD = (array_read_depth-avg_RD)/std_RD - 1 300 | array_Z_score_RD = np.where(array_Z_score_RD > 0, array_Z_score_RD, 0) 301 | array_Z_score_VD = (array_var_density-avg_VD)/std_VD 302 | array_Z_score_VD = np.where(array_Z_score_VD > 0, array_Z_score_VD, 0) 303 | array_Z_score_ND = 
(array_dip_density-avg_ND)/std_ND 304 | array_Z_score_ND = np.where(array_Z_score_ND > 0, array_Z_score_ND, 0) 305 | array_score_sum = array_Z_score_RD + array_Z_score_VD + array_Z_score_ND 306 | array_score_product = array_Z_score_RD * (array_Z_score_VD + array_Z_score_ND) 307 | #array_score_sum = (array_read_depth-avg_RD)/std_RD 308 | #array_score_sum = (array_var_density-avg_VD)/std_VD 309 | #array_score_sum = (array_dip_density-avg_ND)/std_ND 310 | #array_score_sum = array_read_depth/avg_RD 311 | #array_score_sum = array_var_density/avg_VD 312 | 313 | #array_score_sum = np.where(array_score_sum > 0, array_score_sum, 0) 314 | #array_score_sum = np.where(array_score_sum > 30, 30, array_score_sum) 315 | #link_bias_region_and_report(array_score_sum, region_begin, ref_name, f_ob, f_os) 316 | #link_bias_region_and_report(array_score_product, region_begin, ref_name, f_ob, f_os,20,30,1000) 317 | #link_bias_region_and_report(array_score_sum, region_begin, ref_name, f_ob, f_os,3,5,link_dist) 318 | #print(old_region_begin, old_region_begin+len(old_array), region_begin) 319 | dict_3D_measures[ref_name][region_begin].append(array_score_sum) 320 | if old_region_begin + len(old_array) + link_dist > region_begin: 321 | assert(old_region_begin + len(old_array) < region_begin) 322 | # Connect 323 | diff = region_begin - old_region_begin - len(old_array) 324 | old_array = np.concatenate((old_array, np.zeros(diff), array_score_sum)) 325 | else: 326 | if old_region_begin != -1000: 327 | link_bias_region_and_report(old_array, old_region_begin, ref_name, f_ob, f_os,3,5,link_dist) 328 | #dict_3D_measures[ref_name][old_region_begin].append(old_array) 329 | old_region_begin = region_begin 330 | old_array = array_score_sum 331 | link_bias_region_and_report(old_array, old_region_begin, ref_name, f_ob, f_os,3,5,link_dist) 332 | f_ob.close() 333 | f_os.close() 334 | 335 | # report the region with low Read depth 336 | f_or = open(fn_out_report + '.lowRd.bed', 'w') 337 | f_or.write('#chrom\tchromStart\tchromEnd\tname\n') 338 | rd_thresh = min(int(avg_RD/5),10) 339 | for ref_name, dict_region_begin in dict_3D_measures.items(): 340 | global_start = [] 341 | global_stop = [] 342 | for region_begin, array_info in sorted(dict_region_begin.items()): 343 | array_read_depth, *_ = array_info 344 | 345 | bool_low = array_read_depth < rd_thresh 346 | #print(bool_low) 347 | bool_low_shift = np.concatenate(([False], bool_low))[:-1] 348 | bool_start = bool_low > bool_low_shift 349 | bool_stop = bool_low < bool_low_shift 350 | 351 | list_start = [idx+region_begin for idx, x in enumerate(bool_start) if x] 352 | list_stop = [idx+region_begin for idx, x in enumerate(bool_stop ) if x] 353 | 354 | if len(list_start) == len(list_stop): 355 | list_start.append(region_begin + len(array_read_depth)) 356 | if global_start == []: 357 | global_start = list_start 358 | global_stop = list_stop 359 | else: 360 | if list_start[0] == region_begin: 361 | global_start += list_start[1:] 362 | global_stop += list_stop 363 | else: 364 | global_stop += [region_begin-1] 365 | global_start += list_start 366 | global_stop += list_stop 367 | global_start = global_start[:-1] 368 | assert(len(global_start) == len(global_stop)) 369 | for idx in range(len(global_start)): 370 | st = global_start[idx] 371 | ed = global_stop[idx] 372 | f_or.write(ref_name + '\t' + str(st) + '\t' + str(ed) + '\tlen:' + str(ed-st) + '\n') 373 | f_or.close() 374 | 375 | 376 | def get_baseline( 377 | fn_baseline :str 378 | ) -> list: 379 | """ 380 | Take and parse the last line of 
fn_baseline 381 | """ 382 | f = open(fn_baseline, 'r') 383 | for line in f: 384 | pass 385 | f.close() 386 | _, avg_RD, std_RD, avg_VD, std_VD, avg_ND, std_ND = line.split() 387 | return [float(avg_RD), float(std_RD), float(avg_VD), float(std_VD), float(avg_ND), float(std_ND)] 388 | 389 | 390 | def calculate_avg( 391 | dict_3D_measures :dict, 392 | ): 393 | total_read_depth = np.array([]) 394 | total_var_density = np.array([]) 395 | total_dip_density = np.array([]) 396 | for ref_name, dict_array in dict_3D_measures.items(): 397 | for start_pos, array_info in dict_array.items(): 398 | array_read_depth, array_var_density, array_dip_density = array_info 399 | positive_var = array_var_density[array_var_density != 0] 400 | positive_dip = array_dip_density[array_var_density != 0] 401 | 402 | total_read_depth = np.concatenate((total_read_depth , array_read_depth)) 403 | total_var_density = np.concatenate((total_var_density, positive_var)) 404 | total_dip_density = np.concatenate((total_dip_density, positive_dip)) 405 | return [np.mean(total_read_depth), np.std(total_read_depth), np.mean(total_var_density), \ 406 | np.std(total_var_density), np.mean(total_dip_density), np.std(total_dip_density)] 407 | 408 | 409 | 410 | 411 | 412 | if __name__ == "__main__": 413 | parser = argparse.ArgumentParser() 414 | parser.add_argument('-g', '--gvcf_file', help='the gvcf file of a specific region') 415 | parser.add_argument('-w', '--window_size', help='window size for average depth, density analysis', type=int, default=400) 416 | parser.add_argument('-rd', '--read_depth', help='the average sequence read depth') 417 | parser.add_argument('-b', '--baseline', help='the baseline report generate by sample_baseline.py') 418 | parser.add_argument('-s', '--sample', action='store_true', help='sample for the baseline') 419 | parser.add_argument('-o', '--out_report', help='scanning bed file and reports') 420 | parser.add_argument('-wig', '--out_wig', help='flag for wig output', action='store_true') 421 | args = parser.parse_args() 422 | 423 | fn_gvcf = args.gvcf_file 424 | rd_thresh = args.read_depth 425 | window_size = args.window_size 426 | fn_baseline = args.baseline 427 | flag_sample = args.sample 428 | fn_out_report = args.out_report 429 | flag_wig = args.out_wig 430 | 431 | f_gvcf = pysam.VariantFile(fn_gvcf) 432 | # load or calculate the 3D measures depending on pickle file existance 433 | if os.path.exists(fn_gvcf + '.pickle'): 434 | print("Pickle file", fn_gvcf + '.pickle', 'exist, load it instead of recalculate...') 435 | f_i = open(fn_gvcf + '.pickle', 'rb') 436 | dict_3D_measures = pickle.load(f_i) 437 | f_i.close() 438 | else: 439 | print("Process the mpileup file", fn_gvcf + '...') 440 | dict_ref_info = scanning_bias(f_gvcf=f_gvcf) 441 | dict_3D_measures = calculate_measures( 442 | dict_ref_info=dict_ref_info, 443 | window_size=window_size 444 | ) 445 | print("Store the measures information as", fn_gvcf + '.pickle...') 446 | f_o = open(fn_gvcf + '.pickle', 'wb') 447 | pickle.dump(dict_3D_measures, f_o) 448 | f_o.close() 449 | 450 | # Load or calculate the baseline of the measures 451 | if fn_baseline: 452 | # avg_RD, std_RD, avg_VD, std_VD, avg_ND, std_ND 453 | list_statistics = get_baseline(fn_baseline) 454 | elif flag_sample: 455 | list_statistics = calculate_avg(dict_3D_measures) 456 | else: 457 | list_statistics = [30, 10, 0.7, 1.6, 0.3, 1.2] 458 | if rd_thresh: 459 | list_statistics[0] = rd_thresh 460 | 461 | 462 | print("Calculate 3D scoring and output bed...") 463 | 
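# Illustrative arithmetic with hypothetical numbers: using the default baseline
# list_statistics = [30, 10, 0.7, 1.6, 0.3, 1.2], a window whose average read
# depth is 75, variant density 4, and non-diploid density 2 scores
#     z_RD = (75 - 30)/10 - 1 = 3.5
#     z_VD = (4 - 0.7)/1.6  ~= 2.06
#     z_ND = (2 - 0.3)/1.2  ~= 1.42
# for a score sum of ~6.98, which clears both default thresholds (3 and 5)
# passed to link_bias_region_and_report, so such a window would fall inside a
# reported bias region.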
calculate_3D_score(dict_3D_measures, fn_out_report, list_statistics) 464 | 465 | if flag_wig: # output wig files if -ow option 466 | print("Output wig format...") 467 | report_wig( 468 | fn_output=fn_out_report, 469 | dict_3D_measures=dict_3D_measures 470 | ) 471 | -------------------------------------------------------------------------------- /biastools/ref_bi_naive.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import re 3 | import pickle 4 | import os.path 5 | from os import path 6 | import pysam 7 | import numpy as np 8 | from scipy.stats import chisquare 9 | from typing import List, Tuple, Dict, Union 10 | 11 | 12 | def chi_square_test(var_start: int, list_pos_start: List[int]) -> float: 13 | if len(list_pos_start) < 2: 14 | return 0 15 | bucket_num = 5 16 | bucket_len = int(100 / bucket_num) 17 | list_count = np.zeros(bucket_num) 18 | input_idx = np.minimum((var_start - np.array(list_pos_start)) // bucket_len, bucket_num - 1) 19 | try: 20 | np.add.at(list_count, input_idx, 1) 21 | except IndexError: 22 | print(var_start, list_pos_start) 23 | _, p_value = chisquare(list_count) 24 | return 0 if np.isnan(p_value) else p_value 25 | 26 | 27 | def get_division(num_1, num_2): 28 | if num_2 == 0: 29 | return 'nan' 30 | #return format(num_1 / (num_2+0.000001), '.4f') 31 | else: 32 | return format(num_1 / num_2, '.4f') 33 | 34 | 35 | def output_report( 36 | f_vcf :pysam.VariantFile, 37 | dict_ref_bias :dict, 38 | dict_set_conflict_vars :dict, 39 | flag_real :bool, 40 | fn_golden :str, 41 | fn_output :str 42 | ) -> None: 43 | """ 44 | Output the reference bias report to three different files: 45 | - f_all: containing all the variants 46 | - f_gap: contains only insertions and deletions 47 | - f_SNP: contains only SNPs 48 | """ 49 | if flag_real != True: 50 | with open(fn_golden, "rb") as f: 51 | dict_ref_var_name = pickle.load(f) 52 | 53 | f_all = open(fn_output, 'w') 54 | f_gap = open(fn_output + '.gap', 'w') 55 | f_SNP = open(fn_output + '.SNP', 'w') 56 | if flag_real: 57 | f_all.write("CHR\tHET_SITE\tNUM_READS\tAVG_MAPQ\tEVEN_P_VALUE\tBALANCE\tREF\tALT\tBOTH\tOTHER\tGAP\n") 58 | f_gap.write("CHR\tHET_SITE\tNUM_READS\tAVG_MAPQ\tEVEN_P_VALUE\tBALANCE\tREF\tALT\tBOTH\tOTHER\n") 59 | f_SNP.write("CHR\tHET_SITE\tNUM_READS\tAVG_MAPQ\tEVEN_P_VALUE\tBALANCE\tREF\tALT\tBOTH\tOTHER\n") 60 | else: 61 | f_all.write("CHR\tHET_SITE\tNUM_READS\tAVG_MAPQ\tEVEN_P_VALUE\tBALANCE\tREF\tALT\tBOTH\tOTHER\tMAP_BALANCE\tMAP_REF\tMAP_ALT\tMIS_MAP\tSIM_BALANCE\tSIM_REF\tSIM_ALT\tGAP\n") 62 | f_gap.write("CHR\tHET_SITE\tNUM_READS\tAVG_MAPQ\tEVEN_P_VALUE\tBALANCE\tREF\tALT\tBOTH\tOTHER\tMAP_BALANCE\tMAP_REF\tMAP_ALT\tMIS_MAP\tSIM_BALANCE\tSIM_REF\tSIM_ALT\n") 63 | f_SNP.write("CHR\tHET_SITE\tNUM_READS\tAVG_MAPQ\tEVEN_P_VALUE\tBALANCE\tREF\tALT\tBOTH\tOTHER\tMAP_BALANCE\tMAP_REF\tMAP_ALT\tMIS_MAP\tSIM_BALANCE\tSIM_REF\tSIM_ALT\n") 64 | for var in f_vcf: 65 | ref_name = var.contig 66 | hap = var.samples[0]['GT'] 67 | # Filtering all the homozygous alleles or the alleles without reference 68 | if (hap[0] != 0 and hap[1] != 0) or (hap[0] == 0 and hap[1] == 0): 69 | continue 70 | if hap[0] == 0: 71 | idx_ref, idx_alt = 0, 1 72 | else: 73 | idx_ref, idx_alt = 1, 0 74 | # Filtering the conflict vars 75 | if var.start in dict_set_conflict_vars[ref_name]: 76 | continue 77 | n_read = dict_ref_bias[ref_name][var.start]['n_read'] 78 | n_var = dict_ref_bias[ref_name][var.start]['n_var'] 79 | map_q = dict_ref_bias[ref_name][var.start]['map_q'] 80 | #p_value = 
interval_variance(var.start, dict_ref_bias[ref_name][var.start]['distribute']) 81 | p_value = chi_square_test(var.start, dict_ref_bias[ref_name][var.start]['distribute'][idx_alt]) 82 | p_value = min(p_value, chi_square_test(var.start, dict_ref_bias[ref_name][var.start]['distribute'][idx_ref])) 83 | 84 | output_string = (ref_name + '\t' + str(var.start+1) + '\t') 85 | output_string += (str(sum(n_read)) + "\t" + get_division(sum(map_q[:2]), sum(n_read[:2])) + "\t" + format(p_value, '.4f') + '\t') 86 | # n_var[0,1,2,3] = hap0, hap1, both, others 87 | output_string += get_division(n_var[idx_ref]+n_var[2]*0.5, sum(n_var[:3])) + "\t" + str(n_var[idx_ref]) + "\t" + str(n_var[idx_alt]) + "\t" + str(n_var[2]) + "\t" + str(n_var[3]) 88 | #output_string += get_division(n_var[idx_ref], sum(n_var[:2])) + "\t" + str(n_var[idx_ref]) + "\t" + str(n_var[idx_alt]) + "\t" + str(n_var[2]) + "\t" + str(n_var[3]) 89 | if flag_real != True: # Golden Information 90 | # mapping balance information 91 | output_string += "\t" + get_division(n_read[idx_ref], sum(n_read[:2])) + '\t' + str(n_read[idx_ref]) + '\t' + str(n_read[idx_alt]) + '\t' + str(n_read[2]) 92 | read_info = dict_ref_var_name[ref_name][var.start] 93 | # simulation balance information 94 | output_string += '\t' + get_division(read_info[idx_ref+2], sum(read_info[2:4])) + '\t' + str(read_info[idx_ref+2]) + '\t' + str(read_info[idx_alt+2]) 95 | 96 | if len(var.ref) == len(var.alts[ hap[idx_alt] - 1]): # length of ref is equal to length of 97 | f_all.write(output_string + '\t' + '\n') 98 | f_SNP.write(output_string + '\n') 99 | else: 100 | f_all.write(output_string + '\t' + '.\n') 101 | f_gap.write(output_string + '\n') 102 | 103 | f_all.close() 104 | f_gap.close() 105 | f_SNP.close() 106 | 107 | 108 | def hap_inside( 109 | seq_read :str, 110 | seq_hap :str, 111 | padding :int 112 | ) -> bool: 113 | """ 114 | Finding if the haplotype is in the read 115 | Also considering the boundary condition 116 | One padding side can be omitted 117 | """ 118 | if seq_hap in seq_read: 119 | return True 120 | else: 121 | len_hap = len(seq_hap) 122 | for idx in range(1,padding): 123 | # checking read left side 124 | if seq_hap[idx:] == seq_read[:len_hap - idx]: 125 | return True 126 | # checking read right side 127 | if seq_hap[:-idx] == seq_read[idx - len_hap:]: 128 | return True 129 | return False 130 | 131 | 132 | def return_locate_cigar( 133 | read_start :int, 134 | target_pos :int, 135 | cigar_tuples:tuple 136 | ) -> int: 137 | """ 138 | return the cigar value of a location 139 | according to the CIGAR string 140 | """ 141 | ref_curser = read_start -1 142 | read_curser = 0 143 | for pair_info in cigar_tuples: 144 | code, runs = pair_info 145 | if code == 0 or code == 7 or code == 8: # M or = or X 146 | ref_curser += runs 147 | if ref_curser > target_pos: 148 | return 0 149 | else: 150 | read_curser += runs 151 | elif code == 1: # I 152 | ref_curser += 1 153 | if ref_curser > target_pos: 154 | return -runs 155 | else: 156 | read_curser += runs 157 | elif code == 2: # D 158 | ref_curser += runs 159 | if ref_curser > target_pos: 160 | return runs 161 | else: 162 | read_curser += 1 163 | elif code == 4 or code == 5: # S or H, pysam already parsed 164 | pass 165 | else: 166 | print ("ERROR: unexpected cigar code in sequence") 167 | return 0 168 | 169 | 170 | def locate_by_cigar( 171 | read_start :int, 172 | target_pos :int, 173 | cigar_tuples:tuple 174 | ) -> int: 175 | """ 176 | return the location of a specific reference position in the read 177 | according to the CIGAR 
string 178 | """ 179 | ref_curser = read_start 180 | read_curser = 0 181 | for pair_info in cigar_tuples: 182 | code, runs = pair_info 183 | if code == 0 or code == 7 or code == 8: # M or = or X 184 | ref_curser += runs 185 | if ref_curser > target_pos: 186 | return read_curser + (runs - ref_curser + target_pos) 187 | else: 188 | read_curser += runs 189 | elif code == 1: # I 190 | #ref_curser += 1 191 | if ref_curser > target_pos: 192 | return read_curser 193 | else: 194 | read_curser += runs 195 | elif code == 2: # D 196 | ref_curser += runs 197 | if ref_curser > target_pos: 198 | return read_curser 199 | #else: 200 | # read_curser += 1 201 | elif code == 4 or code == 5: # S or H, pysam already parsed 202 | pass 203 | else: 204 | print ("ERROR: unexpected cigar code in sequence") 205 | return read_curser 206 | 207 | 208 | def match_to_hap( 209 | seq_name :str, # for debug 210 | read_start :int, 211 | read_end :int, 212 | var_start :int, 213 | seq_read :str, 214 | seq_hap :str, 215 | cigar_tuples:tuple, 216 | padding :int, 217 | l_min_req :int, 218 | r_min_req :int, 219 | start_flag :bool=True 220 | ) -> int: 221 | """ 222 | 1. Find the matching point of the variant on the read 223 | 2. Extend the padding on the read 224 | 3. compare the read to haplotype sequences 225 | """ 226 | if read_start > var_start: # Not cover 227 | return -1 228 | elif read_end < var_start: # Not cover 229 | return -1 230 | 231 | # locating the variant site on the read 232 | r_start = locate_by_cigar( 233 | read_start=read_start, 234 | target_pos=var_start, 235 | cigar_tuples=cigar_tuples 236 | ) 237 | 238 | # Matching 239 | if start_flag: # From var.start 240 | l_bound = r_start - padding 241 | r_bound = l_bound + len(seq_hap) 242 | else: # From var.stop 243 | r_bound = r_start + padding 244 | l_bound = r_bound - len(seq_hap) 245 | 246 | min_match = 0 # minimum match length 247 | if l_bound < 0: 248 | seq_hap = seq_hap[-l_bound:] 249 | l_bound = 0 250 | min_match = r_min_req # minimum len to cover variant 251 | if r_bound > len(seq_read): 252 | seq_hap = seq_hap[:len(seq_read)-r_bound] 253 | r_bound = len(seq_read) 254 | if min_match != 0: 255 | print("WARNING! 
Both l_bound and r_bound exceed the read!!") 256 | min_match = l_min_req # minimum len to cover variant 257 | if r_bound - l_bound < min_match: 258 | return -1 # Not cover 259 | if seq_read[l_bound:r_bound].upper() == seq_hap.upper(): 260 | return 1 # Match 261 | else: 262 | return 0 # Not match 263 | 264 | 265 | def compare_sam_to_haps( 266 | f_vcf :pysam.VariantFile, 267 | f_sam :pysam.AlignmentFile, 268 | dict_ref_alts :dict, 269 | dict_set_conflict_vars: dict, 270 | flag_real :bool, 271 | fn_golden :str, 272 | run_id :str 273 | ) -> dict: 274 | """ 275 | Input: f_sam file 276 | Output: ref bias dictionary according to variants 277 | """ 278 | if flag_real != True: 279 | with open(fn_golden, "rb") as f: 280 | dict_ref_var_name = pickle.load(f) 281 | 282 | # build up the ref bias dictionary 283 | dict_ref_var_bias = {} 284 | for ref_name in dict_ref_alts.keys(): 285 | dict_ref_var_bias[ref_name] = {} 286 | for start_pos in dict_ref_alts[ref_name]: 287 | # n_var has hap0, hap1, both, and others 288 | dict_ref_var_bias[ref_name][start_pos] = {'n_read':[0,0,0], 'n_var':[0,0,0,0], 'map_q':[0,0,0], 'distribute':[[],[],[],[]]} 289 | 290 | # parameters for pipeline design 291 | count_others = [0,0] 292 | count_both = [0,0] 293 | count_error = [0,0] 294 | count_correct = [0,0] 295 | 296 | # scanning all the read alignments 297 | dict_errors = {} 298 | for segment in f_sam: 299 | flag = segment.flag 300 | if (flag & 4): # bitwise AND 4, segment unmapped 301 | continue 302 | # aligned read information 303 | ref_name = segment.reference_name 304 | seq_name = segment.query_name 305 | flag_read_n = segment.is_read2 306 | pos_start = segment.reference_start # start position in genome coordiante, need +1 for vcf coordinate 307 | pos_end = segment.reference_end 308 | cigar_tuples = segment.cigartuples 309 | mapq = segment.mapping_quality 310 | rg_tag = segment.get_tag("RG") 311 | read_seq = segment.query_alignment_sequence # aligned sequence without SoftClip part 312 | 313 | #chr_tag, hap_tag = rg_tag.split('_') 314 | if '_' in rg_tag: 315 | chr_tag, hap_tag = rg_tag.split('_') 316 | else: 317 | chr_tag = None 318 | hap_tag = rg_tag 319 | related_vars = list(f_vcf.fetch(ref_name, pos_start, pos_end)) # list of pysam.variant 320 | #fetching the sequence in the read_seq regarding to the variant 321 | for var in related_vars: 322 | if var.start in dict_set_conflict_vars[ref_name]: # neglecting the conflict variant sites 323 | continue 324 | seq_hap0, seq_hap1, diff_hap0, diff_hap1 = dict_ref_alts[ref_name][var.start] 325 | if seq_hap0 == seq_hap1: 326 | continue 327 | 328 | if diff_hap0 !=0: # if hap0 is a gap: 329 | diff_read = return_locate_cigar( 330 | read_start=pos_start, 331 | target_pos=var.start, 332 | cigar_tuples=cigar_tuples 333 | ) 334 | if diff_read == diff_hap0: 335 | match_flag_0 = 1 336 | match_flag_1 = 0 337 | else: 338 | match_flag_0 = 0 339 | match_flag_1 = match_to_hap(seq_name, pos_start, pos_end, var.start, read_seq, seq_hap1, cigar_tuples, 0, 1, 1, True) 340 | elif diff_hap1 !=0: # if hap1 is a gap: 341 | diff_read = return_locate_cigar( 342 | read_start=pos_start, 343 | target_pos=var.start, 344 | cigar_tuples=cigar_tuples 345 | ) 346 | if diff_read == diff_hap1: 347 | match_flag_0 = 0 348 | match_flag_1 = 1 349 | else: 350 | match_flag_0 = match_to_hap(seq_name, pos_start, pos_end, var.start, read_seq, seq_hap0, cigar_tuples, 0, 1, 1, True) 351 | match_flag_1 = 0 352 | else: 353 | match_flag_0 = match_to_hap(seq_name, pos_start, pos_end, var.start, read_seq, seq_hap0, cigar_tuples, 
0, 1, 1, True) 354 | match_flag_1 = match_to_hap(seq_name, pos_start, pos_end, var.start, read_seq, seq_hap1, cigar_tuples, 0, 1, 1, True) 355 | 356 | if match_flag_0 == 1 and match_flag_1 == 1: 357 | print("Both Trouble!", seq_name, var.start, seq_hap0, seq_hap1) 358 | 359 | # 5. Assign Values 360 | if match_flag_0 == -1 and match_flag_1 == -1: 361 | continue 362 | if match_flag_0 == 1 and match_flag_1 == 1: 363 | dict_ref_var_bias[ref_name][var.start]['n_var'][2] += 1 364 | elif match_flag_0 == 1: 365 | dict_ref_var_bias[ref_name][var.start]['n_var'][0] += 1 366 | # record the starting position of each read cover the variant 367 | dict_ref_var_bias[ref_name][var.start]['distribute'][0].append(pos_start) 368 | dict_ref_var_bias[ref_name][var.start]['distribute'][2].append(pos_end) 369 | elif match_flag_1 == 1: 370 | dict_ref_var_bias[ref_name][var.start]['n_var'][1] += 1 371 | # record the starting position of each read cover the variant 372 | dict_ref_var_bias[ref_name][var.start]['distribute'][1].append(pos_start) 373 | dict_ref_var_bias[ref_name][var.start]['distribute'][3].append(pos_end) 374 | else: 375 | dict_ref_var_bias[ref_name][var.start]['n_var'][3] += 1 376 | 377 | # standard updating of read number and mapping quality 378 | if flag_real: # no golden information 379 | dict_ref_var_bias[ref_name][var.start]['n_read'][0] += 1 380 | dict_ref_var_bias[ref_name][var.start]['map_q'][0] += mapq 381 | else: 382 | if run_id != None and run_id != chr_tag: # not the same chromosome 383 | dict_ref_var_bias[ref_name][var.start]['n_read'][2] += 1 384 | dict_ref_var_bias[ref_name][var.start]['map_q'][2] += 1 385 | elif dict_ref_var_name[ref_name].get(var.start) == None: 386 | continue 387 | elif 'hapA' == hap_tag: # hapA 388 | #if seq_name in dict_ref_var_name[ref_name][var.start][0]: # check if the read name is in the golden set 389 | if (seq_name, flag_read_n) in dict_ref_var_name[ref_name][var.start][0]: # check if the read name is in the golden set 390 | dict_ref_var_bias[ref_name][var.start]['n_read'][0] += 1 391 | dict_ref_var_bias[ref_name][var.start]['map_q'][0] += mapq 392 | else: 393 | dict_ref_var_bias[ref_name][var.start]['n_read'][2] += 1 394 | dict_ref_var_bias[ref_name][var.start]['map_q'][2] += 1 395 | elif 'hapB' == hap_tag: # hapB 396 | #if seq_name in dict_ref_var_name[ref_name][var.start][1]: # check if the read name is in the golden set 397 | if (seq_name, flag_read_n) in dict_ref_var_name[ref_name][var.start][1]: # check if the read name is in the golden set 398 | dict_ref_var_bias[ref_name][var.start]['n_read'][1] += 1 399 | dict_ref_var_bias[ref_name][var.start]['map_q'][1] += mapq 400 | else: 401 | dict_ref_var_bias[ref_name][var.start]['n_read'][2] += 1 402 | dict_ref_var_bias[ref_name][var.start]['map_q'][2] += 1 403 | else: 404 | print("WARNING, there is a read without haplotype information!!") 405 | 406 | return dict_ref_var_bias 407 | 408 | 409 | def len_var_seq( 410 | var :pysam.VariantRecord, 411 | genotype:int 412 | )-> tuple : 413 | """ 414 | Switch the ref sequence according to the haplotype information 415 | """ 416 | if genotype == 0: 417 | return 0, var.ref 418 | else: 419 | alt = var.alts[genotype - 1] 420 | return len(var.ref) - len(alt), alt 421 | 422 | 423 | def variant_seq( 424 | f_vcf :pysam.VariantFile, 425 | f_fasta :pysam.FastaFile 426 | )-> tuple: # dict_set_conflict_vars, dict_var_haps, dict_cohort 427 | """ 428 | Output 429 | - dict_set_conflict_vars: the dictionary marking the overlaping variants 430 | - dict_ref_alts: 431 | in each contig: 
432 | - key: var.start 433 | - values: [varseq_hap0, varseq_hap1] 434 | # not only store the varseq but also indicating the variant length 435 | """ 436 | dict_ref_alts = {} 437 | dict_set_conflict_vars = {} 438 | for ref_name in f_fasta.references: 439 | dict_ref_alts[ref_name] = {} 440 | dict_set_conflict_vars[ref_name] = set() 441 | 442 | old_ref_name = "" 443 | for var in f_vcf: 444 | ref_name = var.contig 445 | if old_ref_name != ref_name: # changing the contig 446 | # Reset the parameters 447 | overlap0, overlap1 = 0, 0 448 | prev_start0, prev_start1 = -1, -1 449 | old_ref_name = ref_name 450 | 451 | hap_0, hap_1 = var.samples[0]['GT'] 452 | diff_hap0, var_seq0 = len_var_seq(var, hap_0) 453 | diff_hap1, var_seq1 = len_var_seq(var, hap_1) 454 | if var.start > prev_start0 + overlap0 and var.start > prev_start1 + overlap1: # checking if there are overlaps 455 | dict_ref_alts[ref_name][var.start] = [var_seq0, var_seq1, diff_hap0, diff_hap1] 456 | # hap0 457 | prev_start0 = var.start 458 | overlap0 = len(var_seq0) - 1 if (diff_hap0 == 0) else diff_hap0 459 | # hap1 460 | prev_start1 = var.start 461 | overlap1 = len(var_seq1) - 1 if (diff_hap1 == 0) else diff_hap1 462 | else: # overlapping variants are consider conflicts 463 | dict_set_conflict_vars[ref_name].add(prev_start1) 464 | dict_set_conflict_vars[ref_name].add(var.start) 465 | return dict_set_conflict_vars, dict_ref_alts 466 | 467 | 468 | 469 | 470 | 471 | if __name__ == "__main__": 472 | parser = argparse.ArgumentParser() 473 | parser.add_argument('-v', '--vcf', help='vcf file') 474 | parser.add_argument('-s', '--sam', help='sam file') 475 | parser.add_argument('-f', '--fasta', help='reference fasta file') 476 | parser.add_argument('-r', '--real_data', help='turn off hap_information warning for real data', action='store_true') 477 | parser.add_argument('-p', '--golden_pickle', help='the pickle file contain the golden information for report reference') 478 | parser.add_argument('-i', '--run_id', help='the tag for run_id, can be used to indicate for example chromosome number') 479 | parser.add_argument('-t', '--thread', help='Number of threads, not supported', type=int, default=8) 480 | parser.add_argument('-o', '--out', help='output file') 481 | args = parser.parse_args() 482 | 483 | fn_vcf = args.vcf 484 | fn_sam = args.sam 485 | fn_fasta = args.fasta 486 | flag_real = args.real_data 487 | fn_golden = args.golden_pickle 488 | fn_output = args.out 489 | run_id = args.run_id 490 | 491 | f_vcf = pysam.VariantFile(fn_vcf) 492 | f_sam = pysam.AlignmentFile(fn_sam) 493 | f_fasta = pysam.FastaFile(fn_fasta) 494 | var_chain = 25 495 | print("Start building the variant maps...") 496 | dict_set_conflict_vars, dict_ref_alts = variant_seq( 497 | f_vcf=f_vcf, 498 | f_fasta=f_fasta 499 | ) 500 | # extend conflict set 501 | for ref_name in dict_set_conflict_vars.keys(): 502 | for pos in list(dict_set_conflict_vars[ref_name]): 503 | for extend in range(pos-var_chain, pos+var_chain): 504 | dict_set_conflict_vars[ref_name].add(extend) 505 | 506 | print("Start comparing reads to the variant map...") 507 | dict_ref_bias = compare_sam_to_haps( 508 | f_vcf=f_vcf, 509 | f_sam=f_sam, 510 | dict_ref_alts=dict_ref_alts, 511 | dict_set_conflict_vars=dict_set_conflict_vars, 512 | flag_real=flag_real, 513 | fn_golden=fn_golden, 514 | run_id=run_id 515 | ) 516 | f_vcf = pysam.VariantFile(fn_vcf) 517 | print("Start output report...") 518 | output_report( 519 | f_vcf=f_vcf, 520 | dict_ref_bias=dict_ref_bias, 521 | dict_set_conflict_vars=dict_set_conflict_vars, 
522 | flag_real=flag_real, 523 | fn_golden=fn_golden, 524 | fn_output=fn_output 525 | ) 526 | 527 | 528 | -------------------------------------------------------------------------------- /biastools/consensus_vcf_map_adaptive.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | import pysam 4 | import numpy as np 5 | 6 | 7 | 8 | def len_var_seq( 9 | var :pysam.VariantRecord, 10 | genotype:int 11 | )-> tuple : 12 | """ 13 | Switch the ref sequence according to the haplotype information 14 | """ 15 | if genotype == 0: 16 | return 0, var.ref 17 | else: 18 | alt = var.alts[genotype - 1] 19 | return len(var.ref) - len(alt), alt 20 | 21 | 22 | def variant_seq( 23 | f_vcf :pysam.VariantFile, 24 | f_fasta :pysam.FastaFile 25 | )-> tuple: # dict_set_conflict_vars, dict_var_haps, dict_cohort 26 | """ 27 | Output 28 | - dict_set_conflict_vars: the dictionary marking the overlaping variants 29 | - dict_ref_alts: 30 | in each contig: 31 | - key: var.start 32 | - values: [varseq_hap0, varseq_hap1] 33 | # not only store the varseq but also indicating the variant length 34 | """ 35 | dict_ref_alts = {} 36 | dict_set_conflict_vars = {} 37 | for ref_name in f_fasta.references: 38 | dict_ref_alts[ref_name] = {} 39 | dict_set_conflict_vars[ref_name] = set() 40 | 41 | old_ref_name = "" 42 | for var in f_vcf: 43 | ref_name = var.contig 44 | if old_ref_name != ref_name: # changing the contig 45 | # Reset the parameters 46 | overlap0, overlap1 = 0, 0 47 | prev_start0, prev_start1 = -1, -1 48 | old_ref_name = ref_name 49 | 50 | hap_0, hap_1 = var.samples[0]['GT'] 51 | diff_hap0, var_seq0 = len_var_seq(var, hap_0) 52 | diff_hap1, var_seq1 = len_var_seq(var, hap_1) 53 | if var.start > prev_start0 + overlap0 and var.start > prev_start1 + overlap1: # checking if there are overlaps 54 | dict_ref_alts[ref_name][var.start] = [var_seq0, var_seq1, hap_0, hap_1] 55 | # hap0 56 | prev_start0 = var.start 57 | overlap0 = len(var_seq0) - 1 if (diff_hap0 == 0) else diff_hap0 58 | # hap1 59 | prev_start1 = var.start 60 | overlap1 = len(var_seq1) - 1 if (diff_hap1 == 0) else diff_hap1 61 | else: # overlapping variants are consider conflicts 62 | dict_set_conflict_vars[ref_name].add(prev_start1) 63 | dict_set_conflict_vars[ref_name].add(var.start) 64 | return dict_set_conflict_vars, dict_ref_alts 65 | 66 | 67 | def hap_seq( 68 | var :pysam.VariantRecord, 69 | genotype:int 70 | )-> str : 71 | """ 72 | return variant sequence according to haplotype information 73 | """ 74 | if genotype == 0: 75 | return var.ref 76 | else: 77 | return var.alts[genotype - 1] 78 | 79 | 80 | def left_right_check(seq_hap0, seq_hap1): 81 | """ 82 | Check the extension direction of the repetitiveness 83 | return: 84 | - 0: right side extension 85 | - 1: left side extension 86 | - 2: both sides are extensible 87 | """ 88 | assert(seq_hap0 != seq_hap1) 89 | assert((seq_hap0 in seq_hap1) or (seq_hap1 in seq_hap0)) 90 | len_0 = len(seq_hap0) 91 | len_1 = len(seq_hap1) 92 | if len_0 > len_1: 93 | if seq_hap0[:len_1] == seq_hap1: 94 | return 0 # right side repetitive 95 | elif seq_hap0[-len_1:] == seq_hap1: 96 | return 1 # left side repetitive 97 | else: 98 | if seq_hap1[:len_0] == seq_hap0: 99 | return 0 # right side repetitive 100 | elif seq_hap1[-len_0:] == seq_hap0: 101 | return 1 # left side repetitive 102 | return 2 # in the middle 103 | 104 | 105 | def extend_ref_seq( 106 | seq_hap0, 107 | seq_hap1, 108 | ref_extend_0, 109 | ref_extend_1, 110 | flag_right=True 111 | )-> tuple: 112 | 
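# Illustrative walk-through with hypothetical sequences: for seq_hap0 = 'A' and
# seq_hap1 = 'AT' inside a T-run where both consensus haplotypes continue with
# 'TTG...', extending to the right appends one base at a time:
#     'AT'   vs 'ATT'    -> one still contains the other, keep extending
#     'ATT'  vs 'ATTT'   -> still indistinguishable
#     'ATTG' vs 'ATTTG'  -> distinguishable, so ('ATTG', 'ATTTG', 3) is returned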
""" 113 | Extend the seq_hap0 and seq_hap1 till they makes a difference 114 | """ 115 | seq_hap0_extend = seq_hap0 116 | seq_hap1_extend = seq_hap1 117 | assert((seq_hap0_extend in seq_hap1_extend) or (seq_hap1_extend in seq_hap0_extend)) 118 | len_iterate = min(len(ref_extend_0), len(ref_extend_1)) 119 | if flag_right: # extend to the right 120 | for idx in range(len_iterate): 121 | seq_hap0_extend += ref_extend_0[idx] 122 | seq_hap1_extend += ref_extend_1[idx] 123 | if (seq_hap0_extend in seq_hap1_extend) or (seq_hap1_extend in seq_hap0_extend): # still indistinguishable 124 | continue 125 | else: 126 | return seq_hap0_extend, seq_hap1_extend, idx+1 127 | else: # extend to the left 128 | for idx in range(len_iterate): 129 | seq_hap0_extend = ref_extend_0[-idx-1] + seq_hap0_extend 130 | seq_hap1_extend = ref_extend_1[-idx-1] + seq_hap1_extend 131 | if (seq_hap0_extend in seq_hap1_extend) or (seq_hap1_extend in seq_hap0_extend): # still indistinguishable 132 | continue 133 | else: 134 | return seq_hap0_extend, seq_hap1_extend, idx+1 135 | return seq_hap0_extend, seq_hap1_extend, False 136 | 137 | 138 | def extend_ref_seq_padding( 139 | seq_hap0, 140 | seq_hap1, 141 | ref_extend_0, 142 | ref_extend_1, 143 | flag_right=True, 144 | padding=5 145 | ): 146 | """ 147 | Call the extend_ref_seq and add padding in the end 148 | """ 149 | if flag_right: 150 | seq_hap0_extend, seq_hap1_extend, len_extend = extend_ref_seq(seq_hap0, seq_hap1, ref_extend_0[:-padding], ref_extend_1[:-padding], flag_right) 151 | if len_extend: 152 | return seq_hap0_extend + ref_extend_0[len_extend:len_extend+padding], seq_hap1_extend + ref_extend_1[len_extend:len_extend+padding], len_extend+padding 153 | else: 154 | return seq_hap0, seq_hap1, False 155 | else: 156 | seq_hap0_extend, seq_hap1_extend, len_extend = extend_ref_seq(seq_hap0, seq_hap1, ref_extend_0[padding:], ref_extend_1[padding:], flag_right) 157 | if len_extend: 158 | return ref_extend_0[-len_extend-padding:-len_extend] + seq_hap0_extend, ref_extend_1[-len_extend-padding:-len_extend] + seq_hap1_extend, len_extend+padding 159 | else: 160 | return seq_hap0, seq_hap1, False 161 | 162 | 163 | 164 | def nearest_left_right_var( 165 | left_0, 166 | right_0, 167 | f_hap0_fasta, 168 | left_1, 169 | right_1, 170 | f_hap1_fasta, 171 | ref_name, 172 | left_extend=40, 173 | right_extend=40 174 | ) -> tuple: 175 | left_seq_0 = f_hap0_fasta.fetch(reference=ref_name, start=left_0 - left_extend, end=left_0) 176 | left_seq_1 = f_hap1_fasta.fetch(reference=ref_name, start=left_1 - left_extend, end=left_1) 177 | left_var = -1 178 | for idx in range(left_extend-1, 0, -1): 179 | if left_seq_0[idx] != left_seq_1[idx]: 180 | left_var = left_extend-idx 181 | break 182 | right_seq_0 = f_hap0_fasta.fetch(reference=ref_name, start=right_0, end=right_0 + right_extend) 183 | right_seq_1 = f_hap1_fasta.fetch(reference=ref_name, start=right_1, end=right_1 + right_extend) 184 | right_var = -1 185 | for idx in range(right_extend): 186 | if right_seq_0[idx] != right_seq_1[idx]: 187 | right_var = idx 188 | break 189 | return left_var, right_var 190 | 191 | 192 | 193 | def check_coordinate( 194 | dict_ref_alts :dict, 195 | f_hap0_fasta :pysam.FastaFile, 196 | f_hap1_fasta :pysam.FastaFile, 197 | dict_ref_consensus_map0: dict, 198 | dict_ref_consensus_map1: dict, 199 | dict_set_conflict_vars: dict, 200 | extend_limit :int=100, 201 | padding :int=5 202 | ) -> dict: 203 | """ 204 | Make sure the mapping point result in the same sequence as shown in the vcf file 205 | dict_effective_variant {} 206 
| - key: var_start (at reference coordinate) 207 | - values: [flag_side, len_extend] # len_extend can be either right or left 208 | """ 209 | dict_effective_variant = {} 210 | count_discrepency = 0 211 | for ref_name, dict_var_seq in dict_ref_alts.items(): 212 | set_conflict = dict_set_conflict_vars[ref_name] 213 | for var_start, pair_var_seq in dict_var_seq.items(): 214 | if var_start in set_conflict: 215 | continue 216 | 217 | seq_hap0 = pair_var_seq[0] 218 | seq_hap1 = pair_var_seq[1] 219 | pos_map0 = dict_ref_consensus_map0[ref_name][var_start] 220 | pos_map1 = dict_ref_consensus_map1[ref_name][var_start] 221 | long_hap0 = f_hap0_fasta.fetch(reference=ref_name, start=pos_map0-padding, end=pos_map0 + len(seq_hap0)+padding) 222 | long_hap1 = f_hap1_fasta.fetch(reference=ref_name, start=pos_map1-padding, end=pos_map1 + len(seq_hap1)+padding) 223 | 224 | if long_hap0 != long_hap1: 225 | if (long_hap0 in long_hap1) or (long_hap1 in long_hap0): 226 | flag_side = left_right_check(long_hap0, long_hap1) # check which side the repetitive be 227 | if flag_side == 0: # right side 228 | # get additional extend_limit (default 100) bp from the reference 229 | extend_hap0 = f_hap0_fasta.fetch(reference=ref_name, start=pos_map0+len(seq_hap0)+padding, end=pos_map0+len(seq_hap0)+extend_limit) 230 | extend_hap1 = f_hap1_fasta.fetch(reference=ref_name, start=pos_map1+len(seq_hap1)+padding, end=pos_map1+len(seq_hap1)+extend_limit) 231 | effect_hap0, effect_hap1, len_extend = extend_ref_seq_padding(long_hap0, long_hap1, extend_hap0, extend_hap1, True, padding) 232 | if len_extend: 233 | left_var, right_var = nearest_left_right_var(pos_map0, pos_map0+len(seq_hap0), f_hap0_fasta, \ 234 | pos_map1, pos_map1+len(seq_hap1), f_hap1_fasta, ref_name, 40, 40+len_extend) 235 | dict_effective_variant[var_start] = (0, len_extend, left_var, right_var) 236 | else: 237 | print("--- 0 EFFECTIVE VARIANT too long at", var_start, seq_hap0, seq_hap1) 238 | elif flag_side == 1: # left side 239 | extend_hap0 = f_hap0_fasta.fetch(reference=ref_name, start=pos_map0-extend_limit-padding, end=pos_map0-padding) 240 | extend_hap1 = f_hap1_fasta.fetch(reference=ref_name, start=pos_map1-extend_limit-padding, end=pos_map1-padding) 241 | effect_hap0, effect_hap1, len_extend = extend_ref_seq_padding(long_hap0, long_hap1, extend_hap0, extend_hap1, False, padding) 242 | if len_extend: 243 | left_var, right_var = nearest_left_right_var(pos_map0, pos_map0+len(seq_hap0), f_hap0_fasta, \ 244 | pos_map1, pos_map1+len(seq_hap1), f_hap1_fasta, ref_name, 40+len_extend, 40) 245 | dict_effective_variant[var_start] = (1, len_extend, left_var, right_var) 246 | else: 247 | print("--- 1 EFFECTIVE VARIANT too long at", var_start, seq_hap0, seq_hap1) 248 | else: # both sides are extensible 249 | extend_hap0 = f_hap0_fasta.fetch(reference=ref_name, start=pos_map0+len(seq_hap0)+padding, end=pos_map0+len(seq_hap0)+extend_limit) 250 | extend_hap1 = f_hap1_fasta.fetch(reference=ref_name, start=pos_map1+len(seq_hap1)+padding, end=pos_map1+len(seq_hap1)+extend_limit) 251 | r_effect_hap0, r_effect_hap1, r_len_extend = extend_ref_seq_padding(long_hap0, long_hap1, extend_hap0, extend_hap1, True, padding) 252 | 253 | extend_hap0 = f_hap0_fasta.fetch(reference=ref_name, start=pos_map0-extend_limit-padding, end=pos_map0-padding) 254 | extend_hap1 = f_hap1_fasta.fetch(reference=ref_name, start=pos_map1-extend_limit-padding, end=pos_map1-padding) 255 | l_effect_hap0, l_effect_hap1, l_len_extend = extend_ref_seq_padding(long_hap0, long_hap1, extend_hap0, extend_hap1, False, 
padding) 256 | flag_extend = -1 257 | if l_len_extend == 0: # right anyway 258 | if r_len_extend == 0: 259 | print("--- 2 EFFECTIVE VARIANT ENCOUNTER at", var_start, seq_hap0, seq_hap1, "L", l_len_extend) 260 | else: 261 | flag_extend=0 262 | elif r_len_extend == 0: # left anyway 263 | flag_extend=1 264 | elif r_len_extend < l_len_extend: # right is better 265 | flag_extend=0 266 | else: # left is better 267 | flag_extend=1 268 | 269 | if flag_extend == 0: 270 | left_var, right_var = nearest_left_right_var(pos_map0, pos_map0+len(seq_hap0), f_hap0_fasta, \ 271 | pos_map1, pos_map1+len(seq_hap1), f_hap1_fasta, ref_name, 40, 40+r_len_extend) 272 | dict_effective_variant[var_start] = (0, r_len_extend, left_var, right_var) 273 | elif flag_extend == 1: 274 | left_var, right_var = nearest_left_right_var(pos_map0, pos_map0+len(seq_hap0), f_hap0_fasta, \ 275 | pos_map1, pos_map1+len(seq_hap1), f_hap1_fasta, ref_name, 40+l_len_extend, 40) 276 | dict_effective_variant[var_start] = (1, l_len_extend, left_var, right_var) 277 | 278 | fetch_hap_0 = long_hap0[padding:-padding] 279 | fetch_hap_1 = long_hap1[padding:-padding] 280 | if seq_hap0.upper() != fetch_hap_0.upper() and seq_hap0 != '*': 281 | print("Discrepency at", ref_name, str(var_start), str(pos_map0), "haplotype 0! Expect", seq_hap0, ", get", fetch_hap_0, "...") 282 | count_discrepency += 1 283 | if seq_hap1.upper() != fetch_hap_1.upper() and seq_hap1 != '*': 284 | print("Discrepency at", ref_name, str(var_start), str(pos_map1), "haplotype 1! Expect", seq_hap1, ", get", fetch_hap_1, "...") 285 | count_discrepency += 1 286 | print("Total Discrepency:", count_discrepency) 287 | return dict_effective_variant 288 | 289 | 290 | def variant_map( 291 | fn_chain :str, 292 | dict_ref_alts :dict, 293 | dict_set_conflict_vars :dict 294 | ) -> tuple: 295 | """ 296 | Using chain file to build the variant map 297 | mapping from reference to target genome coordinate 298 | """ 299 | dict_ref_consensus_map = {} 300 | for ref_name in dict_ref_alts.keys(): 301 | dict_ref_consensus_map[ref_name] = {} 302 | 303 | # Read and parse the chain file 304 | dict_chain_info = {} 305 | key_tuple = None 306 | fc = open(fn_chain, 'r') 307 | for line in fc: 308 | fields = line.strip().split() 309 | if len(fields) > 0 and fields[0] == "chain": 310 | key_tuple = tuple(fields) 311 | dict_chain_info[key_tuple] = [] 312 | else: 313 | dict_chain_info[key_tuple].append(fields) 314 | fc.close() 315 | 316 | for key_tuple, list_info in dict_chain_info.items(): 317 | assert(key_tuple[4] == key_tuple[9]) 318 | t_start = int(key_tuple[5]) 319 | assert(t_start == 0) # in this version, we only support one whole genome 320 | t_stop = int(key_tuple[6]) 321 | h_start = int(key_tuple[10]) 322 | ref_name = key_tuple[2] 323 | 324 | list_var_start = sorted(dict_ref_alts[ref_name].keys()) 325 | set_conflict = dict_set_conflict_vars[ref_name] 326 | idx_chain = 0 327 | pos_chain = t_start + int(list_info[idx_chain][0]) 328 | offset = 0 329 | for var_start in list_var_start: 330 | if var_start in set_conflict: 331 | continue 332 | elif var_start < t_start: 333 | continue 334 | elif var_start > t_stop: 335 | break 336 | 337 | if var_start < pos_chain: 338 | dict_ref_consensus_map[ref_name][var_start] = var_start + offset 339 | else: 340 | while pos_chain <= var_start: 341 | pos_chain += int(list_info[idx_chain][1]) 342 | offset -= int(list_info[idx_chain][1]) 343 | offset += int(list_info[idx_chain][2]) 344 | idx_chain += 1 345 | pos_chain += int(list_info[idx_chain][0]) 346 | 
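# Hypothetical example of the chain arithmetic: each data line of the chain
# file is read here as (aligned_block_size, unaligned_bases_in_reference,
# unaligned_bases_in_consensus), so the while-loop above accumulates
# offset = sum(consensus_gap - reference_gap). A data line '5000 0 2', for
# instance, shifts every downstream reference position 2 bp to the right in
# the consensus, and the variant is then mapped to var_start + offset below.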
dict_ref_consensus_map[ref_name][var_start] = var_start + offset 347 | return dict_ref_consensus_map 348 | 349 | 350 | def count_haps( 351 | dict_ref_alts :dict, 352 | f_sam0 :pysam.AlignmentFile, 353 | f_sam1 :pysam.AlignmentFile, 354 | dict_ref_consensus_map0 :dict, 355 | dict_ref_consensus_map1 :dict, 356 | dict_set_conflict_vars :dict, 357 | debug :bool=False 358 | ) -> dict: 359 | """ 360 | Count the number of reads in each golden haplotype sam covering the variants 361 | """ 362 | dict_ref_var_count = {} 363 | for ref_name, dict_vars in dict_ref_alts.items(): 364 | dict_ref_var_count[ref_name] = {} 365 | set_conflict = dict_set_conflict_vars[ref_name] 366 | for var_start, hap_seqs in dict_vars.items(): 367 | if var_start in set_conflict: 368 | continue 369 | if hap_seqs[2] == hap_seqs[3]: # if the var is homozygous 370 | continue 371 | hap0_start = dict_ref_consensus_map0[ref_name][var_start] 372 | hap0_stop = hap0_start + len(hap_seqs[0]) 373 | hap1_start = dict_ref_consensus_map1[ref_name][var_start] 374 | hap1_stop = hap1_start + len(hap_seqs[1]) 375 | 376 | # read numbers overlapping the variants 377 | count0 = f_sam0.count(contig=ref_name, start=hap0_start, stop=hap0_stop) 378 | count1 = f_sam1.count(contig=ref_name, start=hap1_start, stop=hap1_stop) 379 | if debug: 380 | print(ref_name, var_start, ':\n\thapA (' + str(count0) + "): ", end="") 381 | for read in f_sam0.fetch(contig=ref_name, start=hap0_start, stop=hap0_stop): 382 | print(read.query_name, end=", ") 383 | print("\n\thapB (" + str(count1) + "): ", end="") 384 | for read in f_sam1.fetch(contig=ref_name, start=hap1_start, stop=hap1_stop): 385 | print(read.query_name, end=", ") 386 | print("\n", end="") 387 | 388 | dict_ref_var_count[ref_name][var_start] = (count0,count1) 389 | return dict_ref_var_count 390 | 391 | 392 | def get_bound( 393 | hap0_start, 394 | hap0_stop, 395 | hap1_start, 396 | hap1_stop, 397 | len_hap0, 398 | len_hap1, 399 | flag_side, 400 | len_extend 401 | ) -> tuple: 402 | """ 403 | return: (eff_lbound0, eff_rbound0, eff_lbound1, eff_rbound1) 404 | """ 405 | min_len = min(len_hap0, len_hap1) 406 | if flag_side == 0: # extend to the right 407 | if len_hap0 < len_hap1: 408 | return (hap0_stop+len_extend, \ 409 | hap0_start, \ 410 | hap1_start+min_len+len_extend, \ 411 | hap1_stop) #min(hap1_stop, hap1_start+min_len+len_extend)) 412 | else: 413 | return (hap0_start+min_len+len_extend, \ 414 | hap0_stop, \ 415 | hap1_stop+len_extend, \ 416 | hap1_start) 417 | #min(hap0_stop, hap0_start+min_len+len_extend), \ 418 | else: # extend to the left 419 | if len_hap0 < len_hap1: 420 | return (hap0_stop, \ 421 | hap0_start-len_extend, \ 422 | hap1_start, \ 423 | hap1_stop-min_len-len_extend) 424 | #max(hap1_start, hap1_stop-min_len-len_extend), \ 425 | else: 426 | return (hap0_start, \ 427 | hap0_stop-min_len-len_extend, \ 428 | hap1_stop, \ 429 | hap1_start-len_extend) 430 | #max(hap0_start, hap0_stop-min_len-len_extend), \ 431 | 432 | 433 | 434 | def count_haps_n_report_name( 435 | dict_ref_alts :dict, 436 | f_sam0 :pysam.AlignmentFile, 437 | f_sam1 :pysam.AlignmentFile, 438 | dict_ref_consensus_map0 :dict, 439 | dict_ref_consensus_map1 :dict, 440 | dict_set_conflict_vars :dict, 441 | dict_effective_var :dict, 442 | padding :int=5, 443 | debug :bool=False 444 | ) -> dict: 445 | """ 446 | Count the number of reads in each golden haplotype sam covering the variants 447 | """ 448 | dict_ref_var_count = {} 449 | dict_ref_var_name = {} 450 | for ref_name, dict_vars in dict_ref_alts.items(): 451 | 
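# Summary of what this loop records (structure taken from the code below): for
# every heterozygous, non-conflicting site, dict_ref_var_count receives
# (count_hap0, count_hap1), while dict_ref_var_name receives either
# (name_set0, name_set1, count0, count1) or, when an effective-variant
# extension is used, (set_expand0, set_expand1, count0, count1, set_inside0,
# set_inside1); every set holds (query_name, is_read2) pairs.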
dict_ref_var_count[ref_name] = {} 452 | dict_ref_var_name [ref_name] = {} 453 | set_conflict = dict_set_conflict_vars[ref_name] 454 | 455 | len_dict_vars = len(dict_vars) 456 | for idx, (var_start, hap_seqs) in enumerate(dict_vars.items()): 457 | if var_start in set_conflict: 458 | continue 459 | if hap_seqs[2] == hap_seqs[3]: # if the var is homozygous 460 | continue 461 | hap0_start = dict_ref_consensus_map0[ref_name][var_start] 462 | hap0_stop = hap0_start + len(hap_seqs[0]) 463 | hap1_start = dict_ref_consensus_map1[ref_name][var_start] 464 | hap1_stop = hap1_start + len(hap_seqs[1]) 465 | 466 | #if var_start < 6611800: 467 | # continue 468 | # read numbers overlapping the variants 469 | if dict_effective_var.get(var_start): # if the site has larger effective var size 470 | flag_side, len_extend, left_var, right_var = dict_effective_var[var_start] 471 | min_len = min(len(hap_seqs[0]), len(hap_seqs[1])) 472 | if flag_side == 0: # right extend 473 | eff_start0 = hap0_start 474 | eff_stop0 = hap0_start + len(hap_seqs[0]) + len_extend 475 | eff_start1 = hap1_start 476 | eff_stop1 = hap1_start + len(hap_seqs[1]) + len_extend 477 | else: 478 | eff_start0 = hap0_stop - len(hap_seqs[0]) - len_extend 479 | eff_stop0 = hap0_stop 480 | eff_start1 = hap1_stop - len(hap_seqs[1]) - len_extend 481 | eff_stop1 = hap1_stop 482 | eff_lbound0, eff_rbound0, eff_lbound1, eff_rbound1 = get_bound(hap0_start, hap0_stop, hap1_start, hap1_stop, \ 483 | len(hap_seqs[0]), len(hap_seqs[1]), flag_side, len_extend) 484 | # compensate for the nearby variants 485 | if left_var != -1: 486 | eff_lbound0 = min(eff_lbound0, hap0_start-left_var) 487 | eff_lbound1 = min(eff_lbound1, hap1_start-left_var) 488 | if right_var != -1: 489 | eff_rbound0 = max(eff_rbound0, hap0_stop+right_var) 490 | eff_rbound1 = max(eff_rbound1, hap1_stop+right_var) 491 | 492 | read_segment0 = f_sam0.fetch(contig=ref_name, start=eff_start0, stop=eff_stop0) 493 | set_expand0 = set() 494 | set_inside0 = set() 495 | for read in read_segment0: 496 | if read.reference_end >= eff_lbound0 and read.reference_start <= eff_rbound0: 497 | set_expand0.add((read.query_name, read.is_read2)) 498 | else: 499 | set_inside0.add((read.query_name, read.is_read2)) 500 | 501 | read_segment1 = f_sam1.fetch(contig=ref_name, start=eff_start1, stop=eff_stop1) 502 | set_expand1 = set() 503 | set_inside1 = set() 504 | #print(hap1_start, hap1_start+len(hap_seqs[0]), len_extend, left_var, right_var, flag_side) 505 | #print(eff_lbound1, eff_rbound1) 506 | for read in read_segment1: 507 | if read.reference_end >= eff_lbound1 and read.reference_start <= eff_rbound1: 508 | set_expand1.add((read.query_name, read.is_read2)) 509 | else: 510 | set_inside1.add((read.query_name, read.is_read2)) 511 | 512 | """ 513 | if var_start == 6611841: 514 | print(flag_side, len_extend) 515 | print(hap_seqs[0], hap_seqs[1]) 516 | print(hap0_start, hap0_stop) 517 | print(hap1_start, hap1_stop) 518 | 519 | print(set_expand1) 520 | print(set_inside1) 521 | print(hap0_start, eff_start0, hap0_stop, eff_stop0) 522 | print(hap1_start, eff_start1, hap1_stop, eff_stop1) 523 | print(len(set_expand0) + len(set_expand1)) 524 | 525 | read_segment0_start = f_sam0.fetch(contig=ref_name, start=eff_start0) 526 | read_segment0_stop = f_sam0.fetch(contig=ref_name, start=eff_stop0) 527 | read_segment1_start = f_sam1.fetch(contig=ref_name, start=eff_start1) 528 | read_segment1_stop = f_sam1.fetch(contig=ref_name, start=eff_stop1) 529 | name_set0_start = set([read.query_name for read in read_segment0_start]) 530 | 
name_set0_stop = set([read.query_name for read in read_segment0_stop]) 531 | name_set1_start = set([read.query_name for read in read_segment1_start]) 532 | name_set1_stop = set([read.query_name for read in read_segment1_stop]) 533 | 534 | name_set0 = name_set0_start.intersection(name_set0_stop) 535 | name_set1 = name_set1_start.intersection(name_set1_stop) 536 | count0 = len(name_set0) 537 | count1 = len(name_set1) 538 | #symmetric_difference 539 | print(var_start) 540 | print(name_set0_start.symmetric_difference(name_set0_stop)) 541 | print(name_set1_start.symmetric_difference(name_set1_stop))""" 542 | 543 | count0 = len(set_expand0) 544 | count1 = len(set_expand1) 545 | dict_ref_var_count[ref_name][var_start] = (count0, count1) 546 | dict_ref_var_name [ref_name][var_start] = (set_expand0, set_expand1, count0, count1, set_inside0, set_inside1) 547 | else: 548 | read_segment0 = f_sam0.fetch(contig=ref_name, start=hap0_start, stop=hap0_stop) 549 | name_set0 = set([(read.query_name, read.is_read2) for read in read_segment0]) 550 | count0 = len(name_set0) 551 | read_segment1 = f_sam1.fetch(contig=ref_name, start=hap1_start, stop=hap1_stop) 552 | name_set1 = set([(read.query_name, read.is_read2) for read in read_segment1]) 553 | count1 = len(name_set1) 554 | 555 | dict_ref_var_count[ref_name][var_start] = (count0, count1) 556 | dict_ref_var_name [ref_name][var_start] = (name_set0, name_set1, count0, count1) 557 | 558 | if debug: 559 | print(ref_name, var_start, ':\n\thapA (' + str(count0) + "): ", end="") 560 | for read in f_sam0.fetch(contig=ref_name, start=hap0_start, stop=hap0_stop): 561 | print(read.query_name, end=", ") 562 | print("\n\thapB (" + str(count1) + "): ", end="") 563 | for read in f_sam1.fetch(contig=ref_name, start=hap1_start, stop=hap1_stop): 564 | print(read.query_name, end=", ") 565 | print("\n", end="") 566 | return dict_ref_var_count, dict_ref_var_name 567 | 568 | 569 | def output_report( 570 | f_vcf :pysam.VariantFile, 571 | dict_ref_var_count :dict, 572 | fn_output :str 573 | ) -> None: 574 | """ 575 | ourput report 576 | """ 577 | f_all = open(fn_output, 'w') 578 | f_gap = open(fn_output + '.gap', 'w') 579 | f_SNP = open(fn_output + '.SNP', 'w') 580 | f_all.write("CHR\tHET_SITE\tGOLDEN_DISTRIBUTION\tREF_COUNT\tALT_COUNT\tGAP\n") 581 | f_gap.write("CHR\tHET_SITE\tGOLDEN_DISTRIBUTION\tREF_COUNT\tALT_COUNT\n") 582 | f_SNP.write("CHR\tHET_SITE\tGOLDEN_DISTRIBUTION\tREF_COUNT\tALT_COUNT\n") 583 | for var in f_vcf: 584 | hap_0, hap_1 = var.samples[0]['GT'] 585 | if hap_0 != 0 and hap_1 != 0: 586 | continue 587 | ref_name = var.contig 588 | if dict_ref_var_count[ref_name].get(var.start): # Exist legal variant 589 | count0, count1 = dict_ref_var_count[ref_name][var.start] 590 | len_var = 0 591 | if hap_0 == 0: 592 | read_distribution = count0/max((count0+count1),0.001) 593 | distring = format(read_distribution, '.8f') + '\t' + str(count0) + '\t' + str(count1) 594 | len_var = len(var.alts[hap_1-1]) 595 | else: 596 | read_distribution = count1/max((count0+count1),0.001) 597 | distring = format(read_distribution, '.8f') + '\t' + str(count1) + '\t' + str(count0) 598 | len_var = len(var.alts[hap_0-1]) 599 | f_all.write(ref_name + '\t' + str(var.start+1) + '\t' + distring + '\t') 600 | if len(var.ref) != len_var: 601 | f_gap.write(ref_name + '\t' + str(var.start+1) + '\t' + distring + '\n') 602 | f_all.write('.\n') 603 | else: 604 | f_SNP.write(ref_name + '\t' + str(var.start+1) + '\t' + distring + '\n') 605 | f_all.write('\n') 606 | 607 | f_all.close() 608 | f_gap.close() 609 | 
f_SNP.close() 610 | 611 | 612 | 613 | if __name__ == "__main__": 614 | parser = argparse.ArgumentParser() 615 | parser.add_argument('-v', '--vcf', help='vcf file') 616 | parser.add_argument('-c0', '--hap0_chain', help='hap0 chain file') 617 | parser.add_argument('-c1', '--hap1_chain', help='hap1 chain file') 618 | parser.add_argument('-f0', '--hap0_fasta', help='hap0 consensus fasta file') 619 | parser.add_argument('-f1', '--hap1_fasta', help='hap1 consensus fasta file') 620 | parser.add_argument('-s0', '--hap0_sam', help='hap0 sam file') 621 | parser.add_argument('-s1', '--hap1_sam', help='hap1 sam file') 622 | parser.add_argument('-o', '--out', help='output file') 623 | args = parser.parse_args() 624 | 625 | fn_vcf = args.vcf 626 | fn_chain0 = args.hap0_chain 627 | fn_chain1 = args.hap1_chain 628 | fn_hap0_fasta = args.hap0_fasta 629 | fn_hap1_fasta = args.hap1_fasta 630 | fn_sam0 = args.hap0_sam 631 | fn_sam1 = args.hap1_sam 632 | fn_output = args.out 633 | var_chain = 25 634 | 635 | f_vcf = pysam.VariantFile(fn_vcf) 636 | f_hap0_fasta = pysam.FastaFile(fn_hap0_fasta) 637 | f_hap1_fasta = pysam.FastaFile(fn_hap1_fasta) 638 | print("Start locating variants and the conflicting variants...") 639 | dict_set_conflict_vars, dict_ref_alts = variant_seq( 640 | f_vcf=f_vcf, 641 | f_fasta=f_hap0_fasta 642 | ) 643 | # extend conflict set 644 | for ref_name in dict_set_conflict_vars.keys(): 645 | for pos in list(dict_set_conflict_vars[ref_name]): 646 | for extend in range(pos-var_chain, pos+var_chain): 647 | dict_set_conflict_vars[ref_name].add(extend) 648 | print("Start building the mapping consensus coordinate...") 649 | dict_ref_consensus_map0 = variant_map( 650 | fn_chain=fn_chain0, 651 | dict_ref_alts=dict_ref_alts, 652 | dict_set_conflict_vars=dict_set_conflict_vars 653 | ) 654 | dict_ref_consensus_map1 = variant_map( 655 | fn_chain=fn_chain1, 656 | dict_ref_alts=dict_ref_alts, 657 | dict_set_conflict_vars=dict_set_conflict_vars 658 | ) 659 | # obsolete if you are confident 660 | print("Checking if the coordinate is correct...") 661 | dict_effective_var = check_coordinate( 662 | dict_ref_alts=dict_ref_alts, 663 | f_hap0_fasta=f_hap0_fasta, 664 | f_hap1_fasta=f_hap1_fasta, 665 | dict_ref_consensus_map0=dict_ref_consensus_map0, 666 | dict_ref_consensus_map1=dict_ref_consensus_map1, 667 | dict_set_conflict_vars=dict_set_conflict_vars, 668 | padding=10 669 | ) 670 | print("Checking the simulation sam file covering of the variants") 671 | f_sam0 = pysam.AlignmentFile(fn_sam0) 672 | f_sam1 = pysam.AlignmentFile(fn_sam1) 673 | dict_ref_var_count, dict_ref_var_name = count_haps_n_report_name( 674 | dict_ref_alts=dict_ref_alts, 675 | f_sam0=f_sam0, 676 | f_sam1=f_sam1, 677 | dict_ref_consensus_map0=dict_ref_consensus_map0, 678 | dict_ref_consensus_map1=dict_ref_consensus_map1, 679 | dict_set_conflict_vars=dict_set_conflict_vars, 680 | dict_effective_var=dict_effective_var, 681 | padding=10, 682 | debug=False 683 | ) 684 | f_vcf = pysam.VariantFile(fn_vcf) 685 | print("Start output report...") 686 | output_report( 687 | f_vcf=f_vcf, 688 | dict_ref_var_count=dict_ref_var_count, 689 | fn_output=fn_output) 690 | print("Dump golden read names pickle file...") 691 | with open(fn_output + '.pickle', 'wb') as f: 692 | pickle.dump(dict_ref_var_name, f) 693 | 694 | --------------------------------------------------------------------------------
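The pickle dumped at the end of `consensus_vcf_map_adaptive.py` is the same structure that `ref_bi_naive.py` consumes through its `--golden_pickle` option. Below is a minimal sketch of inspecting it, assuming a hypothetical output prefix `golden` (so the file is `golden.pickle`); the field layout follows the tuples built in `count_haps_n_report_name`.

```python
import pickle

# Load the golden read-name dictionary written by consensus_vcf_map_adaptive.py
# ("golden" is a placeholder output prefix).
with open("golden.pickle", "rb") as f:
    dict_ref_var_name = pickle.load(f)

# One entry per contig and per heterozygous, non-conflicting variant start
# (0-based, as returned by pysam).
for ref_name, dict_vars in dict_ref_var_name.items():
    for var_start, info in dict_vars.items():
        # info[0] / info[1] are sets of (query_name, is_read2) pairs drawn from
        # the hapA / hapB simulation BAMs; this is the same membership test
        # ref_bi_naive.py performs in compare_sam_to_haps().
        reads_hap0, reads_hap1 = info[0], info[1]
        print(ref_name, var_start + 1, len(reads_hap0), len(reads_hap1))
```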