├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── benchmarking
    ├── benchmark-Bmallei-fasta-all.sh
    ├── benchmark-Bmallei-fastq-snp.sh
    ├── benchmark-Bmallei-fastq-sv.sh
    ├── benchmark-bwa-vs-mm-all.sh
    ├── benchmark-cov-all.sh
    ├── benchmark-cov-snp.sh
    ├── benchmark-cov-sv.sh
    ├── benchmark-numvariants-snp.sh
    ├── benchmark-numvariants-sv.sh
    ├── benchmark-real_data-Bush-fasta-snp.sh
    ├── benchmark-real_data-Bush-fastq-snp.sh
    ├── benchmark-real_data-HG002-sv.sh
    ├── benchmark-sim_data-Bush-Ecoli-divergent-snp.sh
    ├── benchmark-sim_data-Bush-Ecoli-same-snp.sh
    ├── benchmark-sim_data-PR-Bmallei-all.sh
    ├── benchmark-sim_data-PR-Ecoli-all.sh
    ├── benchmark-sim_data-PR-Lacidophilus-all.sh
    ├── benchmark-threads-all.sh
    ├── benchmark-threads-snp.sh
    ├── benchmark-threads-sv.sh
    ├── parameter_snp.txt
    ├── parameter_sv.txt
    ├── parameter_sv_bmallei.txt
    ├── parameter_sv_ecoli.txt
    └── parameter_sv_lacidophilus.txt
├── build.sh
├── res
    └── logo.png
├── setup.py
├── spec-file.txt
├── testdata
    ├── testdata_mut.fasta
    ├── testdata_ref.fasta
    ├── testdata_snp.vcf
    └── testdata_sv.vcf
├── variantdetective.py
└── variantdetective
    ├── __init__.py
    ├── combine_variants.py
    ├── fragment_lengths.py
    ├── main.py
    ├── simulate.py
    ├── simulate_tools.py
    ├── snp_indel.py
    ├── structural_variant.py
    ├── tools.py
    ├── validate_inputs.py
    └── version.py


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | recipe/
3 | variantdetective.egg-info/
4 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | All notable changes to this project will be documented in this file.
 4 | 
 5 | ## [1.0.1] - 2024-03-27
 6 | 
 7 | ### Added
 8 | 
 9 | - Created a changelog file to document project changes.
10 | 
11 | ### Fixed
12 | - Fixed issue #9 to allow running the pipeline in a folder that already exists and overwrite the results, enhancing usability and automation capability.
13 | 
14 | ## [1.0.0] - 2024-01-16
15 | 
16 | ### Added
17 | 
18 | The first release of VariantDetective. This version of the tool is represented in the manuscript found here: https://academic.oup.com/bioinformatics/article/40/2/btae066/7609103
19 | 
20 | 
21 | [1.0.1]: https://github.com/OLF-Bioinformatics/VariantDetective/compare/v1.0.0...v1.0.1
22 | [1.0.0]: https://github.com/OLF-Bioinformatics/VariantDetective/releases/tag/v1.0.0


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Government of Canada
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align='center'><img src='res/logo.png' alt="VariantDetective" width="75%"></p>
  2 | 
  3 | This program is designed to identify short variants and structural variants. Variants can be identified from genomic sequences (FASTA) or from combinations of short and/or long reads (FASTQ). If genomic sequences are provided as input, long reads will be simulated to detect variants.
  4 | 
  5 | This tool makes use of other open-source variant callers and creates consensus sets in order to validate a variant. Summary files for short variants and structural variants are generated outlining the different types of variants found in the sample.
  6 | 
  7 | ## Author
  8 | 
  9 | Phil Charron \<<phil.charron@inspection.gc.ca>\>
 10 | 
 11 | ## Table of Contents
 12 |   - [Installation](#installation)
 13 |      - [Installation from Source](#installation-from-source)
 14 |      - [Conda Installation](#conda-installation)
 15 |   - [Quick Usage](#quick-usage)
 16 |   - [List of Commands](#list-of-commands)
 17 |   - [Variant Callers](#variant-callers)
 18 |      - [Short Variant Callers](#short-variant-callers)
 19 |      - [Structural Variant Callers](#structural-variant-callers)
 20 |   - [Long Read Simulator](#long-read-simulator)
 21 |   - [Parameters](#parameters)
 22 |   - [Outputs](#outputs)
 23 |      - [Output files - snp_indel directory](#output-files---snp_indel-directory)
 24 |      - [Output files - structural_variant directory](#output-files---structural_variant-directory)  
 25 |   - [Reporting Issues](#reporting-issues)
 26 |   - [Citing VariantDetective](#citing-variantdetective)
 27 | 
 28 | 
 29 | ## Installation
 30 | 
 31 | All software and tools used by VariantDetective can be found in the [spec-file.txt](spec-file.txt). VariantDetective can be installed via pip after creating the conda environment to support it or via conda.
 32 | 
 33 | ### Installation from Source
 34 | VariantDetective can be installed from source using the following method.
 35 | ```
 36 | # Download VariantDetective repository
 37 | git clone https://github.com/OLF-Bioinformatics/VariantDetective.git
 38 | cd VariantDetective
 39 | # Create conda environment for tools
 40 | conda create -n variantdetective -y && conda activate variantdetective
 41 | # Install specific versions of tools
 42 | conda install -n variantdetective --file spec-file.txt
 43 | # Install VariantDetective
 44 | pip install -e .
 45 | ```
 46 | 
 47 | ### Conda Installation
 48 | ```
 49 | conda create -n vd -y
 50 | conda activate vd
 51 | conda install -c bioconda -c conda-forge -c charronp variantdetective
 52 | ```
 53 | 
 54 | ## Test Data
 55 | After successfully installing VariantDetective, you can verify its functionality
 56 | by running the test data. This step ensures that the installation has been completed 
 57 | correctly and the project is functioning as expected.
 58 | 
 59 | From within the VariantDetective directory, the test can be run using the following command:
 60 | 
 61 | ```
 62 | variantdetective all_variants -r testdata/testdata_ref.fasta -g testdata/testdata_mut.fasta -o testdata/test
 63 | ```
 64 | 
 65 | Once the tool is done, check the output. Compare the `testdata/test/snp_indel/snp_final.vcf` file with `testdata/testdata_snp.vcf`
 66 | and the `testdata/test/structural_variant/combined_sv.vcf` file with `testdata/testdata_sv.vcf`.
 67 | 
 68 | ## Quick Usage
 69 | 
 70 | **Find snps/indels and structural variants from an assembled genome (FASTA)**
 71 | 
 72 | ```
 73 | variantdetective all_variants -r REFERENCE.fasta -g SAMPLE.fasta
 74 | ```
 75 | 
 76 | **Find snps/indels and structural variants from raw reads (FASTQ)**
 77 | 
 78 | ```
 79 | variantdetective all_variants -r REFERENCE.fasta -1 SHORT_READ_1.fastq -2 SHORT_READ_2.fastq -l LONG_READ.fastq
 80 | ```
 81 | 
 82 | **Find snps/indels from an assembled genome (FASTA)**
 83 | 
 84 | ```
 85 | variantdetective snp_indel -r REFERENCE.fasta -g SAMPLE.fasta
 86 | ```
 87 | 
 88 | **Find snps/indels from raw reads (FASTQ)**
 89 | 
 90 | ```
 91 | variantdetective snp_indel -r REFERENCE.fasta -1 SHORT_READ_1.fastq -2 SHORT_READ_2.fastq 
 92 | ```
 93 | 
 94 | **Find structural variants from an assembled genome (FASTA)**
 95 | 
 96 | ```
 97 | variantdetective structural_variant -r REFERENCE.fasta -g SAMPLE.fasta
 98 | ```
 99 | 
100 | **Find structural variants from raw reads (FASTQ)**
101 | 
102 | ```
103 | variantdetective structural_variant -r REFERENCE.fasta -l LONG_READ.fastq
104 | ```
105 | 
106 | **Combine SNP VCF files predicted from other tools and get consensus set of minimum 2 callers**
107 | 
108 | ```
109 | variantdetective combine_variants --snp_vcf TOOL1.vcf TOOL2.vcf TOOL3.vcf --snp_consensus 2
110 | ```
111 | 
112 | **Combine SV VCF files predicted from other tools and get consensus set of minimum 2 callers**
113 | 
114 | ```
115 | variantdetective combine_variants --sv_vcf TOOL1.vcf TOOL2.vcf TOOL3.vcf --sv_consensus 2
116 | ```
117 | 
118 | ## List of Commands
119 | | Command | Description |
120 | | --- | --- |
121 | | `variantdetective all_variants` | Identify structural variants (SV) from long reads (FASTQ) and SNPs/indels from short reads (FASTQ), or both types of variants from genome sequence (FASTA). If genome sequence (FASTA) is provided, reads will be simulated to predict SV, SNPs and indels. |
122 | | `variantdetective structural_variant` | Identify structural variants (SV) from long reads (FASTQ) or genome sequence (FASTA).  If genome sequence (FASTA) is provided, reads will be simulated to predict SVs. |
123 | | `variantdetective snp_indel` | Identify SNPs/indels from short reads (FASTQ) or genome sequence (FASTA). If genome sequence (FASTA) is provided instead, reads will be simulated to predict SNPs and indels. |
124 | | `variantdetective combine_variants` |  Combine SNPs/indels VCF files or SV VCF files predicted from other tools. |
125 | 
126 | ## Variant Callers
127 | VariantDetective makes use of published open-source variant callers and creates consensus sets in order to validate a variant.
128 | 
129 | ### Short Variant Callers
130 | - [Clair3](https://github.com/HKU-BAL/Clair3)
131 | - [Freebayes](https://github.com/freebayes/freebayes)
132 | - [GATK4 HaplotypeCaller](https://github.com/broadinstitute/gatk)
133 | 
134 | Intersections of VCF files are created using the [VCFtools](https://github.com/vcftools) `vcf-isec` tool. The final VCF output consensus file containing variants found in at least 2 variant callers (default) is created using the [BCFtools](https://github.com/samtools/bcftools) `concat` tool.
135 | 
136 | ### Structural Variant Callers
137 | - [cuteSV](https://github.com/tjiangHIT/cuteSV)
138 | - [NanoSV](https://github.com/mroosmalen/nanosv)
139 | - [NanoVar](https://github.com/cytham/nanovar)
140 | - [SVIM](https://github.com/eldariont/svim)
141 | 
142 | The consensus VCF file is created using the [SURVIVOR](https://github.com/fritzsedlazeck/SURVIVOR) `merge` tool. Parameters for merging structural variants are a maximum allowed distance of 1 kbp between breakpoints and calls supported by at least 3 variant callers (default) where they agree on both type and strand.  
143 | 
144 | ## Long Read Simulator
145 | When a genomic FASTA file is provided as query input, long reads are simulated in order to detect variants. The long read simulation tool is adapted from [Badread](https://github.com/rrwick/Badread), a tool that creates simulated reads. It has been modified to create perfectly matching reads to the reference file and to allow multi-threading to speed up the process.
146 | 
147 | ## Parameters
148 | All input files can be uncompressed (.fasta/.fastq) or gzipped (.fasta.gz/.fastq.gz)
149 | 
150 | | Options | Available Command | Description | Default | 
151 | | --- | :---: | --- | :---: |
152 | | `-r FASTA` | `all_variants`<br>`structural_variant`<br>`snp_indel` | Path to reference genome in FASTA. Required | - |
153 | | `-g FASTA` |  `all_variants`<br>`structural_variant`<br>`snp_indel` | Path to query genomic FASTA file. Can't be combined with `-1`, `-2` or `-l`| - |
154 | | `-1 FASTQ`<br>`--short1 FASTQ` | `all_variants`<br>`snp_indel` | Path to pair 1 of short reads FASTQ file. Must always be combined with `-2`. If running `all_variants`, must be combined with `-l`| - |
155 | | `-2 FASTQ`<br>`--short2 FASTQ` | `all_variants`<br>`snp_indel` | Path to pair 2 of short reads FASTQ file. Must always be combined with `-1`. If running `all_variants`, must be combined with `-l`| - |
156 | | `-l FASTQ`<br>`--long FASTQ` | `all_variants`<br>`structural_variant` | Path to long reads FASTQ file. If running `all_variants`, must be combined with `-1` and `-2`| - |
157 | | `--readcov READCOV` |  `all_variants`<br>`structural_variant`<br>`snp_indel` | If using `-g` as input, define the absolute amount of simulated reads (e.g. 250M) or relative simulated read depth (e.g. 50x) | `50x` | 
158 | | `--readlen MEAN,STDEV` |  `all_variants`<br>`structural_variant`<br>`snp_indel` | If using `-g` as input, define the mean length and standard deviation of simulated reads | `15000,13000` |
159 | | `--mincov_snp MINCOV_SNP` |  `all_variants`<br>`snp_indel` | Minimum number of reads required to call SNP/Indel | `2` |
160 | | `--minqual_snp MINQUAL_SNP` |  `all_variants`<br>`snp_indel` | Minimum quality of SNP/Indel to be filtered out | `20` |
161 | | `--assembler {bwa,minimap2}` |  `all_variants`<br>`snp_indel` | Choose which aligner (bwa or minimap2) to use when aligning paired-end short reads | `bwa` |
162 | | `--snp_consensus SNP_CONSENSUS` |  `all_variants`<br>`snp_indel` | Specifies the minimum number of tools required to detect an SNP or Indel to include it in the consensus list | `2` |
163 | | `--mincov_sv MINCOV_SV` | `all_variants`<br>`structural_variant` | Minimum number of reads required to call SV  | `2` |
164 | | `--minlen_sv MINLEN_SV` | `all_variants`<br>`structural_variant` | Minimum length of SV to be detected | `25` |
165 | | `--minqual_sv MINQUAL_SV` |  `all_variants`<br>`structural_variant` | Minimum quality of SV to be filtered out from SVIM | `15` |
166 | | `--sv_consensus SV_CONSENSUS` |  `all_variants`<br>`structural_variant` | Specifies the minimum number of tools required to detect an SV to include it in the consensus list | `3` |
167 | | `-o OUT`<br>`--out OUT` | `all_variants`<br>`structural_variant`<br>`snp_indel` |  Output directory. Will be created if it does not exist | `./` |
168 | | `-t THREADS` <br> `--threads THREADS` | `all_variants` <br> `structural_variant` <br> `snp_indel` | Number of threads used for job | `1` |
169 | | `-h` <br> `--help` | `all_variants`<br>`structural_variant`<br>`snp_indel` | Show help message and exit | - |
170 | | `-v` <br> `--version` |  `all_variants`<br>`structural_variant`<br>`snp_indel`| Show program version number and exit | - |
171 | 
172 | ## Outputs
173 | 
174 | All input files will be copied to the output folder. Within the output folder, directories containing the `structural_variant` and `snp_indel` results will be created.
175 | 
176 | ### Output files - `snp_indel` directory
177 | 
178 | | Output |  Description |
179 | |---:|---|
180 | | `snp_final.vcf` | Variants that were found in at least 2 variant callers in VCF format |
181 | | `snp_final.csv` | Variants that were found in at least 2 variant callers in CSV format |
182 | | `snp_final.tab` | Variants that were found in at least 2 variant callers in TSV format |
183 | | `snp_final_summary.txt` | Summary of different short variant types found in snp_final files |
184 | | `freebayes.haplotypecaller.clair3.vcf.gz` | Variants in common between all variants callers |
185 | | `freebayes.clair3.vcf.gz` | Variants in common between Freebayes and Clair3 |
186 | | `freebayes.haplotypecaller.vcf.gz` | Variants in common between Freebayes and HaplotypeCaller |
187 | | `haplotypecaller.clair3.vcf.gz` | Variants in common between HaplotypeCaller and Clair3 |
188 | | `clair3.unique.vcf.gz` | Variants only found by Clair3 |
189 | | `freebayes.unique.vcf.gz` | Variants only found by Freebayes |
190 | | `haplotypecaller.unique.vcf.gz` | Variants only found by HaplotypeCaller | 
191 | | `alignment.mm.rg.sorted.bam` |  Alignment in BAM format |
192 | | `alignment.mm.rg.sorted.bam.bai` | Index file of alignments |
193 | | `clair3/` | Directory containing files related to Clair3 variant calling |
194 | | `freebayes/` | Directory containing files related to Freebayes variant calling |
195 | | `haplotypecaller/` | Directory containing files related to HaplotypeCaller variant calling |
196 | 
197 | ### Output files - `structural_variant` directory
198 | 
199 | | Output |  Description |
200 | |---:|---|
201 | | `combined_sv.vcf` | Variants that were found in at least 3 variant callers (default) in VCF format |
202 | | `combined_sv.csv` | Variants that were found in at least 3 variant callers (default) in CSV format |
203 | | `combined_sv.tab` | Variants that were found in at least 3 variant callers (default) in TSV format |
204 | | `combined_sv_summary.txt` | Summary of different structural variant types found in combined_sv files |
205 | | `alignment.mm.sorted.bam` |  Alignment in BAM format |
206 | | `alignment.mm.sorted.bam.bai` | Index file of alignments |
207 | | `cutesv/` | Directory containing files related to cuteSV variant calling |
208 | | `nanosv/` | Directory containing files related to NanoSV variant calling |
209 | | `nanovar/` | Directory containing files related to NanoVar variant calling |
210 | | `svim/` | Directory containing files related to SVIM variant calling |
211 | 
212 | 
213 | ## Reporting Issues
214 | 
215 | If you have any issues installing or running VariantDetective, or would like a new feature added to the tool, please open an issue here on GitHub. 
216 | 
217 | ## Citing VariantDetective
218 | 
219 | The manuscript describing this tool is available [here](https://academic.oup.com/bioinformatics/article/40/2/btae066/7609103).
220 | 
221 | The tool should be cited as follows:
222 | 
223 | > Philippe Charron, Mingsong Kang, "VariantDetective: An Accurate All-in-One Pipeline for Detecting Consensus Bacterial SNPs and SVs," Bioinformatics, Vol. 40, No. 2, February 2024, btae066, https://doi.org/10.1093/bioinformatics/btae066.
224 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-Bmallei-fasta-all.sh:
--------------------------------------------------------------------------------
 1 | # Run the full pipeline (all_variants) on every .fasta assembly in the
 2 | # benchmark folder against the GCA_000011705.1 reference, then delete the
 3 | # reference copies, FASTA/FASTQ files and BAM files from each output directory.
 4 | for file in benchmarking/benchmark-Bmallei-fasta-all/*.fasta
 5 | do
 6 |      # Output directory = input path without its ".fasta" extension
 7 |      name=${file%.fasta}
 8 |      echo "${file}"
 9 |      ./variantdetective.py all_variants \
10 |         -g "${file}" \
11 |         -r benchmarking/benchmark-Bmallei-fasta-all/GCA_000011705.1.fa \
12 |         -t 24 \
13 |         --readcov 50X \
14 |         -o "$name"
15 |      rm "$name"/GCA_000011705.1*
16 |      rm "$name"/*fast*
17 |      rm "$name"/*/*bam*
18 | done


--------------------------------------------------------------------------------
/benchmarking/benchmark-Bmallei-fastq-snp.sh:
--------------------------------------------------------------------------------
 1 | # Call SNPs/indels from paired-end short reads (<run>_1.fastq / <run>_2.fastq)
 2 | # for each accession below against GCA_000011705.1.fa, then remove the
 3 | # reference copies, FASTA/FASTQ files and BAM files from each output directory.
 4 | outdir='benchmark-Bmallei-fastq-snp'
 5 | mkdir -p "$outdir"
 6 | 
 7 | for run in SRR1618492 SRR1618671 SRR1618688 SRR1618499 SRR1618349 SRR1616952 SRR2146904 SRR2146899 SRR2146902 SRR2147667 SRR8283094 SRR8072932 SRR8072935 SRR8072938 ERR9616711
 8 | do
 9 |     echo "$run"
10 |     results="$outdir/$run"
11 |     ./variantdetective.py snp_indel \
12 |         -1 "${run}_1.fastq" \
13 |         -2 "${run}_2.fastq" \
14 |         -r GCA_000011705.1.fa \
15 |         -t 24 \
16 |         -o "$results"
17 |     rm "$results"/GCA_000011705.1*
18 |     rm "$results"/*fast*
19 |     rm "$results"/*/*bam*
20 | done
21 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-Bmallei-fastq-sv.sh:
--------------------------------------------------------------------------------
 1 | # Call structural variants from long reads (<run>.fastq) for each accession
 2 | # below against GCA_000011705.1.fa, then remove the reference copies,
 3 | # FASTA/FASTQ files and BAM files from each output directory.
 4 | outdir='benchmark-Bmallei-fastq-sv'
 5 | mkdir -p "$outdir"
 6 | 
 7 | for run in SRR1618494 SRR1618669 SRR1618686 SRR1618500 SRR1618350 SRR1617359 SRR2146906 SRR2146901 SRR2146903 SRR2147669 SRR8283092 SRR8072934 SRR8072936 SRR8072939 ERR9616715
 8 | do
 9 |     echo "$run"
10 |     results="$outdir/$run"
11 |     ./variantdetective.py structural_variant \
12 |         -l "${run}.fastq" \
13 |         -r GCA_000011705.1.fa \
14 |         -t 24 \
15 |         -o "$results"
16 |     rm "$results"/GCA_000011705.1*
17 |     rm "$results"/*fast*
18 |     rm "$results"/*/*bam*
19 | done
20 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-bwa-vs-mm-all.sh:
--------------------------------------------------------------------------------
 1 | # Compare bwa and minimap2 as the --assembler choice. Five genomes are
 2 | # simulated with SURVIVOR (parameter_snp.txt, then parameter_sv.txt on the
 3 | # result); each is run through all_variants once per aligner and pruned.
 4 | outdir='benchmark-bwa-vs-mm-all'
 5 | mkdir -p "$outdir"
 6 | cp parameter_sv.txt "$outdir/parameter_sv.txt"
 7 | cp parameter_snp.txt "$outdir/parameter_snp.txt"
 8 | cov=50X
 9 | for rep in 1 2 3 4 5
10 | do
11 |   SURVIVOR simSV GCA_000011705.1.fa parameter_snp.txt 0.00017 0 "$outdir/sim_snp_${rep}"
12 |   SURVIVOR simSV "$outdir/sim_snp_${rep}.fasta" parameter_sv.txt 0 0 "$outdir/sim_snp_sv_${rep}"
13 |   # bwa runs first, then minimap2; outputs end in _bwa and _mm respectively
14 |   for aligner in bwa minimap2
15 |   do
16 |     case $aligner in
17 |       bwa)      tag=bwa ;;
18 |       minimap2) tag=mm ;;
19 |     esac
20 |     results="$outdir/sim_${rep}_${cov}_${tag}"
21 |     ./variantdetective.py all_variants \
22 |         -g "$outdir/sim_snp_sv_${rep}.fasta" \
23 |         -r GCA_000011705.1.fa \
24 |         -t 24 \
25 |         --readcov "$cov" \
26 |         --assembler "$aligner" \
27 |         -o "$results"
28 |     rm "$results"/GCA_000011705.1*
29 |     rm "$results"/*fast*
30 |     rm "$results"/*/*bam*
31 |   done
32 | done


--------------------------------------------------------------------------------
/benchmarking/benchmark-cov-all.sh:
--------------------------------------------------------------------------------
 1 | # Time the all_variants pipeline at several simulated read coverages.
 2 | # For each coverage, five genomes are simulated with SURVIVOR; the elapsed
 3 | # time of each run (difference of two `date +%s%N` stamps) is appended to
 4 | # results-<dir>-<coverage>.txt inside the benchmark directory.
 5 | outdir='benchmark-cov-all'
 6 | mkdir -p "$outdir"
 7 | cp parameter_sv.txt "$outdir/parameter_sv.txt"
 8 | for cov in 25X 50X 100X 200X
 9 | do
10 |   for rep in 1 2 3 4 5
11 |   do
12 |     sim="$outdir/sim_$rep"
13 |     SURVIVOR simSV GCA_000011705.1.fa "$outdir/parameter_sv.txt" 0.00017 0 "$sim"
14 |     t_start=$(date +%s%N)
15 |     ./variantdetective.py all_variants \
16 |         -g "$sim.fasta" \
17 |         -r GCA_000011705.1.fa \
18 |         -t 24 \
19 |         --readcov "$cov" \
20 |         -o "$sim"
21 |     t_end=$(date +%s%N)
22 |     # Drop everything named sim_<rep>* (simulation files and output directory)
23 |     rm -r "$sim"*
24 |     echo "$(expr "$t_end" - "$t_start")" >> "$outdir/results-$outdir-$cov.txt"
25 |   done
26 | done
27 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-cov-snp.sh:
--------------------------------------------------------------------------------
 1 | # Time the snp_indel pipeline at several simulated read coverages.
 2 | # For each coverage, five genomes are simulated with SURVIVOR; the elapsed
 3 | # time of each run (difference of two `date +%s%N` stamps) is appended to
 4 | # results-<dir>-<coverage>.txt inside the benchmark directory.
 5 | outdir='benchmark-cov-snp'
 6 | mkdir -p "$outdir"
 7 | cp parameter_snp.txt "$outdir/parameter_snp.txt"
 8 | for cov in 25X 50X 100X 200X 500X
 9 | do
10 |   for rep in 1 2 3 4 5
11 |   do
12 |     sim="$outdir/sim_$rep"
13 |     SURVIVOR simSV GCA_000011705.1.fa "$outdir/parameter_snp.txt" 0.00017 0 "$sim"
14 |     t_start=$(date +%s%N)
15 |     ./variantdetective.py snp_indel \
16 |         -g "$sim.fasta" \
17 |         -r GCA_000011705.1.fa \
18 |         -t 24 \
19 |         --readcov "$cov" \
20 |         -o "$sim"
21 |     t_end=$(date +%s%N)
22 |     # Drop everything named sim_<rep>* (simulation files and output directory)
23 |     rm -r "$sim"*
24 |     echo "$(expr "$t_end" - "$t_start")" >> "$outdir/results-$outdir-$cov.txt"
25 |   done
26 | done
27 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-cov-sv.sh:
--------------------------------------------------------------------------------
 1 | # Time the structural_variant pipeline at several simulated read coverages.
 2 | # For each coverage, five genomes are simulated with SURVIVOR; the elapsed
 3 | # time of each run (difference of two `date +%s%N` stamps) is appended to
 4 | # results-<dir>-<coverage>.txt inside the benchmark directory.
 5 | outdir='benchmark-cov-sv'
 6 | mkdir -p "$outdir"
 7 | cp parameter_sv.txt "$outdir/parameter_sv.txt"
 8 | for cov in 25X 50X 100X 200X
 9 | do
10 |   for rep in 1 2 3 4 5
11 |   do
12 |     sim="$outdir/sim_$rep"
13 |     SURVIVOR simSV GCA_000011705.1.fa "$outdir/parameter_sv.txt" 0 0 "$sim"
14 |     t_start=$(date +%s%N)
15 |     ./variantdetective.py structural_variant \
16 |         -g "$sim.fasta" \
17 |         -r GCA_000011705.1.fa \
18 |         -t 24 \
19 |         --readcov "$cov" \
20 |         -o "$sim"
21 |     t_end=$(date +%s%N)
22 |     # Drop everything named sim_<rep>* (simulation files and output directory)
23 |     rm -r "$sim"*
24 |     echo "$(expr "$t_end" - "$t_start")" >> "$outdir/results-$outdir-$cov.txt"
25 |   done
26 | done
27 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-numvariants-snp.sh:
--------------------------------------------------------------------------------
 1 | # Time the snp_indel pipeline while varying the rate passed as the third
 2 | # argument of `SURVIVOR simSV`. Five genomes are simulated per rate; the
 3 | # elapsed time of each run (difference of two `date +%s%N` stamps) is
 4 | # appended to results-<dir>-<rate>.txt inside the benchmark directory.
 5 | outdir='benchmark-numvariants-snp'
 6 | mkdir -p "$outdir"
 7 | cp parameter_snp.txt "$outdir/parameter_snp.txt"
 8 | for rate in 0.000017 0.000034 0.00017 0.00034 0.0017 0.0034 0.017
 9 | do
10 |   for rep in 1 2 3 4 5
11 |   do
12 |     sim="$outdir/sim_$rep"
13 |     SURVIVOR simSV GCA_000011705.1.fa "$outdir/parameter_snp.txt" "$rate" 0 "$sim"
14 |     t_start=$(date +%s%N)
15 |     ./variantdetective.py snp_indel \
16 |         -g "$sim.fasta" \
17 |         -r GCA_000011705.1.fa \
18 |         -t 24 \
19 |         -o "$sim"
20 |     t_end=$(date +%s%N)
21 |     # Drop everything named sim_<rep>* (simulation files and output directory)
22 |     rm -r "$sim"*
23 |     echo "$(expr "$t_end" - "$t_start")" >> "$outdir/results-$outdir-$rate.txt"
24 |   done
25 | done
26 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-numvariants-sv.sh:
--------------------------------------------------------------------------------
 1 | # Time the structural_variant pipeline using a different SURVIVOR parameter
 2 | # file per setting. The files parameter_sv_<n>.txt are read from the benchmark
 3 | # directory and are not created by this script, so they must already exist.
 4 | # Elapsed time per run (difference of two `date +%s%N` stamps) is appended
 5 | # to results-<dir>-<n>.txt.
 6 | outdir='benchmark-numvariants-sv'
 7 | mkdir -p "$outdir"
 8 | for nvar in 21 46 66 128 210
 9 | do
10 |   for rep in 1 2 3 4 5
11 |   do
12 |     sim="$outdir/sim_$rep"
13 |     SURVIVOR simSV GCA_000011705.1.fa "$outdir/parameter_sv_$nvar.txt" 0 0 "$sim"
14 |     t_start=$(date +%s%N)
15 |     ./variantdetective.py structural_variant \
16 |         -g "$sim.fasta" \
17 |         -r GCA_000011705.1.fa \
18 |         -t 24 \
19 |         --readcov 50X \
20 |         -o "$sim"
21 |     t_end=$(date +%s%N)
22 |     # Drop everything named sim_<rep>* (simulation files and output directory)
23 |     rm -r "$sim"*
24 |     echo "$(expr "$t_end" - "$t_start")" >> "$outdir/results-$outdir-$nvar.txt"
25 |   done
26 | done
27 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-real_data-Bush-fasta-snp.sh:
--------------------------------------------------------------------------------
 1 | # Call SNPs/indels for each isolate from its assembly ($dir/<isolate>/<isolate>.fasta)
 2 | # against the .fa reference stored in the same folder, then delete the BAM files.
 3 | dir='benchmark-real_data-Bush-fasta-snp'
 4 | for i in {cft073,mgh78578,rbhstw00029,rbhstw00053,rbhstw00059,rbhstw00122,rbhstw00127,rbhstw00128,rbhstw00131,rbhstw00167,rbhstw00189,rbhstw00277,rbhstw00309,rbhstw00340,rbhstw00350,rhb10c07,rhb11c04,rhb14c01}
 5 | do
 6 |     ./variantdetective.py snp_indel \
 7 |         -g $dir/${i}/${i}.fasta \
 8 |         -r $dir/${i}/*.fa \
 9 |         -t 40 \
10 |         -o $dir/${i}/ \
11 |         --readcov 100x \
12 |         --readlen 100000,2000
13 |     # No line continuation above: the cleanup must run as its own command
14 |     rm $dir/${i}/*/*bam*
15 | done
16 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-real_data-Bush-fastq-snp.sh:
--------------------------------------------------------------------------------
 1 | # Call SNPs/indels for each isolate from its paired reads
 2 | # (<isolate>.1.fq.gz / <isolate>.2.fq.gz) against the .fa reference stored in
 3 | # the same folder, writing results there too, then delete the BAM files.
 4 | outdir='benchmark-real_data-Bush-fastq-snp'
 5 | for isolate in cft073 mgh78578 rbhstw00029 rbhstw00053 rbhstw00059 rbhstw00122 rbhstw00127 rbhstw00128 rbhstw00131 rbhstw00167 rbhstw00189 rbhstw00277 rbhstw00309 rbhstw00340 rbhstw00350 rhb10c07 rhb11c04 rhb14c01
 6 | do
 7 |     sample_dir="$outdir/$isolate"
 8 |     ./variantdetective.py snp_indel \
 9 |         -1 "$sample_dir/$isolate.1.fq.gz" \
10 |         -2 "$sample_dir/$isolate.2.fq.gz" \
11 |         -r "$sample_dir"/*.fa \
12 |         -t 40 \
13 |         -o "$sample_dir/"
14 |     rm "$sample_dir"/*/*bam*
15 | done
16 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-real_data-HG002-sv.sh:
--------------------------------------------------------------------------------
1 | # Call structural variants from the long reads in NA12878-12.5mil.fq.gz
2 | # against hg19_ucsc_main.fa; inputs and results live in the same folder.
3 | workdir='benchmark-real_data_HG002-sv'
4 | mkdir -p "$workdir"
5 | ./variantdetective.py structural_variant \
6 |     -l "$workdir/NA12878-12.5mil.fq.gz" \
7 |     -r "$workdir/hg19_ucsc_main.fa" \
8 |     -t 24 \
9 |     -o "$workdir/NA12878_results"


--------------------------------------------------------------------------------
/benchmarking/benchmark-sim_data-Bush-Ecoli-divergent-snp.sh:
--------------------------------------------------------------------------------
 1 | # Call SNPs/indels for each simulated mutated genome
 2 | # ($dir/<name>/<name>_mutated_simulation.fasta) against the shared
 3 | # NC_000913.3.fasta reference, then delete the BAM files.
 4 | dir='benchmark-sim_data-Bush-Ecoli-divergent-snp'
 5 | for i in {09-00049,1223,14EC017,14EC020,2012C-4227,2013C-4465,2016C-3878,210221272,2149,28RC1,317,350,51008369SK1,5CRE51,746,789,9000,94-3024,95JB1,ABWA45,AR_0006,AR_0017,AR_0055,AR_0069,AR_0151,AR_0369,AR436,ATCC25922,BH100Lsubstr.MG2017}
 6 | do
 7 |     ./variantdetective.py snp_indel \
 8 |         -g $dir/${i}/${i}_mutated_simulation.fasta \
 9 |         -r $dir/NC_000913.3.fasta \
10 |         -t 40 \
11 |         -o $dir/${i}/ \
12 |         --readcov 100x \
13 |         --readlen 100000,2000
14 |     # No line continuation above: the cleanup must run as its own command
15 |     rm $dir/${i}/*/*bam*
16 | done
17 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-sim_data-Bush-Ecoli-same-snp.sh:
--------------------------------------------------------------------------------
 1 | # Call SNPs/indels for each simulated mutated genome
 2 | # ($dir/<name>/<name>_mutated_simulation.fasta) against its own
 3 | # <name>_simulated.fasta reference, then delete the BAM files.
 4 | dir='benchmark-sim_data-Bush-Ecoli-same-snp'
 5 | for i in {09-00049,1223,14EC017,14EC020,2012C-4227,2013C-4465,2016C-3878,210221272,2149,28RC1,317,350,51008369SK1,5CRE51,746,789,9000,94-3024,95JB1,ABWA45,AR_0006,AR_0017,AR_0055,AR_0069,AR_0151,AR_0369,AR436,ATCC25922,BH100Lsubstr.MG2017}
 6 | do
 7 |     ./variantdetective.py snp_indel \
 8 |         -g $dir/${i}/${i}_mutated_simulation.fasta \
 9 |         -r $dir/${i}/${i}_simulated.fasta \
10 |         -t 40 \
11 |         -o $dir/${i}/ \
12 |         --readcov 100x \
13 |         --readlen 100000,2000
14 |     # No line continuation above: the cleanup must run as its own command
15 |     rm $dir/${i}/*/*bam*
16 | done
17 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-sim_data-PR-Bmallei-all.sh:
--------------------------------------------------------------------------------
 1 | # Simulate five genomes from GCA_000011705.1.fa with SURVIVOR (first with
 2 | # parameter_snp.txt, then with parameter_sv_bmallei.txt on the result), run
 3 | # all_variants on each, and delete bulky files from each output directory.
 4 | dir='benchmark-PR-Bmallei-all'
 5 | mkdir -p $dir
 6 | cp parameter_sv_bmallei.txt $dir/parameter_sv_bmallei.txt
 7 | cp parameter_snp.txt $dir/parameter_snp.txt
 8 | # Read coverage. Must be set: ${j} is part of the output directory name below.
 9 | j=50X
10 | for i in {1..5}
11 | do
12 |   SURVIVOR simSV GCA_000011705.1.fa parameter_snp.txt 0.00017 0 $dir/sim_snp_${i}
13 |   SURVIVOR simSV $dir/sim_snp_${i}.fasta parameter_sv_bmallei.txt 0 0 $dir/sim_snp_sv_${i}
14 |   variantdetective all_variants \
15 |         -g $dir/sim_snp_sv_${i}.fasta \
16 |         -r GCA_000011705.1.fa \
17 |         -t 24 \
18 |         --readcov $j \
19 |         -o $dir/sim_${i}_${j}
20 |   rm $dir/sim_${i}_${j}/GCA_000011705*
21 |   rm $dir/sim_${i}_${j}/*fast*
22 |   rm $dir/sim_${i}_${j}/*/*bam*
23 |   rm $dir/sim_${i}_${j}/structural_variant/nanovar/*bam*
24 | done
25 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-sim_data-PR-Ecoli-all.sh:
--------------------------------------------------------------------------------
 1 | # Simulate five genomes from NC_000913.3.fasta with SURVIVOR (first with
 2 | # parameter_snp.txt, then with parameter_sv_ecoli.txt on the result), run
 3 | # all_variants on each, and delete bulky files from each output directory.
 4 | dir='benchmark-PR-Ecoli-all'
 5 | mkdir -p $dir
 6 | cp parameter_sv_ecoli.txt $dir/parameter_sv_ecoli.txt
 7 | cp parameter_snp.txt $dir/parameter_snp.txt
 8 | # Read coverage. Must be set: ${j} is part of the output directory name below.
 9 | j=50X
10 | for i in {1..5}
11 | do
12 |   SURVIVOR simSV NC_000913.3.fasta parameter_snp.txt 0.00017 0 $dir/sim_snp_${i}
13 |   SURVIVOR simSV $dir/sim_snp_${i}.fasta parameter_sv_ecoli.txt 0 0 $dir/sim_snp_sv_${i}
14 |   variantdetective all_variants \
15 |         -g $dir/sim_snp_sv_${i}.fasta \
16 |         -r NC_000913.3.fasta \
17 |         -t 24 \
18 |         --readcov $j \
19 |         -o $dir/sim_${i}_${j}
20 |   rm $dir/sim_${i}_${j}/NC_000913*
21 |   rm $dir/sim_${i}_${j}/*fast*
22 |   rm $dir/sim_${i}_${j}/*/*bam*
23 |   rm $dir/sim_${i}_${j}/structural_variant/nanovar/*bam*
24 | done
25 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-sim_data-PR-Lacidophilus-all.sh:
--------------------------------------------------------------------------------
 1 | # Simulate five genomes with SURVIVOR (parameter_snp.txt, then
 2 | # parameter_sv_lacidophilus.txt on the result), run all_variants on each at
 3 | # the coverage below, and delete bulky files from each output directory.
 4 | # NOTE(review): the reference used here is NC_000913.3.fasta - confirm this is
 5 | # the intended reference for the L. acidophilus benchmark.
 6 | outdir='benchmark-PR-Lacidophilus-all'
 7 | mkdir -p "$outdir"
 8 | cp parameter_sv_lacidophilus.txt "$outdir/parameter_sv_lacidophilus.txt"
 9 | cp parameter_snp.txt "$outdir/parameter_snp.txt"
10 | cov=50X
11 | for rep in 1 2 3 4 5
12 | do
13 |   SURVIVOR simSV NC_000913.3.fasta parameter_snp.txt 0.00017 0 "$outdir/sim_snp_${rep}"
14 |   SURVIVOR simSV "$outdir/sim_snp_${rep}.fasta" parameter_sv_lacidophilus.txt 0 0 "$outdir/sim_snp_sv_${rep}"
15 |   results="$outdir/sim_${rep}_${cov}"
16 |   variantdetective all_variants \
17 |       -g "$outdir/sim_snp_sv_${rep}.fasta" \
18 |       -r NC_000913.3.fasta \
19 |       -t 24 \
20 |       --readcov "$cov" \
21 |       -o "$results"
22 |   rm "$results"/NC_000913*
23 |   rm "$results"/*fast*
24 |   rm "$results"/*/*bam*
25 |   rm "$results"/structural_variant/nanovar/*bam*
26 | done
27 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-threads-all.sh:
--------------------------------------------------------------------------------
 1 | # Time `./variantdetective.py all_variants` at several thread counts.
 2 | # For every thread count, five genomes are simulated with SURVIVOR, and the
 3 | # elapsed time of each run (nanoseconds, from `date +%s%N`) is appended to
 4 | # $dir/results-$dir-<threads>.txt. Only the variantdetective call is timed.
 5 | dir='benchmark-threads-all'
 6 | mkdir -p "$dir"
 7 | cp parameter_sv.txt "$dir/parameter_sv.txt"
 8 | for threads in 1 2 4 8 12 24 48
 9 | do
10 |   for rep in 1 2 3 4 5
11 |   do
12 |     sim="$dir/sim_$rep"
13 |     SURVIVOR simSV GCA_000011705.1.fa "$dir/parameter_sv.txt" 0.00017 0 "$sim"
14 |     t0=$(date +%s%N)
15 |     ./variantdetective.py all_variants -g "$sim.fasta" -r GCA_000011705.1.fa -t "$threads" -o "$sim"
16 |     t1=$(date +%s%N)
17 |     # Discard the simulated genome and all outputs before the next run.
18 |     rm -r "$sim"*
19 |     echo $((t1 - t0)) >> "$dir/results-$dir-$threads.txt"
20 |   done
21 | done
22 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-threads-snp.sh:
--------------------------------------------------------------------------------
 1 | dir='benchmark-threads-snp'
 2 | mkdir -p $dir
 3 | cp parameter_snp.txt $dir/parameter_snp.txt
 4 | for j in {1,2,4,8,12,24,48}
 5 | do
 6 |   for i in {1..5}
 7 |   do 
 8 |     SURVIVOR simSV GCA_000011705.1.fa $dir/parameter_snp.txt 0.00017 0 $dir/sim_$i
 9 |     start=`date +%s%N`
10 |     ./variantdetective.py snp_indel \
11 |     	-g $dir/sim_$i.fasta \
12 |     	-r GCA_000011705.1.fa \
13 |     	-t $j \
14 |     	-o $dir/sim_$i
15 |     end=`date +%s%N`
16 |     rm -r $dir/sim_$i*
17 |     echo `expr $end - $start` >> $dir/results-$dir-$j.txt
18 |   done
19 | done
20 | 


--------------------------------------------------------------------------------
/benchmarking/benchmark-threads-sv.sh:
--------------------------------------------------------------------------------
 1 | dir='benchmark-threads-sv'
 2 | mkdir -p $dir
 3 | cp parameter_sv.txt $dir/parameter_sv.txt
 4 | for j in {1,2,4,8,12,24,48}
 5 | do
 6 |   for i in {1..5}
 7 |   do 
 8 |     SURVIVOR simSV GCA_000011705.1.fa $dir/parameter_sv.txt 0 0 $dir/sim_$i
 9 |     start=`date +%s%N`
10 |     ./variantdetective.py structural_variant \
11 |     	-g $dir/sim_$i.fasta \
12 |     	-r GCA_000011705.1.fa \
13 |     	-t $j \
14 |     	-o $dir/sim_$i
15 |     end=`date +%s%N`
16 |     rm -r $dir/sim_$i*
17 |     echo `expr $end - $start` >> $dir/results-$dir-$j.txt
18 |   done
19 | done
20 | 


--------------------------------------------------------------------------------
/benchmarking/parameter_snp.txt:
--------------------------------------------------------------------------------
 1 | PARAMETER FILE: DO JUST MODIFY THE VALUES AND KEEP THE SPACES!
 2 | DUPLICATION_minimum_length: 1000
 3 | DUPLICATION_maximum_length: 20000
 4 | DUPLICATION_number: 0
 5 | INDEL_minimum_length: 1000
 6 | INDEL_maximum_length: 30000
 7 | INDEL_number: 0
 8 | TRANSLOCATION_minimum_length: 10000
 9 | TRANSLOCATION_maximum_length: 30000
10 | TRANSLOCATION_number: 0
11 | INVERSION_minimum_length: 500
12 | INVERSION_maximum_length: 10000
13 | INVERSION_number: 0
14 | INV_del_minimum_length: 600
15 | INV_del_maximum_length: 800
16 | INV_del_number: 0
17 | INV_dup_minimum_length: 600
18 | INV_dup_maximum_length: 800
19 | INV_dup_number: 0
20 | 


--------------------------------------------------------------------------------
/benchmarking/parameter_sv.txt:
--------------------------------------------------------------------------------
 1 | PARAMETER FILE: DO JUST MODIFY THE VALUES AND KEEP THE SPACES!
 2 | DUPLICATION_minimum_length: 1000
 3 | DUPLICATION_maximum_length: 20000
 4 | DUPLICATION_number: 3
 5 | INDEL_minimum_length: 1000
 6 | INDEL_maximum_length: 30000
 7 | INDEL_number: 85
 8 | TRANSLOCATION_minimum_length: 10000
 9 | TRANSLOCATION_maximum_length: 30000
10 | TRANSLOCATION_number: 1
11 | INVERSION_minimum_length: 500
12 | INVERSION_maximum_length: 10000
13 | INVERSION_number: 10
14 | INV_del_minimum_length: 600
15 | INV_del_maximum_length: 800
16 | INV_del_number: 0
17 | INV_dup_minimum_length: 600
18 | INV_dup_maximum_length: 800
19 | INV_dup_number: 0
20 | 


--------------------------------------------------------------------------------
/benchmarking/parameter_sv_bmallei.txt:
--------------------------------------------------------------------------------
 1 | PARAMETER FILE: DO JUST MODIFY THE VALUES AND KEEP THE SPACES!
 2 | DUPLICATION_minimum_length: 1000
 3 | DUPLICATION_maximum_length: 20000
 4 | DUPLICATION_number: 3
 5 | INDEL_minimum_length: 1000
 6 | INDEL_maximum_length: 30000
 7 | INDEL_number: 85
 8 | TRANSLOCATION_minimum_length: 10000
 9 | TRANSLOCATION_maximum_length: 30000
10 | TRANSLOCATION_number: 1
11 | INVERSION_minimum_length: 500
12 | INVERSION_maximum_length: 10000
13 | INVERSION_number: 10
14 | INV_del_minimum_length: 600
15 | INV_del_maximum_length: 800
16 | INV_del_number: 0
17 | INV_dup_minimum_length: 600
18 | INV_dup_maximum_length: 800
19 | INV_dup_number: 0
20 | 


--------------------------------------------------------------------------------
/benchmarking/parameter_sv_ecoli.txt:
--------------------------------------------------------------------------------
 1 | PARAMETER FILE: DO JUST MODIFY THE VALUES AND KEEP THE SPACES!
 2 | DUPLICATION_minimum_length: 1000
 3 | DUPLICATION_maximum_length: 20000
 4 | DUPLICATION_number: 3
 5 | INDEL_minimum_length: 1000
 6 | INDEL_maximum_length: 30000
 7 | INDEL_number: 87
 8 | TRANSLOCATION_minimum_length: 10000
 9 | TRANSLOCATION_maximum_length: 30000
10 | TRANSLOCATION_number: 0
11 | INVERSION_minimum_length: 500
12 | INVERSION_maximum_length: 10000
13 | INVERSION_number: 10
14 | INV_del_minimum_length: 600
15 | INV_del_maximum_length: 800
16 | INV_del_number: 0
17 | INV_dup_minimum_length: 600
18 | INV_dup_maximum_length: 800
19 | INV_dup_number: 0
20 | 


--------------------------------------------------------------------------------
/benchmarking/parameter_sv_lacidophilus.txt:
--------------------------------------------------------------------------------
 1 | PARAMETER FILE: DO JUST MODIFY THE VALUES AND KEEP THE SPACES!
 2 | DUPLICATION_minimum_length: 333
 3 | DUPLICATION_maximum_length: 6667
 4 | DUPLICATION_number: 3
 5 | INDEL_minimum_length: 333
 6 | INDEL_maximum_length: 10000
 7 | INDEL_number: 87
 8 | TRANSLOCATION_minimum_length: 3333
 9 | TRANSLOCATION_maximum_length: 10000
10 | TRANSLOCATION_number: 0
11 | INVERSION_minimum_length: 167
12 | INVERSION_maximum_length: 3333
13 | INVERSION_number: 10
14 | INV_del_minimum_length: 600
15 | INV_del_maximum_length: 800
16 | INV_del_number: 0
17 | INV_dup_minimum_length: 600
18 | INV_dup_maximum_length: 800
19 | INV_dup_number: 0
20 | 


--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Install this package with the interpreter named by $PYTHON.
4 | # PYTHON must be set by the caller (presumably conda-build; confirm).
5 | # --no-deps: do not let pip install dependencies.
6 | # --ignore-installed: install even if a copy is already present.
7 | # The expansion is quoted so interpreter paths containing spaces work, and
8 | # ${PYTHON:?} aborts with a clear message instead of trying to run "-m".
9 | "${PYTHON:?PYTHON must point to the Python interpreter to install with}" -m pip install --no-deps --ignore-installed .
10 | 


--------------------------------------------------------------------------------
/res/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OLF-Bioinformatics/VariantDetective/5ca633e4879865db12112440f1f0c691f6767cd2/res/logo.png


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(name='variantdetective',
 4 |       version='1.0.1',
 5 |       description='VariantDetective: an accurate all-in-one pipeline for detecting consensus bacterial SNPs and SVs',
 6 |       url='https://github.com/OLF-Bioinformatics/VariantDetective',
 7 |       author='Phil Charron', 
 8 |       author_email='phil.charron@inspection.gc.ca', 
 9 |       license='MIT', 
10 |       #install_requires=['pandas>=2.1.3', 'numpy>=1.26', 'tensorflow>=2.8.0'], 
11 |       packages=find_packages(),
12 |       include_package_data=True,
13 |       package_data={
14 |           'variantdetective': ['clair3_models/*/*'],
15 |       },
16 |       entry_points={
17 |           'console_scripts': [
18 |               'variantdetective=variantdetective.main:main',
19 |           ],
20 |       }
21 | )
22 | 


--------------------------------------------------------------------------------
/spec-file.txt:
--------------------------------------------------------------------------------
  1 | # This file may be used to create an environment using:
  2 | # $ conda create --name <env> --file <this file>
  3 | # platform: linux-64
  4 | @EXPLICIT
  5 | https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
  6 | https://conda.anaconda.org/conda-forge/noarch/_r-mutex-1.0.1-anacondar_1.tar.bz2
  7 | https://repo.anaconda.com/pkgs/main/linux-64/_tflow_select-2.3.0-mkl.conda
  8 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/ca-certificates-2023.11.17-hbcca054_0.conda
  9 | https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-2.6.32-he073ed8_16.conda
 10 | https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.36.1-hea4e1c9_2.tar.bz2
 11 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/libgcc-devel_linux-64-7.5.0-hda03d7c_20.tar.bz2
 12 | https://conda.anaconda.org/conda-forge/linux-64/libgfortran4-7.5.0-h14aa051_20.tar.bz2
 13 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/libstdcxx-devel_linux-64-7.5.0-hb016644_20.tar.bz2
 14 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/libstdcxx-ng-13.2.0-h7e041cc_3.conda
 15 | https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-7.5.0-h14aa051_20.tar.bz2
 16 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/libgomp-13.2.0-h807b86a_3.conda
 17 | https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.12-he073ed8_16.conda
 18 | https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2
 19 | https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.36.1-h193b22a_2.tar.bz2
 20 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/binutils_linux-64-2.36-hf3e587d_33.tar.bz2
 21 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/libgcc-ng-13.2.0-h807b86a_3.conda
 22 | https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.3.2-h166bdaf_0.tar.bz2
 23 | https://conda.anaconda.org/conda-forge/linux-64/bc-1.07.1-h7f98852_0.tar.bz2
 24 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda
 25 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/c-ares-1.22.1-hd590300_0.conda
 26 | https://conda.anaconda.org/conda-forge/linux-64/fribidi-1.0.10-h36c2ea0_0.tar.bz2
 27 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/gcc_impl_linux-64-7.5.0-habd7529_20.tar.bz2
 28 | https://conda.anaconda.org/conda-forge/linux-64/gettext-0.21.1-h27087fc_0.tar.bz2
 29 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/giflib-5.2.1-h0b41bf4_3.conda
 30 | https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.13-h58526e2_1001.tar.bz2
 31 | https://conda.anaconda.org/conda-forge/linux-64/icu-58.2-hf484d3e_1000.tar.bz2
 32 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/isa-l-2.30.0-hd590300_6.conda
 33 | https://conda.anaconda.org/conda-forge/linux-64/jpeg-9e-h0b41bf4_3.conda
 34 | https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2
 35 | https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.13-h166bdaf_0.tar.bz2
 36 | https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-h516909a_1.tar.bz2
 37 | https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.5.0-hcb278e6_1.conda
 38 | https://conda.anaconda.org/conda-forge/linux-64/libffi-3.2.1-he1b5a44_1007.tar.bz2
 39 | https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.17-h166bdaf_0.tar.bz2
 40 | https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda
 41 | https://repo.anaconda.com/pkgs/main/linux-64/libopenblas-0.3.18-hf726d26_0.conda
 42 | https://conda.anaconda.org/conda-forge/linux-64/libunistring-0.9.10-h7f98852_0.tar.bz2
 43 | https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda
 44 | https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.1.0-h36c2ea0_3.tar.bz2
 45 | https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda
 46 | https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.9.2-he1b5a44_3.tar.bz2
 47 | https://conda.anaconda.org/conda-forge/linux-64/lzo-2.10-h516909a_1000.tar.bz2
 48 | https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.2-h58526e2_4.tar.bz2
 49 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/openssl-1.1.1w-hd590300_0.conda
 50 | https://conda.anaconda.org/conda-forge/linux-64/ossuuid-1.6.2-hf484d3e_1000.tar.bz2
 51 | https://conda.anaconda.org/conda-forge/linux-64/pcre-8.45-h9c3ff4c_0.tar.bz2
 52 | https://conda.anaconda.org/conda-forge/linux-64/pixman-0.38.0-h516909a_1003.tar.bz2
 53 | https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-h36c2ea0_1001.tar.bz2
 54 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/snappy-1.1.10-h9fff704_0.conda
 55 | https://conda.anaconda.org/conda-forge/linux-64/xorg-inputproto-2.3.2-h7f98852_1002.tar.bz2
 56 | https://conda.anaconda.org/conda-forge/linux-64/xorg-kbproto-1.0.7-h7f98852_1002.tar.bz2
 57 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/xorg-libice-1.1.1-hd590300_0.conda
 58 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/xorg-libxau-1.0.11-hd590300_0.conda
 59 | https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.3-h7f98852_0.tar.bz2
 60 | https://conda.anaconda.org/conda-forge/linux-64/xorg-recordproto-1.14.2-h7f98852_1002.tar.bz2
 61 | https://conda.anaconda.org/conda-forge/linux-64/xorg-renderproto-0.11.1-h7f98852_1002.tar.bz2
 62 | https://conda.anaconda.org/conda-forge/linux-64/xorg-xextproto-7.3.0-h0b41bf4_1003.conda
 63 | https://conda.anaconda.org/conda-forge/linux-64/xorg-xproto-7.0.31-h7f98852_1007.tar.bz2
 64 | https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2
 65 | https://conda.anaconda.org/conda-forge/linux-64/blosc-1.21.1-hd32f23e_0.tar.bz2
 66 | https://conda.anaconda.org/conda-forge/linux-64/expat-2.5.0-hcb278e6_1.conda
 67 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/gcc_linux-64-7.5.0-h47867f9_33.tar.bz2
 68 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/gfortran_impl_linux-64-7.5.0-h56cb351_20.tar.bz2
 69 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/gxx_impl_linux-64-7.5.0-hd0bb8aa_20.tar.bz2
 70 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/libblas-3.9.0-13_linux64_openblas.tar.bz2
 71 | https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2
 72 | https://conda.anaconda.org/conda-forge/linux-64/libidn2-2.3.4-h166bdaf_0.tar.bz2
 73 | https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.51.0-hdcd2b5c_0.conda
 74 | https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.39-h753d276_0.conda
 75 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/libsqlite-3.44.2-h2797004_0.conda
 76 | https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.10.0-haa6b8db_3.tar.bz2
 77 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/libxcb-1.15-h0b41bf4_0.conda
 78 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/pbzip2-1.1.13-h1fcc475_2.conda
 79 | https://conda.anaconda.org/conda-forge/linux-64/perl-5.32.1-4_hd590300_perl5.conda
 80 | https://conda.anaconda.org/t/ch-9f6a872e-99a2-45f8-b0d4-b6aae824fb17/conda-forge/linux-64/readline-7.0-hf8c457e_1001.tar.bz2
 81 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda
 82 | https://conda.anaconda.org/conda-forge/linux-64/xorg-fixesproto-5.0-h7f98852_1002.tar.bz2
 83 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/xorg-libsm-1.2.4-h7391055_0.conda
 84 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/zlib-1.2.13-hd590300_5.conda
 85 | https://conda.anaconda.org/bioconda/linux-64/bedtools-2.30.0-h468198e_3.tar.bz2
 86 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/bioconda/linux-64/bwa-0.7.17-he4a0461_11.tar.bz2
 87 | https://conda.anaconda.org/conda-forge/linux-64/bwidget-1.9.14-ha770c72_1.tar.bz2
 88 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/freetype-2.12.1-h267a509_2.conda
 89 | https://conda.anaconda.org/t/ch-9f6a872e-99a2-45f8-b0d4-b6aae824fb17/conda-forge/linux-64/gdbm-1.18-h941a26a_0.tar.bz2
 90 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/gfortran_linux-64-7.5.0-h78c8a43_33.tar.bz2
 91 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/gxx_linux-64-7.5.0-h555fc39_33.tar.bz2
 92 | https://conda.anaconda.org/bioconda/linux-64/k8-0.2.5-hdcf5f25_4.tar.bz2
 93 | https://conda.anaconda.org/conda-forge/linux-64/krb5-1.20.1-hf9c8cef_0.conda
 94 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/libcblas-3.9.0-13_linux64_openblas.tar.bz2
 95 | https://conda.anaconda.org/conda-forge/linux-64/libglib-2.66.3-hbe7bbb4_0.tar.bz2
 96 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/liblapack-3.9.0-13_linux64_openblas.tar.bz2
 97 | https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-3.18.0-h780b84a_1.tar.bz2
 98 | https://repo.anaconda.com/pkgs/main/linux-64/libxml2-2.9.14-h74e7548_0.conda
 99 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/bioconda/linux-64/ngmlr-0.2.7-hdcf5f25_6.tar.bz2
100 | https://conda.anaconda.org/conda-forge/linux-64/parallel-20191122-0.tar.bz2
101 | https://conda.anaconda.org/conda-forge/linux-64/perl-capture-tiny-0.48-pl5321ha770c72_1.tar.bz2
102 | https://conda.anaconda.org/conda-forge/noarch/perl-common-sense-3.75-pl5321hd8ed1ab_0.tar.bz2
103 | https://conda.anaconda.org/conda-forge/linux-64/perl-compress-raw-bzip2-2.201-pl5321h166bdaf_0.tar.bz2
104 | https://conda.anaconda.org/conda-forge/linux-64/perl-compress-raw-zlib-2.202-pl5321h166bdaf_0.tar.bz2
105 | https://conda.anaconda.org/conda-forge/noarch/perl-constant-1.33-pl5321hd8ed1ab_0.tar.bz2
106 | https://conda.anaconda.org/conda-forge/noarch/perl-exporter-5.74-pl5321hd8ed1ab_0.tar.bz2
107 | https://conda.anaconda.org/conda-forge/noarch/perl-exporter-tiny-1.002002-pl5321hd8ed1ab_0.tar.bz2
108 | https://conda.anaconda.org/conda-forge/noarch/perl-extutils-makemaker-7.70-pl5321hd8ed1ab_0.conda
109 | https://conda.anaconda.org/bioconda/noarch/perl-ffi-checklib-0.28-pl5321hdfd78af_0.tar.bz2
110 | https://conda.anaconda.org/conda-forge/noarch/perl-file-which-1.24-pl5321hd8ed1ab_0.tar.bz2
111 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/perl-importer-0.026-pl5321hd8ed1ab_0.tar.bz2
112 | https://conda.anaconda.org/bioconda/noarch/perl-io-zlib-1.14-pl5321hdfd78af_0.tar.bz2
113 | https://conda.anaconda.org/bioconda/linux-64/perl-list-moreutils-xs-0.430-pl5321h031d066_2.tar.bz2
114 | https://conda.anaconda.org/conda-forge/noarch/perl-parent-0.241-pl5321hd8ed1ab_0.conda
115 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/perl-path-tiny-0.124-pl5321hd8ed1ab_0.tar.bz2
116 | https://conda.anaconda.org/conda-forge/linux-64/perl-scalar-list-utils-1.63-pl5321h166bdaf_0.tar.bz2
117 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/perl-scope-guard-0.21-pl5321hd8ed1ab_0.tar.bz2
118 | https://conda.anaconda.org/conda-forge/linux-64/perl-storable-3.15-pl5321h166bdaf_0.tar.bz2
119 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/perl-test-warnings-0.031-pl5321ha770c72_0.conda
120 | https://conda.anaconda.org/conda-forge/linux-64/perl-try-tiny-0.31-pl5321ha770c72_0.tar.bz2
121 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/perl-xml-sax-base-1.09-pl5321hd8ed1ab_0.tar.bz2
122 | https://repo.anaconda.com/pkgs/main/linux-64/pigz-2.4-h84994c4_0.conda
123 | https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.33.0-h62c20be_0.conda
124 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/bioconda/linux-64/survivor-1.0.7-hdcf5f25_4.tar.bz2
125 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/tktable-2.10-h0c5db8f_5.conda
126 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/bioconda/linux-64/vcftools-0.1.16-pl5321hdcf5f25_9.tar.bz2
127 | https://conda.anaconda.org/conda-forge/linux-64/wget-1.20.3-ha56f1ee_1.tar.bz2
128 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/xorg-libx11-1.8.7-h8ee46fc_0.conda
129 | https://conda.anaconda.org/conda-forge/linux-64/zstd-1.4.4-h6597ccf_3.tar.bz2
130 | https://conda.anaconda.org/bioconda/linux-64/entrez-direct-16.2-he881be0_1.tar.bz2
131 | https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.14.2-h14ed4e7_0.conda
132 | https://conda.anaconda.org/conda-forge/linux-64/gsl-2.5-h294904e_1.tar.bz2
133 | https://conda.anaconda.org/conda-forge/linux-64/libcurl-7.87.0-h6312ad2_0.conda
134 | https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.1.0-hc7e4089_6.tar.bz2
135 | https://conda.anaconda.org/bioconda/linux-64/minimap2-2.24-h7132678_1.tar.bz2
136 | https://conda.anaconda.org/conda-forge/noarch/perl-carp-1.50-pl5321hd8ed1ab_0.tar.bz2
137 | https://conda.anaconda.org/conda-forge/linux-64/perl-encode-3.19-pl5321h166bdaf_0.tar.bz2
138 | https://conda.anaconda.org/conda-forge/noarch/perl-file-path-2.18-pl5321hd8ed1ab_0.tar.bz2
139 | https://conda.anaconda.org/bioconda/noarch/perl-list-moreutils-0.430-pl5321hdfd78af_0.tar.bz2
140 | https://conda.anaconda.org/conda-forge/linux-64/perl-test-fatal-0.016-pl5321ha770c72_0.tar.bz2
141 | https://conda.anaconda.org/bioconda/noarch/perl-types-serialiser-1.01-pl5321hdfd78af_0.tar.bz2
142 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/perl-xml-namespacesupport-1.12-pl5321hd8ed1ab_0.tar.bz2
143 | https://conda.anaconda.org/conda-forge/linux-64/pypy3.6-7.3.2-h45e8706_2.tar.bz2
144 | https://repo.anaconda.com/pkgs/main/linux-64/python-3.6.10-h191fe78_1.conda
145 | https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.4-h0b41bf4_2.conda
146 | https://conda.anaconda.org/conda-forge/linux-64/xorg-libxfixes-5.0.3-h7f98852_1004.tar.bz2
147 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/xorg-libxrender-0.9.11-hd590300_0.conda
148 | https://conda.anaconda.org/conda-forge/noarch/astor-0.8.1-pyh9f0ad1d_0.tar.bz2
149 | https://conda.anaconda.org/conda-forge/noarch/async-timeout-3.0.1-py_1000.tar.bz2
150 | https://conda.anaconda.org/conda-forge/noarch/attrs-22.2.0-pyh71513ae_0.conda
151 | https://conda.anaconda.org/conda-forge/noarch/blinker-1.5-pyhd8ed1ab_0.tar.bz2
152 | https://conda.anaconda.org/conda-forge/noarch/cachetools-5.0.0-pyhd8ed1ab_0.tar.bz2
153 | https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-2.1.1-pyhd8ed1ab_0.tar.bz2
154 | https://conda.anaconda.org/bioconda/noarch/cigar-0.1.3-pyh864c0ab_1.tar.bz2
155 | https://conda.anaconda.org/conda-forge/linux-64/curl-7.87.0-h6312ad2_0.conda
156 | https://conda.anaconda.org/conda-forge/noarch/cycler-0.11.0-pyhd8ed1ab_0.tar.bz2
157 | https://conda.anaconda.org/conda-forge/noarch/dataclasses-0.8-pyh787bdff_2.tar.bz2
158 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/gast-0.3.3-py_0.tar.bz2
159 | https://conda.anaconda.org/conda-forge/linux-64/glib-2.66.3-h58526e2_0.tar.bz2
160 | https://conda.anaconda.org/t/ch-9f6a872e-99a2-45f8-b0d4-b6aae824fb17/conda-forge/linux-64/hdf5-1.10.6-nompi_h7c3c948_1111.tar.bz2
161 | https://conda.anaconda.org/bioconda/linux-64/htslib-1.10.2-hd3b49d5_1.tar.bz2
162 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/idna-3.6-pyhd8ed1ab_0.conda
163 | https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.11-hcbb858e_1.tar.bz2
164 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/mock-5.1.0-pyhd8ed1ab_0.conda
165 | https://conda.anaconda.org/conda-forge/noarch/natsort-8.2.0-pyhd8ed1ab_0.tar.bz2
166 | https://conda.anaconda.org/conda-forge/noarch/olefile-0.46-pyh9f0ad1d_1.tar.bz2
167 | https://conda.anaconda.org/conda-forge/noarch/perl-business-isbn-data-20210112.006-pl5321hd8ed1ab_0.tar.bz2
168 | https://conda.anaconda.org/conda-forge/noarch/perl-file-temp-0.2304-pl5321hd8ed1ab_0.tar.bz2
169 | https://conda.anaconda.org/bioconda/linux-64/perl-io-compress-2.201-pl5321hdbdd923_2.tar.bz2
170 | https://conda.anaconda.org/bioconda/linux-64/perl-json-xs-2.34-pl5321h4ac6f70_6.tar.bz2
171 | https://conda.anaconda.org/conda-forge/linux-64/perl-pathtools-3.75-pl5321h166bdaf_0.tar.bz2
172 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/perl-sub-info-0.002-pl5321hd8ed1ab_0.tar.bz2
173 | https://conda.anaconda.org/bioconda/noarch/perl-term-table-0.016-pl5321hdfd78af_0.tar.bz2
174 | https://conda.anaconda.org/conda-forge/noarch/progress-1.6-pyhd8ed1ab_0.tar.bz2
175 | https://conda.anaconda.org/t/ch-9f6a872e-99a2-45f8-b0d4-b6aae824fb17/conda-forge/noarch/pyasn1-0.5.1-pyhd8ed1ab_0.conda
176 | https://conda.anaconda.org/conda-forge/noarch/pycparser-2.21-pyhd8ed1ab_0.tar.bz2
177 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/pyjwt-2.8.0-pyhd8ed1ab_0.conda
178 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/pyparsing-3.1.1-pyhd8ed1ab_0.conda
179 | https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.6-2_cp36m.tar.bz2
180 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/pytz-2023.3.post1-pyhd8ed1ab_0.conda
181 | https://conda.anaconda.org/bioconda/noarch/pyvcf3-1.0.3-pyhdfd78af_0.tar.bz2
182 | https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2
183 | https://conda.anaconda.org/conda-forge/noarch/tensorboard-plugin-wit-1.8.1-pyhd8ed1ab_0.tar.bz2
184 | https://conda.anaconda.org/conda-forge/noarch/termcolor-1.1.0-pyhd8ed1ab_3.tar.bz2
185 | https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.1.1-pyha770c72_0.tar.bz2
186 | https://conda.anaconda.org/conda-forge/noarch/wheel-0.37.1-pyhd8ed1ab_0.tar.bz2
187 | https://conda.anaconda.org/conda-forge/linux-64/xorg-libxi-1.7.10-h7f98852_0.tar.bz2
188 | https://conda.anaconda.org/conda-forge/noarch/zipp-3.6.0-pyhd8ed1ab_0.tar.bz2
189 | https://conda.anaconda.org/t/ch-9f6a872e-99a2-45f8-b0d4-b6aae824fb17/conda-forge/noarch/absl-py-0.15.0-pyhd8ed1ab_0.tar.bz2
190 | https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_0.tar.bz2
191 | https://conda.anaconda.org/bioconda/linux-64/bcftools-1.10-h5d15f04_0.tar.bz2
192 | https://conda.anaconda.org/conda-forge/linux-64/cairo-1.16.0-h18b612c_1001.tar.bz2
193 | https://conda.anaconda.org/conda-forge/linux-64/certifi-2021.5.30-py36h5fab9bb_0.tar.bz2
194 | https://conda.anaconda.org/conda-forge/linux-64/cffi-1.14.4-py36h211aa47_0.tar.bz2
195 | https://conda.anaconda.org/conda-forge/linux-64/chardet-4.0.0-py36h5fab9bb_1.tar.bz2
196 | https://conda.anaconda.org/conda-forge/noarch/google-pasta-0.2.0-pyh8c360ce_0.tar.bz2
197 | https://conda.anaconda.org/conda-forge/noarch/idna_ssl-1.1.0-pyhd8ed1ab_1002.tar.bz2
198 | https://conda.anaconda.org/conda-forge/linux-64/importlib-metadata-4.8.1-py36h5fab9bb_0.tar.bz2
199 | https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.3.1-py36h605e78d_1.tar.bz2
200 | https://conda.anaconda.org/conda-forge/linux-64/multidict-5.2.0-py36h8f6f2f9_0.tar.bz2
201 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/numpy-1.19.2-py36h68c22af_1.tar.bz2
202 | https://conda.anaconda.org/conda-forge/noarch/packaging-21.3-pyhd8ed1ab_0.tar.bz2
203 | https://conda.anaconda.org/bioconda/noarch/perl-archive-tar-2.40-pl5321hdfd78af_0.tar.bz2
204 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/perl-business-isbn-3.007-pl5321hd8ed1ab_0.tar.bz2
205 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/perl-file-chdir-0.1011-pl5321hd8ed1ab_0.tar.bz2
206 | https://conda.anaconda.org/bioconda/noarch/perl-json-4.10-pl5321hdfd78af_0.tar.bz2
207 | https://conda.anaconda.org/bioconda/noarch/perl-test2-suite-0.000145-pl5321hdfd78af_0.tar.bz2
208 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/perl-xml-sax-1.02-pl5321hd8ed1ab_0.tar.bz2
209 | https://conda.anaconda.org/conda-forge/linux-64/pillow-8.1.0-py36h4f9996e_1.tar.bz2
210 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/psutil-5.8.0-py36h8f6f2f9_1.tar.bz2
211 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/pyasn1-modules-0.3.0-pyhd8ed1ab_0.conda
212 | https://conda.anaconda.org/t/ch-9f6a872e-99a2-45f8-b0d4-b6aae824fb17/bioconda/linux-64/pysam-0.21.0-py36h50b03f4_0.tar.bz2
213 | https://conda.anaconda.org/conda-forge/linux-64/pysocks-1.7.1-py36h5fab9bb_3.tar.bz2
214 | https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.8.2-pyhd8ed1ab_0.tar.bz2
215 | https://conda.anaconda.org/conda-forge/linux-64/python-isal-0.11.1-py36h8f6f2f9_0.tar.bz2
216 | https://conda.anaconda.org/conda-forge/noarch/pyu2f-0.1.5-pyhd8ed1ab_0.tar.bz2
217 | https://conda.anaconda.org/conda-forge/linux-64/pyvcf-0.6.8-py36h9f0ad1d_1002.tar.bz2
218 | https://conda.anaconda.org/conda-forge/noarch/rsa-4.9-pyhd8ed1ab_0.tar.bz2
219 | https://conda.anaconda.org/bioconda/linux-64/samtools-1.10-h2e538c0_3.tar.bz2
220 | https://conda.anaconda.org/conda-forge/linux-64/setuptools-58.0.4-py36h5fab9bb_2.tar.bz2
221 | https://conda.anaconda.org/bioconda/noarch/tabix-1.11-hdfd78af_0.tar.bz2
222 | https://conda.anaconda.org/conda-forge/linux-64/tensorboard-data-server-0.6.0-py36hc39840e_0.tar.bz2
223 | https://conda.anaconda.org/conda-forge/linux-64/tornado-6.1-py36h8f6f2f9_1.tar.bz2
224 | https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.1.1-hd8ed1ab_0.tar.bz2
225 | https://conda.anaconda.org/conda-forge/noarch/werkzeug-2.0.2-pyhd8ed1ab_0.tar.bz2
226 | https://conda.anaconda.org/conda-forge/linux-64/wrapt-1.13.1-py36h8f6f2f9_0.tar.bz2
227 | https://conda.anaconda.org/conda-forge/linux-64/xorg-libxtst-1.2.3-h7f98852_1002.tar.bz2
228 | https://conda.anaconda.org/conda-forge/linux-64/biopython-1.79-py36h8f6f2f9_0.tar.bz2
229 | https://conda.anaconda.org/conda-forge/linux-64/brotlipy-0.7.0-py36h8f6f2f9_1001.tar.bz2
230 | https://conda.anaconda.org/conda-forge/linux-64/click-8.0.1-py36h5fab9bb_0.tar.bz2
231 | https://conda.anaconda.org/conda-forge/linux-64/cryptography-35.0.0-py36hb60f036_0.tar.bz2
232 | https://conda.anaconda.org/conda-forge/linux-64/grpcio-1.38.1-py36h8e87921_0.tar.bz2
233 | https://conda.anaconda.org/conda-forge/linux-64/h5py-2.10.0-nompi_py36h4510012_106.tar.bz2
234 | https://conda.anaconda.org/t/ch-9f6a872e-99a2-45f8-b0d4-b6aae824fb17/conda-forge/linux-64/harfbuzz-2.4.0-h37c48d4_1.tar.bz2
235 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/markdown-3.5.1-pyhd8ed1ab_0.conda
236 | https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.3.4-py36hd391965_0.tar.bz2
237 | https://conda.anaconda.org/bioconda/noarch/nanosv-1.2.4-py_0.tar.bz2
238 | https://conda.anaconda.org/conda-forge/linux-64/numexpr-2.7.3-py36h0cdc3f0_0.tar.bz2
239 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/opt_einsum-3.3.0-pyhc1e730c_2.conda
240 | https://conda.anaconda.org/conda-forge/linux-64/pandas-1.1.5-py36h284efc9_0.tar.bz2
241 | https://conda.anaconda.org/bioconda/linux-64/perl-alien-build-2.48-pl5321hec16e2b_0.tar.bz2
242 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/perl-uri-5.17-pl5321ha770c72_0.conda
243 | https://conda.anaconda.org/conda-forge/noarch/pip-21.3.1-pyhd8ed1ab_0.tar.bz2
244 | https://conda.anaconda.org/conda-forge/linux-64/protobuf-3.18.0-py36hc4f0c31_0.tar.bz2
245 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/scipy-1.5.3-py36h976291a_0.tar.bz2
246 | https://conda.anaconda.org/bioconda/linux-64/tabixpp-1.1.0-hd2e4403_5.tar.bz2
247 | https://conda.anaconda.org/conda-forge/linux-64/xopen-1.2.0-py36h5fab9bb_0.tar.bz2
248 | https://conda.anaconda.org/conda-forge/linux-64/yarl-1.6.3-py36h8f6f2f9_2.tar.bz2
249 | https://conda.anaconda.org/conda-forge/linux-64/aiohttp-3.7.4.post0-py36h8f6f2f9_0.tar.bz2
250 | https://conda.anaconda.org/bioconda/noarch/cutesv-1.0.13-pyhdfd78af_0.tar.bz2
251 | https://conda.anaconda.org/conda-forge/noarch/keras-preprocessing-1.1.2-pyhd8ed1ab_0.tar.bz2
252 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/networkx-2.7-pyhd8ed1ab_0.tar.bz2
253 | https://conda.anaconda.org/conda-forge/noarch/oauthlib-3.2.2-pyhd8ed1ab_0.tar.bz2
254 | https://conda.anaconda.org/conda-forge/linux-64/openjdk-11.0.8-hacce0ff_0.tar.bz2
255 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/linux-64/pango-1.42.4-h7062337_4.tar.bz2
256 | https://conda.anaconda.org/bioconda/linux-64/perl-alien-libxml2-0.17-pl5321hec16e2b_0.tar.bz2
257 | https://conda.anaconda.org/bioconda/linux-64/pybedtools-0.9.0-py36h7281c5b_1.tar.bz2
258 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/bioconda/noarch/pyfaidx-0.7.2.1-pyh7cba7a3_1.tar.bz2
259 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/pyopenssl-22.0.0-pyhd8ed1ab_1.tar.bz2
260 | https://conda.anaconda.org/conda-forge/linux-64/pytables-3.6.1-py36hb7ec5aa_3.tar.bz2
261 | https://conda.anaconda.org/conda-forge/linux-64/tensorflow-estimator-2.6.0-py36hc4f0c31_0.tar.bz2
262 | https://conda.anaconda.org/bioconda/linux-64/vcflib-1.0.2-hfbaaabd_3.tar.bz2
263 | https://conda.anaconda.org/bioconda/linux-64/freebayes-1.3.5-py36h11ea90d_2.tar.bz2
264 | https://conda.anaconda.org/bioconda/noarch/gatk4-4.2.6.1-py36hdfd78af_1.tar.bz2
265 | https://conda.anaconda.org/bioconda/linux-64/perl-xml-libxml-2.0207-pl5321h661654b_0.tar.bz2
266 | https://repo.anaconda.com/pkgs/r/linux-64/r-base-3.4.3-h9bb98a2_5.conda
267 | https://conda.anaconda.org/bioconda/noarch/svim-1.4.2-py_0.tar.bz2
268 | https://repo.anaconda.com/pkgs/main/linux-64/tensorflow-base-2.2.0-mkl_py36hd506778_0.conda
269 | https://conda.anaconda.org/conda-forge/noarch/urllib3-1.26.15-pyhd8ed1ab_0.conda
270 | https://conda.anaconda.org/bioconda/linux-64/whatshap-1.0-py36hf1ae8f4_1.tar.bz2
271 | https://conda.anaconda.org/t/ch-9f6a872e-99a2-45f8-b0d4-b6aae824fb17/bioconda/linux-64/ncbi-vdb-3.0.9-hdbdd923_0.tar.bz2
272 | https://conda.anaconda.org/bioconda/noarch/picard-2.27.3-hdfd78af_0.tar.bz2
273 | https://conda.anaconda.org/conda-forge/noarch/requests-2.28.1-pyhd8ed1ab_0.tar.bz2
274 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/bioconda/linux-64/blast-2.15.0-pl5321h6f7f691_1.tar.bz2
275 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/conda-forge/noarch/google-auth-2.23.0-pyh1a96a4e_0.conda
276 | https://conda.anaconda.org/conda-forge/noarch/requests-oauthlib-1.3.1-pyhd8ed1ab_0.tar.bz2
277 | https://conda.anaconda.org/conda-forge/noarch/google-auth-oauthlib-0.4.6-pyhd8ed1ab_0.tar.bz2
278 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/bioconda/linux-64/hs-blastn-0.0.5-h4ac6f70_5.tar.bz2
279 | https://conda.anaconda.org/conda-forge/noarch/tensorboard-2.11.2-pyhd8ed1ab_0.conda
280 | https://repo.anaconda.com/pkgs/main/linux-64/tensorflow-2.2.0-mkl_py36h5a57954_0.conda
281 | https://conda.anaconda.org/t/ch-d7e07fb7-b42b-4fb7-a192-f5b023e0d555/bioconda/linux-64/clair3-0.1.11-py36hb9dc472_6.tar.bz2
282 | https://repo.anaconda.com/pkgs/main/linux-64/tensorflow-mkl-2.2.0-h4fcabd2_0.conda
283 | https://conda.anaconda.org/bioconda/linux-64/nanovar-1.3.9-py36hc5360cc_1.tar.bz2
284 | 


--------------------------------------------------------------------------------
/testdata/testdata_snp.vcf:
--------------------------------------------------------------------------------
  1 | ##fileformat=VCFv4.2
  2 | ##source=Sniffles
  3 | ##fileDate=20240116
  4 | ##contig=<ID=testdata_ref,length=100000>
  5 | ##ALT=<ID=DEL,Description="Deletion">
  6 | ##ALT=<ID=DUP,Description="Duplication">
  7 | ##ALT=<ID=INV,Description="Inversion">
  8 | ##ALT=<ID=INVDUP,Description="InvertedDUP with unknown boundaries">
  9 | ##ALT=<ID=TRA,Description="Translocation">
 10 | ##ALT=<ID=INS,Description="Insertion">
 11 | ##INFO=<ID=CHR2,Number=1,Type=String,Description="Chromosome for END coordinate in case of a translocation">
 12 | ##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the structural variant">
 13 | ##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of the SV">
 14 | ##INFO=<ID=SVMETHOD,Number=1,Type=String,Description="Type of approach used to detect SV">
 15 | ##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
 16 | ##INFO=<ID=AF,Number=.,Type=Integer,Description="Allele Frequency.">
 17 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 18 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMPLE
 19 | testdata_ref	5779	SNP7SURVIVOR	C	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 20 | testdata_ref	6259	SNP8SURVIVOR	C	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 21 | testdata_ref	6321	SNP9SURVIVOR	G	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 22 | testdata_ref	7667	SNP10SURVIVOR	G	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 23 | testdata_ref	7686	SNP11SURVIVOR	A	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 24 | testdata_ref	8152	SNP12SURVIVOR	C	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 25 | testdata_ref	8832	SNP13SURVIVOR	C	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 26 | testdata_ref	9785	SNP14SURVIVOR	C	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 27 | testdata_ref	10580	SNP15SURVIVOR	T	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 28 | testdata_ref	10657	SNP16SURVIVOR	C	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 29 | testdata_ref	11402	SNP17SURVIVOR	G	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 30 | testdata_ref	12419	SNP18SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 31 | testdata_ref	13067	SNP19SURVIVOR	G	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 32 | testdata_ref	16352	SNP20SURVIVOR	C	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 33 | testdata_ref	16934	SNP21SURVIVOR	C	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 34 | testdata_ref	17920	SNP22SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 35 | testdata_ref	20963	SNP23SURVIVOR	T	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 36 | testdata_ref	21772	SNP24SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 37 | testdata_ref	21830	SNP25SURVIVOR	T	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 38 | testdata_ref	22589	SNP26SURVIVOR	T	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 39 | testdata_ref	24676	SNP27SURVIVOR	C	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 40 | testdata_ref	26487	SNP28SURVIVOR	G	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 41 | testdata_ref	30044	SNP29SURVIVOR	A	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 42 | testdata_ref	30181	SNP30SURVIVOR	A	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 43 | testdata_ref	30955	SNP31SURVIVOR	A	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 44 | testdata_ref	32641	SNP32SURVIVOR	A	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 45 | testdata_ref	32869	SNP33SURVIVOR	T	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 46 | testdata_ref	36042	SNP34SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 47 | testdata_ref	44779	SNP39SURVIVOR	G	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 48 | testdata_ref	45035	SNP40SURVIVOR	G	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 49 | testdata_ref	45062	SNP41SURVIVOR	G	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 50 | testdata_ref	45661	SNP43SURVIVOR	G	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 51 | testdata_ref	46262	SNP44SURVIVOR	G	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 52 | testdata_ref	47054	SNP45SURVIVOR	G	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 53 | testdata_ref	49239	SNP46SURVIVOR	C	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 54 | testdata_ref	49552	SNP47SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 55 | testdata_ref	51973	SNP48SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 56 | testdata_ref	54182	SNP49SURVIVOR	G	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 57 | testdata_ref	54955	SNP50SURVIVOR	G	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 58 | testdata_ref	55590	SNP51SURVIVOR	C	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 59 | testdata_ref	56291	SNP52SURVIVOR	G	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 60 | testdata_ref	56331	SNP53SURVIVOR	C	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 61 | testdata_ref	56441	SNP54SURVIVOR	A	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 62 | testdata_ref	57509	SNP55SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 63 | testdata_ref	57751	SNP56SURVIVOR	C	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 64 | testdata_ref	58388	SNP57SURVIVOR	G	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 65 | testdata_ref	58772	SNP58SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 66 | testdata_ref	60984	SNP59SURVIVOR	T	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 67 | testdata_ref	62354	SNP60SURVIVOR	T	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 68 | testdata_ref	63089	SNP61SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 69 | testdata_ref	63729	SNP62SURVIVOR	T	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 70 | testdata_ref	63743	SNP63SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 71 | testdata_ref	63760	SNP64SURVIVOR	A	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 72 | testdata_ref	65044	SNP65SURVIVOR	G	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 73 | testdata_ref	66500	SNP66SURVIVOR	G	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 74 | testdata_ref	67874	SNP67SURVIVOR	C	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 75 | testdata_ref	67940	SNP68SURVIVOR	G	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 76 | testdata_ref	68668	SNP69SURVIVOR	T	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 77 | testdata_ref	68848	SNP70SURVIVOR	A	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 78 | testdata_ref	69389	SNP71SURVIVOR	C	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 79 | testdata_ref	69600	SNP72SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 80 | testdata_ref	70764	SNP73SURVIVOR	A	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 81 | testdata_ref	70883	SNP74SURVIVOR	C	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 82 | testdata_ref	70975	SNP75SURVIVOR	A	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 83 | testdata_ref	71216	SNP76SURVIVOR	G	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 84 | testdata_ref	72316	SNP77SURVIVOR	C	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 85 | testdata_ref	73897	SNP78SURVIVOR	T	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 86 | testdata_ref	74704	SNP79SURVIVOR	C	A	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 87 | testdata_ref	77564	SNP80SURVIVOR	C	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 88 | testdata_ref	79018	SNP81SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 89 | testdata_ref	85378	SNP84SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 90 | testdata_ref	85921	SNP85SURVIVOR	T	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 91 | testdata_ref	85975	SNP86SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 92 | testdata_ref	86100	SNP87SURVIVOR	C	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 93 | testdata_ref	88025	SNP88SURVIVOR	A	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 94 | testdata_ref	88437	SNP89SURVIVOR	G	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 95 | testdata_ref	88767	SNP90SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 96 | testdata_ref	89951	SNP91SURVIVOR	T	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 97 | testdata_ref	90021	SNP92SURVIVOR	A	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 98 | testdata_ref	90347	SNP93SURVIVOR	T	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
 99 | testdata_ref	91548	SNP94SURVIVOR	T	C	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
100 | testdata_ref	91571	SNP95SURVIVOR	A	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
101 | testdata_ref	93429	SNP96SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
102 | testdata_ref	93866	SNP97SURVIVOR	A	T	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
103 | testdata_ref	94637	SNP98SURVIVOR	T	G	PRECISE;SVMETHOD=SURVIVOR_sim;SVLEN=1	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
104 | 


--------------------------------------------------------------------------------
/testdata/testdata_sv.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.2
 2 | ##source=Sniffles
 3 | ##fileDate=20240116
 4 | ##contig=<ID=testdata_ref,length=105622>
 5 | ##ALT=<ID=DEL,Description="Deletion">
 6 | ##ALT=<ID=DUP,Description="Duplication">
 7 | ##ALT=<ID=INV,Description="Inversion">
 8 | ##ALT=<ID=INVDUP,Description="InvertedDUP with unknown boundaries">
 9 | ##ALT=<ID=TRA,Description="Translocation">
10 | ##ALT=<ID=INS,Description="Insertion">
11 | ##INFO=<ID=CHR2,Number=1,Type=String,Description="Chromosome for END coordinate in case of a translocation">
12 | ##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the structural variant">
13 | ##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of the SV">
14 | ##INFO=<ID=SVMETHOD,Number=1,Type=String,Description="Type of approach used to detect SV">
15 | ##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
16 | ##INFO=<ID=AF,Number=.,Type=Integer,Description="Allele Frequency.">
17 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
18 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT
19 | testdata_ref	82364	DEL0SURVIVOR	N	<DEL>	.	LowQual	PRECISE;SVTYPE=DEL;SVMETHOD=SURVIVOR_sim;CHR2=testdata_ref;END=85015;SVLEN=2651	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
20 | testdata_ref	4723	INS1SURVIVOR	N	<INS>	.	LowQual	PRECISE;SVTYPE=INS;SVMETHOD=SURVIVOR_sim;CHR2=testdata_ref;END=7492;SVLEN=2769	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
21 | testdata_ref	9901	INS2SURVIVOR	N	<INS>	.	LowQual	PRECISE;SVTYPE=INS;SVMETHOD=SURVIVOR_sim;CHR2=testdata_ref;END=12754;SVLEN=2853	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
22 | testdata_ref	39769	DEL3SURVIVOR	N	<DEL>	.	LowQual	PRECISE;SVTYPE=DEL;SVMETHOD=SURVIVOR_sim;CHR2=testdata_ref;END=44753;SVLEN=4984	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
23 | testdata_ref	81152	INV4SURVIVOR	N	<INV>	.	LowQual	PRECISE;SVTYPE=INV;SVMETHOD=SURVIVOR_sim;CHR2=testdata_ref;END=81781;SVLEN=629	GT:GL:GQ:FT:RC:DR:DV:RR:RV	1/1
24 | 


--------------------------------------------------------------------------------
/variantdetective.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Command-line launcher for VariantDetective: imports ``main`` from
``variantdetective.main`` and calls it when this file is run as a script.

Copyright (C) 2022 Phil Charron (phil.charron@inspection.gc.ca)
https://github.com/philcharron-cfia/VariantDetective
"""

from variantdetective.main import main

# Only dispatch when executed directly, not when imported as a module.
if __name__ == '__main__':
    main()


--------------------------------------------------------------------------------
/variantdetective/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OLF-Bioinformatics/VariantDetective/5ca633e4879865db12112440f1f0c691f6767cd2/variantdetective/__init__.py


--------------------------------------------------------------------------------
/variantdetective/combine_variants.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This file contains code for the combine_variants subcommand.
 3 | 
 4 | Copyright (C) 2024 Phil Charron (phil.charron@inspection.gc.ca)
 5 | https://github.com/OLF-Bioinformatics/VariantDetective
 6 | """
 7 | 
 8 | import os
 9 | import sys
10 | import datetime
11 | from .tools import run_process, read_vcf, generate_tab_csv_snp_summary, generate_tab_csv_sv_summary
12 | 
13 | 
def combine_variants(args, vcf_lists, output=sys.stderr):
    """
    Merge per-caller VCF files into combined call sets.

    vcf_lists[0] is the list of SNP/indel VCF paths and vcf_lists[1] the
    list of structural variant VCF paths; an empty list skips that stage.

    SNP/indel stage: every VCF is bgzip-compressed and tabix-indexed into
    <args.out>/snp_indel, intersected with vcf-isec, and the intersection
    files whose name contains at least args.snp_consensus underscores are
    concatenated with bcftools into snp_final.vcf, which is then summarised.

    SV stage: the VCF paths are written to a list file, merged with
    SURVIVOR into <args.out>/structural_variant/combined_sv.vcf using
    args.sv_consensus and args.minlen_sv, and the result is summarised.

    Progress messages are written to `output`.
    """
    if len(vcf_lists[0]) > 0:
        snp_dir = os.path.join(args.out, 'snp_indel')
        if not os.path.isdir(snp_dir):
            os.makedirs(snp_dir)

        # Compress and index each caller's VCF so vcf-isec can read it.
        compressed = []
        for source_vcf in vcf_lists[0]:
            target = '/'.join((snp_dir, os.path.basename(source_vcf) + '.gz'))
            compressed.append(target)
            run_process(' '.join(['bgzip -c -@', str(args.threads),
                                  source_vcf, '>', target]))
            run_process('tabix -p vcf ' + target)

        print('{}\tCombining SNP VCF files...'.format(
            datetime.datetime.now().replace(microsecond=0)), file=output)
        run_process(' '.join(['vcf-isec -p', snp_dir + '/snp_'] + compressed))

        # Collect the intersection files produced with the "snp_" prefix.
        isec_outputs = []
        for _root, _dirs, names in os.walk(snp_dir):
            isec_outputs.extend(
                name for name in names
                if name.startswith("snp_") and name.endswith(".vcf.gz")
            )

        for name in isec_outputs:
            run_process('tabix -f -p vcf ' + snp_dir + '/' + name)

        # Keep only intersections whose underscore count meets the
        # requested consensus threshold.
        consensus_paths = [
            snp_dir + "/" + name
            for name in isec_outputs
            if name.count('_') >= args.snp_consensus
        ]
        if consensus_paths:
            run_process('bcftools concat -a ' + ' '.join(consensus_paths)
                        + ' -o ' + snp_dir + '/snp_final.vcf')

        run_process('rm ' + snp_dir + '/*.tbi ')

        generate_tab_csv_snp_summary(read_vcf(snp_dir + '/snp_final.vcf'),
                                     snp_dir)

    if len(vcf_lists[1]) > 0:
        sv_dir = os.path.join(args.out, 'structural_variant')
        if not os.path.isdir(sv_dir):
            os.makedirs(sv_dir)

        print('{}\tCombining SV VCF files...'.format(
            datetime.datetime.now().replace(microsecond=0)), file=output)

        # SURVIVOR takes a file listing the VCFs to merge.
        list_file = sv_dir + '/vcf_list'
        run_process(' '.join(['ls', ' '.join(vcf_lists[1]), '>', list_file]))

        merged_vcf = sv_dir + '/combined_sv.vcf'
        run_process(' '.join(['SURVIVOR merge', list_file, '1000',
                              str(args.sv_consensus), '1 1 0',
                              str(args.minlen_sv), merged_vcf]))

        run_process('rm ' + list_file)

        generate_tab_csv_sv_summary(read_vcf(merged_vcf), sv_dir)


--------------------------------------------------------------------------------
/variantdetective/fragment_lengths.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module contains a class for describing fragment length distributions
 3 | (described by the gamma distribution) and related functions.
 4 | 
 5 | Copyright (C) 2024 Phil Charron (phil.charron@inspection.gc.ca)
 6 | https://github.com/OLF-Bioinformatics/VariantDetective
 7 | Portions Copyright (C) 2018 Ryan Wick (rrwick@gmail.com)
 8 | https://github.com/rrwick/Badread
 9 | """
10 | 
11 | import numpy as np
12 | import sys
13 | 
14 | ### NEEDED
class FragmentLengths:
    """
    Fragment length distribution described by a mean and standard deviation.

    A standard deviation of zero gives a constant length (the rounded mean).
    Otherwise lengths are drawn from a gamma distribution whose shape and
    scale come from gamma_parameters(mean, stdev).
    """

    def __init__(self, mean, stdev, output=sys.stderr):
        self.mean = mean
        self.stdev = stdev
        # Shape (k) and scale (theta); they stay None for the constant case.
        self.gamma_k = None
        self.gamma_t = None
        if self.stdev != 0:
            _, _, self.gamma_k, self.gamma_t = gamma_parameters(mean, stdev)

    def get_fragment_length(self):
        """Return one fragment length as an int; sampled lengths are at least 1."""
        if self.stdev == 0:
            return int(round(self.mean))
        sampled = np.random.gamma(self.gamma_k, self.gamma_t)
        return max(int(round(sampled)), 1)
30 | 
def gamma_parameters(gamma_mean, gamma_stdev):
    """
    Convert a mean and standard deviation into gamma distribution parameters.

    Returns a 4-tuple (a, b, k, t):
      a, b -- shape and rate parametrisation
      k, t -- shape and scale parametrisation (k equals a)
    """
    mean_squared = gamma_mean ** 2
    variance = gamma_stdev ** 2

    shape = mean_squared / variance
    rate = gamma_mean / variance
    scale = variance / gamma_mean

    return shape, rate, shape, scale


--------------------------------------------------------------------------------
/variantdetective/main.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This file contains the main (entry-point) function into VariantDetective.
  3 | It can be run using the variantdetective.py script. 
  4 | 
  5 | Copyright (C) 2024 Phil Charron (phil.charron@inspection.gc.ca)
  6 | https://github.com/OLF-Bioinformatics/VariantDetective
  7 | """
  8 | 
  9 | import argparse
 10 | import os
 11 | import pathlib
 12 | import shutil
 13 | import sys
 14 | import datetime
 15 | 
 16 | from variantdetective.tools import get_new_filename
 17 | from .validate_inputs import validate_inputs
 18 | from .version import __version__
 19 | 
 20 | def main(output=sys.stderr):
 21 |     check_python_version()
 22 |     args = parse_args(sys.argv[1:])
 23 | 
 24 |     if args.subparser_name == 'structural_variant':
 25 |         check_structural_variant_args(args)
 26 |     
 27 |     elif args.subparser_name == 'snp_indel':
 28 |         check_snp_indel_args(args)
 29 |     
 30 |     elif args.subparser_name == 'all_variants':
 31 |         check_all_variants_args(args)
 32 |     
 33 |     elif args.subparser_name == 'combine_variants':
 34 |         check_combine_variants_args(args)
 35 |     
 36 |     create_outdir(args)
 37 |     copy_inputs(args)
 38 |     #print(str(datetime.datetime.now().replace(microsecond=0)) + '\tValidating input files...', file=output)
 39 |     validate_inputs(args, output=output)    
 40 | 
 41 | def parse_args(args):
 42 |     parser = argparse.ArgumentParser(description='VariantDetective: Identify single nucleotide variants (SNV), '
 43 |                                         'insertions/deletions (indel) and/or structural variants (SV) from '
 44 |                                         'FASTQ reads or FASTA genomic sequences.',
 45 |                       formatter_class=NoSubparsersMetavarFormatter,
 46 |                       add_help=False)
 47 | 
 48 |     help_args = parser.add_argument_group('Help')
 49 |     help_args.add_argument('-h', '--help', action='help',
 50 |                            default=argparse.SUPPRESS,
 51 |                            help='Show this help message and exit')
 52 |     help_args.add_argument('-v', '--version', action='version',
 53 |                            version='VariantDetective v' + __version__,
 54 |                            help="Show program version number and exit")
 55 | 
 56 |     subparsers = parser.add_subparsers(title='Commands', dest='subparser_name',
 57 |                                        metavar=None)
 58 |     all_variants_subparser(subparsers)
 59 |     structural_variant_subparser(subparsers)
 60 |     snp_indel_subparser(subparsers)
 61 |     combine_variants_subparser(subparsers)
 62 | 
 63 |     # If no arguments were used, print the base-level help.
 64 |     if len(args) == 0:
 65 |         parser.print_help(file=sys.stderr)
 66 |         sys.exit(1)
 67 | 
 68 |     return parser.parse_args(args)
 69 | 
 70 | 
 71 | def structural_variant_subparser(subparsers):
 72 |     help = 'Identify structural variants (SV) from long reads (FASTQ) or genome sequence (FASTA). \
 73 |                  If input is FASTA, long reads will be simulated to detect SVs.'
 74 |     definition = 'Identify structural variants (SV) from long reads (FASTQ) or genome sequence (FASTA). \
 75 |                  If input is FASTA, long reads will be simulated to detect SVs.'
 76 | 
 77 |     group = subparsers.add_parser('structural_variant', description=definition,
 78 |                                   help=help, 
 79 |                                   formatter_class=argparse.HelpFormatter,
 80 |                                   add_help=False)
 81 | 
 82 |     help_args = group.add_argument_group('Help')
 83 |     help_args.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS,
 84 |                             help='Show this help message and exit')
 85 |     help_args.add_argument('-v', '--version', action='version',
 86 |                             version='V v' + __version__,
 87 |                             help="Show program version number and exit")
 88 | 
 89 |     input_args = group.add_argument_group('Input')
 90 |     input_args.add_argument('-l', '--long', type=str, metavar='[FASTQ]',
 91 |                                help="Path to long reads FASTQ file. Can't be combined with -g")
 92 |     input_args.add_argument('-g', '--genome', type=str, metavar='[FASTA]',
 93 |                                help="Path to query genomic FASTA file. Can't be combined with -l")
 94 |     input_args.add_argument('-r', '--reference', type=str, required=True, metavar='[FASTA]',
 95 |                                help='Path to reference genome in FASTA. Required')
 96 | 
 97 |     simulate_args = group.add_argument_group('Simulate')
 98 |     simulate_args.add_argument("--readcov", type=str, default='50x',
 99 |                                 help='Either an absolute value (e.g. 250M) or a relative depth (e.g. 50x) (default: %(default)s)')
100 |     simulate_args.add_argument("--readlen", type=str, default='15000,13000',
101 |                                 help='Fragment length distribution (mean,stdev) (default: %(default)s)')
102 |     
103 |     nanovar_args = group.add_argument_group('Structural Variant Call')
104 |     nanovar_args.add_argument("--data_type_sv", type=str, default='ont', choices=['ont', 'pacbio'],
105 |                                 help="Type of long-read data (ont or pacbio) (default: %(default)s)")
106 |     nanovar_args.add_argument("--mincov_sv", type=int, default=2,
107 |                                 help='Minimum number of reads required to call variant (default: %(default)i)')
108 |     nanovar_args.add_argument("--minlen_sv", type=int, default=25,
109 |                                 help='Minimum length of SV to be detected (default: %(default)i)')
110 |     nanovar_args.add_argument("--minqual_sv", type=int, default=15,
111 |                                 help='Minimum quality of SV to be filtered out from SVIM (default: %(default)i)')
112 |     nanovar_args.add_argument("--sv_consensus", type=int, default=3,
113 |                                 help='Specifies the minimum number of tools required to detect an SV to include it in the consensus list (default: %(default)i)')
114 | 
115 | 
116 |     other_args = group.add_argument_group('Other')
117 |     other_args.add_argument('-o', "--out", type=str, default='./',
118 |                                 help='Output directory. Will be created if it does not exist')
119 |     other_args.add_argument('-t', '--threads', type=int, default=1,
120 |                                 help='Number of threads used for job (default: %(default)i)')                            
121 | 
122 | 
def all_variants_subparser(subparsers):
    """Register the ``all_variants`` sub-command and its options.

    The sub-command takes either long reads plus a short-read pair (FASTQ) or
    a query genome (FASTA), always together with a reference FASTA, and
    exposes the simulation, SV-calling and SNP/indel-calling options.

    Args:
        subparsers: Sub-parser collection exposing ``add_parser`` (as returned
            by ``ArgumentParser.add_subparsers``).
    """
    # A single summary is used for both the command listing and the
    # sub-command description.  It is built from adjacent literals so that no
    # source indentation leaks into the text, and it is not called ``help``
    # to avoid shadowing the builtin.
    summary = ('Identify structural variants (SV) from long reads (FASTQ) and SNPs/indels '
               'from short reads (FASTQ). If genome sequence (FASTA) is provided instead, '
               'simulate reads and predict SV, SNPs and indels.')

    group = subparsers.add_parser('all_variants', description=summary,
                                  help=summary,
                                  formatter_class=argparse.HelpFormatter,
                                  add_help=False)

    # -h is registered by hand (add_help=False above) so it can live in its
    # own "Help" group next to -v.
    help_args = group.add_argument_group('Help')
    help_args.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS,
                           help='Show this help message and exit')
    help_args.add_argument('-v', '--version', action='version',
                           version='VariantDetective v' + __version__,
                           help="Show program version number and exit")

    input_args = group.add_argument_group('Input')
    input_args.add_argument('-l', '--long', type=str, metavar='[FASTQ]',
                            help="Path to long reads FASTQ file. Must be combined with -1 and -2")
    input_args.add_argument('-1', '--short1', type=str, metavar='[FASTQ]',
                            help="Path to pair 1 of short reads FASTQ file. Must be combined with -l and -2")
    input_args.add_argument('-2', '--short2', type=str, metavar='[FASTQ]',
                            help="Path to pair 2 of short reads FASTQ file. Must be combined with -l and -1")
    input_args.add_argument('-g', '--genome', type=str, metavar='[FASTA]',
                            help="Path to query genomic FASTA file. Can't be combined with -l, -1 or -2")
    input_args.add_argument('-r', '--reference', type=str, required=True, metavar='[FASTA]',
                            help='Path to reference genome in FASTA. Required')

    simulate_args = group.add_argument_group('Simulate')
    simulate_args.add_argument("--readcov", type=str, default='50x',
                               help='Either an absolute value (e.g. 250M) or a relative depth (e.g. 50x) (default: %(default)s)')
    simulate_args.add_argument("--readlen", type=str, default='15000,13000',
                               help='Fragment length distribution (mean,stdev) (default: %(default)s)')

    nanovar_args = group.add_argument_group('Structural Variant Call')
    nanovar_args.add_argument("--data_type_sv", type=str, default='ont', choices=['ont', 'pacbio'],
                              help="Type of long-read data (ont or pacbio) (default: %(default)s)")
    nanovar_args.add_argument("--mincov_sv", type=int, default=2,
                              help='Minimum number of reads required to call SV (default: %(default)i)')
    nanovar_args.add_argument("--minlen_sv", type=int, default=25,
                              help='Minimum length of SV to be detected (default: %(default)i)')
    nanovar_args.add_argument("--minqual_sv", type=int, default=15,
                              help='Minimum quality of SV to be filtered out with SVIM (default: %(default)i)')
    nanovar_args.add_argument("--sv_consensus", type=int, default=3,
                              help='Specifies the minimum number of tools required to detect an SV to include it in the consensus list (default: %(default)i)')

    snp_args = group.add_argument_group('SNP/Indel Call')
    snp_args.add_argument("--mincov_snp", type=int, default=2,
                          help='Minimum number of reads required to call SNP/Indel (default: %(default)i)')
    snp_args.add_argument("--minqual_snp", type=int, default=20,
                          help='Minimum quality of SNP/Indel to be filtered out (default: %(default)i)')
    snp_args.add_argument("--assembler", type=str, default='bwa', choices=['bwa', 'minimap2'],
                          help='Choose which assembler (bwa or minimap2) to use when using paired-end short reads (default: %(default)s)')
    snp_args.add_argument("--snp_consensus", type=int, default=2,
                          help='Specifies the minimum number of tools required to detect an SNP or Indel to include it in the consensus list (default: %(default)i)')
    snp_args.add_argument("--custom_clair3_model", type=str,
                          help='Path to custom model for Clair3 variant calling (such as ones from Rerio)')

    other_args = group.add_argument_group('Other')
    other_args.add_argument('-o', "--out", type=str, default='./',
                            help='Output directory. Will be created if it does not exist')
    other_args.add_argument('-t', '--threads', type=int, default=1,
                            help='Number of threads used for job (default: %(default)i)')
188 | 
def snp_indel_subparser(subparsers):
    """Register the ``snp_indel`` sub-command and its options.

    The sub-command takes either a short-read pair (FASTQ) or a query genome
    (FASTA), always together with a reference FASTA, and exposes the
    simulation and SNP/indel-calling options.

    Args:
        subparsers: Sub-parser collection exposing ``add_parser`` (as returned
            by ``ArgumentParser.add_subparsers``).
    """
    # One summary for both the command listing and the description.  Built
    # from adjacent literals (no indentation or doubled spaces leak into the
    # text) and not named ``help`` so the builtin is not shadowed.
    summary = ('Identify SNPs/indels from short reads (FASTQ). '
               'If genome sequence (FASTA) is provided instead, simulate reads and '
               'predict SNPs and indels.')

    group = subparsers.add_parser('snp_indel', description=summary,
                                  help=summary,
                                  formatter_class=argparse.HelpFormatter,
                                  add_help=False)

    # -h is registered by hand (add_help=False above) so it can live in its
    # own "Help" group next to -v.
    help_args = group.add_argument_group('Help')
    help_args.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS,
                           help='Show this help message and exit')
    help_args.add_argument('-v', '--version', action='version',
                           version='VariantDetective v' + __version__,
                           help="Show program version number and exit")

    input_args = group.add_argument_group('Input')
    input_args.add_argument('-1', '--short1', type=str, metavar='[FASTQ]',
                            help="Path to pair 1 of short reads FASTQ file. Must be combined with -2")
    input_args.add_argument('-2', '--short2', type=str, metavar='[FASTQ]',
                            help="Path to pair 2 of short reads FASTQ file. Must be combined with -1")
    input_args.add_argument('-g', '--genome', type=str, metavar='[FASTA]',
                            help="Path to query genomic FASTA file. Can't be combined with -1 or -2")
    input_args.add_argument('-r', '--reference', type=str, required=True, metavar='[FASTA]',
                            help='Path to reference genome in FASTA. Required')

    simulate_args = group.add_argument_group('Simulate')
    simulate_args.add_argument("--readcov", type=str, default='50x',
                               help='Either an absolute value (e.g. 250M) or a relative depth (e.g. 50x) (default: %(default)s)')
    simulate_args.add_argument("--readlen", type=str, default='15000,13000',
                               help='Fragment length distribution (mean,stdev) (default: %(default)s)')

    snp_args = group.add_argument_group('SNP/Indel Call')
    snp_args.add_argument("--mincov_snp", type=int, default=2,
                          help='Minimum number of reads required to call SNP/Indel (default: %(default)i)')
    snp_args.add_argument("--minqual_snp", type=int, default=20,
                          help='Minimum quality of SNP/Indel to be filtered out (default: %(default)i)')
    snp_args.add_argument("--assembler", type=str, default='bwa', choices=['bwa', 'minimap2'],
                          help='Choose which assembler (bwa or minimap2) to use when using paired-end short reads (default: %(default)s)')
    snp_args.add_argument("--snp_consensus", type=int, default=2,
                          help='Specifies the minimum number of tools required to detect an SNP or Indel to include it in the consensus list (default: %(default)i)')
    snp_args.add_argument("--custom_clair3_model", type=str,
                          help='Path to custom model for Clair3 variant calling (such as ones from Rerio)')

    other_args = group.add_argument_group('Other')
    other_args.add_argument('-o', "--out", type=str, default='./',
                            help='Output directory. Will be created if it does not exist')
    other_args.add_argument('-t', '--threads', type=int, default=1,
                            help='Number of threads used for job (default: %(default)i)')
240 | 
def combine_variants_subparser(subparsers):
    """Register the ``combine_variants`` sub-command and its options.

    The sub-command takes lists of SNP and/or SV VCF files together with the
    consensus thresholds to apply to each list.

    Args:
        subparsers: Sub-parser collection exposing ``add_parser`` (as returned
            by ``ArgumentParser.add_subparsers``).
    """
    # One summary for both the command listing and the description; not
    # named ``help`` so the builtin is not shadowed.
    summary = 'Combine VCF files predicted using other tools.'

    group = subparsers.add_parser('combine_variants', description=summary,
                                  help=summary,
                                  formatter_class=argparse.HelpFormatter,
                                  add_help=False)

    # -h is registered by hand (add_help=False above) so it can live in its
    # own "Help" group next to -v.
    help_args = group.add_argument_group('Help')
    help_args.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS,
                           help='Show this help message and exit')
    help_args.add_argument('-v', '--version', action='version',
                           version='VariantDetective v' + __version__,
                           help="Show program version number and exit")

    input_args = group.add_argument_group('Input')
    input_args.add_argument('--snp_vcf', type=str, nargs='+',
                            help="Path to SNP VCF files. Separate each VCF file path with a space.")
    input_args.add_argument("--snp_consensus", type=int,
                            help='Specifies the minimum number of tools required to detect an SNP or Indel to include it in the consensus list.')
    input_args.add_argument('--sv_vcf', type=str, nargs='+',
                            help="Path to SV VCF files. Separate each VCF file path with a space.")
    input_args.add_argument("--sv_consensus", type=int,
                            help='Specifies the minimum number of tools required to detect an SV to include it in the consensus list.')
    input_args.add_argument("--minlen_sv", type=int, default=25,
                            help='Minimum length of SV to be detected (default: %(default)i)')

    other_args = group.add_argument_group('Other')
    other_args.add_argument('-o', "--out", type=str, default='./',
                            help='Output directory. Will be created if it does not exist')
    other_args.add_argument('-t', '--threads', type=int, default=1,
                            help='Number of threads used for job (default: %(default)i)')
273 | 
def check_all_variants_args(args):
    """Validate the parsed ``all_variants`` arguments.

    The process is terminated through ``sys.exit`` with an explanatory
    message on the first failed check.  On success the two values of
    ``--readlen`` are stored on *args* as ``mean_frag_length`` and
    ``frag_length_stdev`` (floats).

    Args:
        args: Parsed argument namespace of the ``all_variants`` sub-command.

    Raises:
        SystemExit: If any input is missing, inconsistent or out of range.
    """
    # Every optional input that was given must point at an existing file.
    for path in (args.long, args.short1, args.short2, args.genome):
        if path is not None and not pathlib.Path(path).is_file():
            sys.exit(f'Error: input file {path} does not exist')
    if not pathlib.Path(args.reference).is_file():
        sys.exit(f'Error: reference file {args.reference} does not exist')

    # Inputs are either a single genome FASTA or the full set of read files.
    reads = (args.long, args.short1, args.short2)
    if args.genome is None and all(path is None for path in reads):
        sys.exit("At least one input must be specified. Must use genomic FASTA (-g) or long read FASTQ (-l), short read pair 1 FASTQ (-1) and short read pair 2 FASTQ (-2).")
    if args.genome is not None and any(path is not None for path in reads):
        sys.exit("Cannot use FASTA (-g) with other inputs.")
    if args.genome is None and any(path is None for path in reads):
        sys.exit("Must use long read FASTQ (-l), short read pair 1 FASTQ (-1) and short read pair 2 FASTQ (-2) when calling all variants.")

    # Numeric thresholds.  Messages state the bound that is actually enforced.
    if args.mincov_sv < 1:
        sys.exit('Error: minimum coverage must be at least 1')
    if args.minlen_sv < 1:
        sys.exit('Error: minimum length of SV must be at least 1')
    if args.mincov_snp < 1:
        sys.exit('Error: minimum coverage must be at least 1')
    if args.snp_consensus < 1 or args.snp_consensus > 3:
        sys.exit('Error: snp_consensus must be between 1 and 3')
    if args.sv_consensus < 1 or args.sv_consensus > 4:
        sys.exit('Error: sv_consensus must be between 1 and 4')
    if args.minqual_sv < 0:
        sys.exit('Error: minimum quality of SV cannot be negative')
    if args.minqual_snp < 0:
        sys.exit('Error: minimum quality of SNP cannot be negative')

    # --readlen is "mean,stdev"; both parts must be numbers.
    try:
        length_parameters = [float(x) for x in args.readlen.split(',')]
        args.mean_frag_length = length_parameters[0]
        args.frag_length_stdev = length_parameters[1]
    except (ValueError, IndexError):
        sys.exit('Error: could not parse --readlen values')
    # A mean of exactly 100 is rejected as well, hence "greater than".
    if args.mean_frag_length <= 100:
        sys.exit('Error: mean read length must be greater than 100')
    if args.frag_length_stdev < 0:
        sys.exit('Error: read length stdev cannot be negative')
316 | 
317 | 
def check_structural_variant_args(args):
    """Validate the parsed ``structural_variant`` arguments.

    The process is terminated through ``sys.exit`` with an explanatory
    message on the first failed check.  On success the two values of
    ``--readlen`` are stored on *args* as ``mean_frag_length`` and
    ``frag_length_stdev`` (floats).

    Args:
        args: Parsed argument namespace of the ``structural_variant``
            sub-command.

    Raises:
        SystemExit: If any input is missing, inconsistent or out of range.
    """
    # Every optional input that was given must point at an existing file.
    for path in (args.long, args.genome):
        if path is not None and not pathlib.Path(path).is_file():
            sys.exit(f'Error: input file {path} does not exist')
    if not pathlib.Path(args.reference).is_file():
        sys.exit(f'Error: reference file {args.reference} does not exist')

    # Exactly one of long reads / genome FASTA must be supplied.
    if args.long is None and args.genome is None:
        sys.exit("At least one input must be specified. Must use long-read FASTQ (-l) or genomic FASTA (-g).")
    if args.long is not None and args.genome is not None:
        sys.exit("Only one input can be specified. Can't use FASTQ (-l) and FASTA (-g) together.")

    # Numeric thresholds.  Messages state the bound that is actually enforced.
    if args.mincov_sv < 1:
        sys.exit('Error: minimum coverage must be at least 1')
    if args.minlen_sv < 1:
        sys.exit('Error: minimum length of SV must be at least 1')
    if args.sv_consensus < 1 or args.sv_consensus > 4:
        sys.exit('Error: sv_consensus must be between 1 and 4')
    if args.minqual_sv < 0:
        sys.exit('Error: minimum quality of SV cannot be negative')

    # --readlen is "mean,stdev"; both parts must be numbers.
    try:
        length_parameters = [float(x) for x in args.readlen.split(',')]
        args.mean_frag_length = length_parameters[0]
        args.frag_length_stdev = length_parameters[1]
    except (ValueError, IndexError):
        sys.exit('Error: could not parse --readlen values')
    # A mean of exactly 100 is rejected as well, hence "greater than".
    if args.mean_frag_length <= 100:
        sys.exit('Error: mean read length must be greater than 100')
    if args.frag_length_stdev < 0:
        sys.exit('Error: read length stdev cannot be negative')
348 | 
def check_snp_indel_args(args):
    """Validate the parsed ``snp_indel`` arguments.

    The process is terminated through ``sys.exit`` with an explanatory
    message on the first failed check.  On success the two values of
    ``--readlen`` are stored on *args* as ``mean_frag_length`` and
    ``frag_length_stdev`` (floats).

    Args:
        args: Parsed argument namespace of the ``snp_indel`` sub-command.

    Raises:
        SystemExit: If any input is missing, inconsistent or out of range.
    """
    # Every optional input that was given must point at an existing file.
    for path in (args.short1, args.short2, args.genome):
        if path is not None and not pathlib.Path(path).is_file():
            sys.exit(f'Error: input file {path} does not exist')
    if not pathlib.Path(args.reference).is_file():
        sys.exit(f'Error: reference file {args.reference} does not exist')

    # Inputs are either a single genome FASTA or both short-read files.
    reads = (args.short1, args.short2)
    if args.genome is None and all(path is None for path in reads):
        sys.exit("At least one input must be specified. Must use genomic FASTA (-g) or short read pair 1 FASTQ (-1) and short read pair 2 FASTQ (-2).")
    if args.genome is not None and any(path is not None for path in reads):
        sys.exit("Cannot use FASTA (-g) with other inputs.")
    if args.genome is None and any(path is None for path in reads):
        sys.exit("Must use short read pair 1 FASTQ (-1) and short read pair 2 FASTQ (-2) when calling SNPs and indels.")

    # Numeric thresholds.  Messages state the bound that is actually enforced.
    if args.mincov_snp < 1:
        sys.exit('Error: minimum coverage must be at least 1')
    if args.snp_consensus < 1 or args.snp_consensus > 3:
        sys.exit('Error: snp_consensus must be between 1 and 3')
    if args.minqual_snp < 0:
        sys.exit('Error: minimum quality of SNP cannot be negative')

    # --readlen is "mean,stdev"; both parts must be numbers.
    try:
        length_parameters = [float(x) for x in args.readlen.split(',')]
        args.mean_frag_length = length_parameters[0]
        args.frag_length_stdev = length_parameters[1]
    except (ValueError, IndexError):
        sys.exit('Error: could not parse --readlen values')
    # A mean of exactly 100 is rejected as well, hence "greater than".
    if args.mean_frag_length <= 100:
        sys.exit('Error: mean read length must be greater than 100')
    if args.frag_length_stdev < 0:
        sys.exit('Error: read length stdev cannot be negative')
382 | 
def check_combine_variants_args(args):
    """Validate the parsed ``combine_variants`` arguments.

    For each VCF list that was supplied (SNP first, then SV) this checks that
    more than one file was given, that the matching consensus threshold is
    set and does not exceed the number of files, and that every file exists.

    Args:
        args: Parsed argument namespace of the ``combine_variants``
            sub-command.

    Raises:
        SystemExit: On the first failed check.
    """
    # (label used in the message, VCF list, threshold, flag naming the threshold)
    variant_sets = (
        ('SNP', args.snp_vcf, args.snp_consensus, '--snp_consensus'),
        ('SV', args.sv_vcf, args.sv_consensus, '--sv_consensus'),
    )
    for label, vcf_files, consensus, flag in variant_sets:
        if vcf_files is None:
            continue
        num_vcf = len(vcf_files)
        if num_vcf == 1:
            sys.exit('Error: must have more than 1 VCF to create consensus set.')
        if consensus is None:
            # Name the variant type and flag that belong to this list.
            sys.exit(f'Error: must specify the minimum number of tools required to include {label} in the consensus list using {flag}.')
        if consensus > num_vcf:
            sys.exit('Error: minimum number of consensus VCF files is larger than number of VCF files provided.')
        for vcf_file in vcf_files:
            if not pathlib.Path(vcf_file).is_file():
                sys.exit(f'Error: VCF file {vcf_file} does not exist.')
407 | 
def check_python_version():
    """Exit with an error unless the interpreter is Python 3.6 or later.

    Raises:
        SystemExit: If the running interpreter is older than 3.6.
    """
    # Compare (major, minor) as a pair: testing the two fields independently
    # would also reject any later major release whose minor number is below 6.
    if sys.version_info < (3, 6):
        sys.exit('Error: VariantDetective requires Python 3.6 or later')
411 | 
def copy_file(file1, file2):
    """Copy *file1* to *file2*, doing nothing when both are the same file."""
    try:
        shutil.copyfile(file1, file2)
    except shutil.SameFileError:
        # Source and destination already refer to one file; there is nothing to copy.
        return
417 | 
def copy_inputs(args):
    """Copy the input files of the selected sub-command into ``args.out``.

    For ``combine_variants`` the SNP and SV VCF lists are copied.  For every
    other sub-command the reference is always copied, followed by whichever
    of the genome, short-read and long-read inputs were supplied and are
    relevant to that sub-command.
    """
    def stage(path):
        # Destination name is derived by get_new_filename from the source path.
        copy_file(path, get_new_filename(path, args.out))

    command = args.subparser_name

    if command == 'combine_variants':
        for vcf_group in (args.snp_vcf, args.sv_vcf):
            if vcf_group is not None:
                for vcf_file in vcf_group:
                    stage(vcf_file)
        return

    # The reference is mandatory for all remaining sub-commands.
    stage(args.reference)

    if command == 'structural_variant':
        optional_inputs = (args.genome, args.long)
    elif command == 'snp_indel':
        optional_inputs = (args.genome, args.short1, args.short2)
    else:
        optional_inputs = (args.genome, args.short1, args.short2, args.long)

    for path in optional_inputs:
        if path is not None:
            stage(path)
445 | 
def create_outdir(args):
    """Create the output directory ``args.out`` (and any parents) if needed.

    Raises:
        OSError: If the directory cannot be created, e.g. because the path
            exists and is not a directory.
    """
    # exist_ok avoids the gap between "does it exist?" and "create it" in
    # which another process could create the directory first.
    os.makedirs(args.out, exist_ok=True)
449 |     
class NoSubparsersMetavarFormatter(argparse.HelpFormatter):
    """
    Custom argparse help formatter for parsers that use sub-commands.

    In the sub-command argument group it drops the sub-parsers metavar and
    help line, and lists the sub-commands without the extra level of
    indentation.
    https://stackoverflow.com/questions/11070268/
    """

    def _format_action(self, action):
        formatted = super()._format_action(action)
        if not isinstance(action, argparse._SubParsersAction):
            return formatted
        # Strip the leading whitespace and re-apply the current indent so the
        # first sub-command lines up with the rest.
        return "%*s%s" % (self._current_indent, "", formatted.lstrip())

    def _format_action_invocation(self, action):
        if not isinstance(action, argparse._SubParsersAction):
            return super()._format_action_invocation(action)
        # No metavar / help line for the sub-parsers action itself.
        return ""

    def _iter_indented_subactions(self, action):
        if not isinstance(action, argparse._SubParsersAction):
            yield from super()._iter_indented_subactions(action)
            return
        # Yield the sub-commands directly, i.e. without indenting them.
        get_subactions = getattr(action, '_get_subactions', None)
        if get_subactions is not None:
            yield from get_subactions()
482 | 
# Script entry point: call main() only when this file is executed directly.
if __name__ == '__main__':
    main()
485 | 


--------------------------------------------------------------------------------
/variantdetective/simulate.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This file contains code required to run the long read simulation of VariantDetective.
  3 | 
  4 | Copyright (C) 2024 Phil Charron (phil.charron@inspection.gc.ca)
  5 | https://github.com/OLF-Bioinformatics/VariantDetective
  6 | Portions Copyright (C) 2018 Ryan Wick (rrwick@gmail.com)
  7 | https://github.com/rrwick/Badread
  8 | """
  9 | import datetime
 10 | import multiprocessing
 11 | import os 
 12 | import random
 13 | import sys
 14 | import uuid
 15 | from .simulate_tools import load_fasta, reverse_complement, random_chance
 16 | from .fragment_lengths import FragmentLengths
 17 | from .version import __version__
 18 | 
 19 | def simulate(args, input_fasta, output=sys.stderr):
 20 |     split_path = os.path.splitext(input_fasta)
 21 |     if split_path[1] == ".gz":
 22 |         output_name = os.path.splitext(split_path[0])[0] + ".fastq"
 23 |     else:
 24 |         output_name = split_path[0] + ".fastq"
 25 |     print(str(datetime.datetime.now().replace(microsecond=0)) + '\tSimulating long-reads from genomic sequence...', file=output)
 26 |     ref_seqs, ref_depths, ref_circular = load_reference(input_fasta)
 27 |     rev_comp_ref_seqs = {name: reverse_complement(seq) for name, seq in ref_seqs.items()}
 28 |     frag_lengths = FragmentLengths(args.mean_frag_length, args.frag_length_stdev, output)
 29 |     adjust_depths(ref_seqs, ref_depths, ref_circular, frag_lengths, args)   
 30 |     ref_contigs, ref_contig_weights = get_ref_contig_weights(ref_seqs, ref_depths)    
 31 |     ref_size = sum(len(x) for x in ref_seqs.values())
 32 |     target_size = get_target_size(ref_size, args.readcov)
 33 |     process_size = target_size   
 34 |     output_file = open(output_name, 'w')
 35 | 
 36 |     process_list = []
 37 |     p =  multiprocessing.Process(target= generate_reads,
 38 |                                     args = [process_size, frag_lengths, ref_seqs,
 39 |                                     rev_comp_ref_seqs, ref_contigs,
 40 |                                     ref_contig_weights, ref_circular, output_file])
 41 |     p.start()
 42 |     process_list.append(p)
 43 |     for process in process_list:
 44 |         process.join()
 45 | 
 46 |     output_file.close()
 47 |     return output_name
 48 | 
 49 | def generate_reads(target_size, frag_lengths, ref_seqs, rev_comp_ref_seqs,
 50 |                    ref_contigs, ref_contig_weights, ref_circular, output_file):
 51 |     total_size = 0
 52 |     count = 0
 53 | 
 54 |     while total_size < target_size:
 55 |         fragment, info = build_fragment(frag_lengths, ref_seqs, rev_comp_ref_seqs, ref_contigs,
 56 |                                         ref_contig_weights, ref_circular)     
 57 |         quals = 'S'*len(fragment)
 58 | 
 59 |         if len(fragment) == 0:
 60 |             continue
 61 | 
 62 |         info.append(f'length={len(fragment)}')
 63 | 
 64 |         read_name = uuid.UUID(int=random.getrandbits(128))
 65 |         info = ' '.join(info)
 66 |         print(f'@{read_name} {info}\n'+ fragment + '\n+\n' + quals, file=output_file)
 67 | 
 68 |      
 69 |         total_size += len(fragment)
 70 |         count += 1  
 71 | 
 72 | def adjust_depths(ref_seqs, ref_depths, ref_circular, frag_lengths, args):
 73 |     sampled_lengths = [frag_lengths.get_fragment_length() for x in range(100000)]
 74 |     total = sum(sampled_lengths)
 75 |     for ref_name, ref_seq in ref_seqs.items():
 76 |         ref_len = len(ref_seq)
 77 |         ref_circ = ref_circular[ref_name]
 78 | 
 79 |         # Circular plasmids may have to have their depth increased due compensate for misses.
 80 |         if ref_circ:
 81 |             passing_total = sum(length for length in sampled_lengths if length <= ref_len)
 82 |             if passing_total == 0:
 83 |                 sys.exit('Error: fragment length distribution incompatible with reference lengths.')
 84 |             adjustment = total / passing_total
 85 |             ref_depths[ref_name] *= adjustment
 86 | 
 87 |         # Linear plasmids may have to have their depth increased due compensate for truncations.
 88 |         if not ref_circ:
 89 |             passing_total = sum(min(ref_len, length) for length in sampled_lengths)
 90 |             adjustment = total / passing_total
 91 |             ref_depths[ref_name] *= adjustment
 92 | 
 93 | def build_fragment(frag_lengths, ref_seqs, rev_comp_ref_seqs, ref_contigs, ref_contig_weights,
 94 |                    ref_circular):
 95 |     info = []
 96 |     frag_seq, frag_info = get_fragment(frag_lengths, ref_seqs, rev_comp_ref_seqs,
 97 |                                        ref_contigs, ref_contig_weights, ref_circular)
 98 |     info.append(','.join(frag_info))
 99 |     return frag_seq, info
100 | 
def get_fragment(frag_lengths, ref_seqs, rev_comp_ref_seqs, ref_contigs, ref_contig_weights,
                 ref_circular):
    """Draw a fragment length and return a non-empty (sequence, info) pair.

    ``get_real_fragment`` may come back empty, so it is retried up to 1000
    times with the same length; the program exits if every attempt fails.
    """
    fragment_length = frag_lengths.get_fragment_length()

    attempts = 0
    while attempts < 1000:
        seq, info = get_real_fragment(fragment_length, ref_seqs, rev_comp_ref_seqs, ref_contigs,
                                      ref_contig_weights, ref_circular)
        if seq != '':
            return seq, info
        attempts += 1

    sys.exit('Error: failed to generate any sequence fragments - are your read lengths '
             'incompatible with your reference contig lengths?')
114 | 
def get_real_fragment(fragment_length, ref_seqs, rev_comp_ref_seqs, ref_contigs,
                      ref_contig_weights, ref_circular):
    """Cut one fragment of *fragment_length* from a randomly chosen contig and strand.

    Returns:
        ``(sequence, info)`` where *info* lists the contig name, the strand
        and the ``start-end`` range, or ``('', '')`` when the fragment is
        longer than the chosen circular contig.
    """
    # Pick the contig (weighted, unless there is only one) ...
    if len(ref_contigs) == 1:
        contig = ref_contigs[0]
    else:
        contig = random.choices(ref_contigs, weights=ref_contig_weights)[0]

    # ... and the strand.
    if random_chance(0.5):
        seq, strand = ref_seqs[contig], '+strand'
    else:
        seq, strand = rev_comp_ref_seqs[contig], '-strand'
    info = [contig, strand]

    seq_len = len(seq)
    is_circular = ref_circular[contig]

    # Linear contig no longer than the fragment: the read is the whole contig.
    if not is_circular and fragment_length >= seq_len:
        info.append('0-' + str(seq_len))
        return seq, info

    # Circular contig shorter than the fragment: no read can be produced.
    if is_circular and fragment_length > seq_len:
        return '', ''

    start_pos = random.randint(0, seq_len - 1)
    end_pos = start_pos + fragment_length
    info.append(f'{start_pos}-{end_pos}')

    # Circular contigs wrap around when the read runs past the end.
    if is_circular and end_pos > seq_len:
        looped_end_pos = end_pos - seq_len
        assert looped_end_pos > 0
        return seq[start_pos:] + seq[:looped_end_pos], info

    # Otherwise a plain slice; on linear contigs an end position past the
    # contig end simply yields a read shorter than the fragment length.
    return seq[start_pos:end_pos], info
159 | 
def get_ref_contig_weights(ref_seqs, ref_depths):
    """Return the contig names and, in the same order, depth times sequence length for each."""
    ref_contigs = list(ref_depths)
    ref_contig_weights = [depth * len(ref_seqs[name]) for name, depth in ref_depths.items()]
    return ref_contigs, ref_contig_weights
164 | 
def get_target_size(ref_size, coverage):
    """
    Converts a quantity specification into an absolute number of bases.

    coverage may be anything int() accepts (returned as an int directly), or a string made of
    a number followed by a one-letter, case-insensitive suffix:
        x - multiple of ref_size
        g - billions
        m - millions
        k - thousands
    Strings that cannot be interpreted yield None.
    """
    try:
        return int(coverage)
    except ValueError:
        pass
    spec = coverage.lower()
    multipliers = {'x': ref_size, 'g': 1000000000, 'm': 1000000, 'k': 1000}
    try:
        suffix = spec[-1]
        number = float(spec[:-1])
        if suffix in multipliers:
            return int(round(number * multipliers[suffix]))
    except (ValueError, IndexError):
        pass
    return None
184 | 
def load_reference(reference):
    """
    Loads the reference FASTA via load_fasta and returns its three results as a tuple:
    (sequences, depths, circular flags).
    """
    sequences, depths, circular_flags = load_fasta(reference)
    return sequences, depths, circular_flags


--------------------------------------------------------------------------------
/variantdetective/simulate_tools.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This file contains functions that are used in the long read simulation
  3 | component of VariantDetective.
  4 | 
  5 | Copyright (C) 2024 Phil Charron (phil.charron@inspection.gc.ca)
  6 | https://github.com/OLF-Bioinformatics/VariantDetective
  7 | Portions Copyright (C) 2018 Ryan Wick (rrwick@gmail.com)
  8 | https://github.com/rrwick/Badread
  9 | """
 10 | 
 11 | import collections
 12 | import gzip
 13 | import os
 14 | import random
 15 | import re
 16 | import sys
 17 | 
 18 | def complement_base(base):
 19 |     try:
 20 |         return REV_COMP_DICT[base]
 21 |     except KeyError:
 22 |         return 'N'
 23 | 
 24 | REV_COMP_DICT = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'a': 't', 't': 'a',
 25 |                  'g': 'c', 'c': 'g', 'R': 'Y', 'Y': 'R', 'S': 'S', 'W': 'W',
 26 |                  'K': 'M', 'M': 'K', 'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D', 
 27 |                  'N': 'N', 'r': 'y', 'y': 'r', 's': 's', 'w': 'w', 'k': 'm',
 28 |                  'm': 'k', 'b': 'v', 'v': 'b', 'd': 'h', 'h': 'd', 'n': 'n', 
 29 |                  '.': '.', '-': '-', '?': '?'}
 30 | 
 31 | def get_compression_type(filename):
 32 |     """
 33 |     Attempts to guess the compression (if any) on a file using the first few bytes.
 34 |     http://stackoverflow.com/questions/13044562
 35 |     """
 36 |     magic_dict = {'gz': (b'\x1f', b'\x8b', b'\x08'),
 37 |                   'bz2': (b'\x42', b'\x5a', b'\x68'),
 38 |                   'zip': (b'\x50', b'\x4b', b'\x03', b'\x04')}
 39 |     max_len = max(len(x) for x in magic_dict)
 40 | 
 41 |     unknown_file = open(filename, 'rb')
 42 |     file_start = unknown_file.read(max_len)
 43 |     unknown_file.close()
 44 |     compression_type = 'plain'
 45 |     for file_type, magic_bytes in magic_dict.items():
 46 |         if file_start.startswith(magic_bytes):
 47 |             compression_type = file_type
 48 |     if compression_type == 'bz2':
 49 |         sys.exit('Error: cannot use bzip2 format - use gzip instead')
 50 |     if compression_type == 'zip':
 51 |         sys.exit('Error: cannot use zip format - use gzip instead')
 52 |     return compression_type
 53 | 
 54 | def get_open_func(filename):
 55 |     if get_compression_type(filename) == 'gz':
 56 |         return gzip.open
 57 |     else:  # plain text
 58 |         return open
 59 | 
 60 | def get_sequence_file_type(filename):
 61 |     """
 62 |     Determines whether a file is FASTA.
 63 |     """
 64 |     if not os.path.isfile(filename):
 65 |         sys.exit('Error: could not find {}'.format(filename))
 66 |     if get_compression_type(filename) == 'gz':
 67 |         open_func = gzip.open
 68 |     else:  # plain text
 69 |         open_func = open
 70 |     with open_func(filename, 'rt') as seq_file:
 71 |         try:
 72 |             first_char = seq_file.read(1)
 73 |         except UnicodeDecodeError:
 74 |             first_char = ''
 75 |     if first_char == '>':
 76 |         return 'FASTA'
 77 |     else:
 78 |         raise ValueError('File is not FASTA')
 79 | 
 80 | def load_fasta(filename):
 81 |     if get_sequence_file_type(filename) != 'FASTA':
 82 |         sys.exit('Error: {} is not FASTA format'.format(filename))
 83 |     fasta_seqs = collections.OrderedDict()
 84 |     depths, circular = {}, {}
 85 |     p = re.compile(r'depth=([\d.]+)')
 86 |     with get_open_func(filename)(filename, 'rt') as fasta_file:
 87 |         name = ''
 88 |         sequence = []
 89 |         for line in fasta_file:
 90 |             line = line.strip()
 91 |             if not line:
 92 |                 continue
 93 |             if line[0] == '>':  # Header line = start of new contig
 94 |                 if name:
 95 |                     fasta_seqs[name.split()[0]] = ''.join(sequence)
 96 |                     sequence = []
 97 |                 name = line[1:]
 98 |                 short_name = name.split()[0]
 99 |                 if 'depth=' in name.lower():
100 |                     try:
101 |                         depths[short_name] = float(p.search(name.lower()).group(1))
102 |                     except (ValueError, AttributeError):
103 |                         depths[short_name] = 1.0
104 |                 else:
105 |                     depths[short_name] = 1.0
106 |                 circular[short_name] = 'circular=true' in name.lower()
107 |             else:
108 |                 sequence.append(line)
109 |         if name:
110 |             fasta_seqs[name.split()[0]] = ''.join(sequence)
111 |     return fasta_seqs, depths, circular
112 | 
def random_chance(chance):
    """
    Returns True with probability chance, using one draw from random.random().
    chance must lie in the range 0.0 to 1.0 (checked with an assertion).
    """
    assert 0.0 <= chance <= 1.0
    draw = random.random()
    return draw < chance
116 | 
def reverse_complement(seq):
    """
    Returns the reverse complement of seq: every base is complemented with complement_base
    and the result is reversed.
    """
    complemented = ''.join(map(complement_base, seq))
    return complemented[::-1]
119 | 


--------------------------------------------------------------------------------
/variantdetective/snp_indel.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This file contains code for the snp_indel subcommand.
  3 | 
  4 | Copyright (C) 2024 Phil Charron (phil.charron@inspection.gc.ca)
  5 | https://github.com/OLF-Bioinformatics/VariantDetective
  6 | """
  7 | 
  8 | import os
  9 | import sys
 10 | import io
 11 | import pandas as pd
 12 | import datetime
 13 | import psutil
 14 | import subprocess
 15 | from .tools import get_new_filename, run_process, read_vcf, generate_tab_csv_snp_summary
 16 | 
 17 | def snp_indel(args, snp_input, output=sys.stderr):
 18 |     print(str(datetime.datetime.now().replace(microsecond=0)) + '\tStarting snp_indel tool', file=output)
 19 |     reference = get_new_filename(args.reference, args.out)   
 20 |     snp_indel_outdir = os.path.join(args.out, 'snp_indel')
 21 |     haplotypecaller_outdir = os.path.join(snp_indel_outdir, 'haplotypecaller')
 22 |     freebayes_outdir = os.path.join(snp_indel_outdir, 'freebayes')
 23 |     clair3_outdir = os.path.join(snp_indel_outdir, 'clair3')
 24 |     
 25 |     if not os.path.isdir(snp_indel_outdir):
 26 |         os.makedirs(snp_indel_outdir)
 27 |     if not os.path.isdir(haplotypecaller_outdir):
 28 |         os.makedirs(haplotypecaller_outdir)
 29 |     if not os.path.isdir(freebayes_outdir):
 30 |         os.makedirs(freebayes_outdir)
 31 |     if not os.path.isdir(clair3_outdir):
 32 |         os.makedirs(clair3_outdir)
 33 | 
 34 |     # Map reads if using short reads
 35 |     if isinstance(snp_input, list):
 36 |         bam_file_dir = snp_indel_outdir
 37 |         rgpl = 'ILLUMINA'
 38 |         if args.assembler == 'minimap2':
 39 |             print(str(datetime.datetime.now().replace(microsecond=0)) + '\tRunning minimap2...', file=output)
 40 |             command = 'minimap2 -t ' + str(args.threads) + ' -ax sr '
 41 |         elif args.assembler == 'bwa':
 42 |             print(str(datetime.datetime.now().replace(microsecond=0)) + '\tRunning bwa...', file=output)
 43 |             command = 'bwa index ' + reference
 44 |             run_process(command)
 45 |             command = 'bwa mem -t ' + str(args.threads) + ' '
 46 |         command += reference + ' ' + snp_input[0] + ' ' + snp_input[1] + \
 47 |         ' | samtools view -Sb - -@ ' + str(args.threads) + \
 48 |         ' | samtools sort -n - -@ ' +  str(args.threads) + \
 49 |         ' | samtools fixmate -m - - -@ ' + str(args.threads) + \
 50 |         ' | samtools sort - -@ ' + str(args.threads) + \
 51 |         ' | samtools markdup -r - -@ ' + str(args.threads) + ' ' + \
 52 |         bam_file_dir + '/alignment.sorted.bam'
 53 |         run_process(command)
 54 |     elif args.subparser_name == 'snp_indel':
 55 |         bam_file_dir = snp_indel_outdir
 56 |         rgpl = 'ONT'
 57 |         print(str(datetime.datetime.now().replace(microsecond=0)) + '\tRunning minimap2...', file=output)
 58 |         command = 'minimap2 -t ' + str(args.threads) + ' -ax map-ont ' + \
 59 |         reference + ' ' + snp_input + \
 60 |         ' | samtools view -Sb - -@ ' + str(args.threads) + \
 61 |         ' | samtools sort - -@ ' + str(args.threads) + \
 62 |         ' -o ' + bam_file_dir + '/alignment.sorted.bam'
 63 |         run_process(command)
 64 |     else:
 65 |         bam_file_dir = os.path.join(args.out, 'structural_variant')
 66 |         rgpl = 'ONT'
 67 | 
 68 |     # Run Picard 
 69 |     reference_base = os.path.splitext(reference)[0]
 70 |     dict_file = reference_base + ".dict"
 71 |     if os.path.exists(dict_file):
 72 |         os.remove(dict_file)
 73 |     command = 'picard CreateSequenceDictionary R=' + reference
 74 |     run_process(command)
 75 |     command = 'picard AddOrReplaceReadGroups I=' + bam_file_dir + '/alignment.sorted.bam O=' + \
 76 |         snp_indel_outdir + '/alignment.rg.sorted.bam RGID=1 RGLB=SAMPLE RGSM=SAMPLE RGPU=SAMPLE RGPL=' + rgpl
 77 |     run_process(command)
 78 |     command = 'samtools index ' + snp_indel_outdir + '/alignment.rg.sorted.bam'
 79 |     run_process(command)
 80 |     try:
 81 |         command = 'rm ' + snp_indel_outdir + '/alignment.sorted.bam'
 82 |         run_process(command)
 83 |     except:
 84 |         pass
 85 |     try:
 86 |         command = 'samtools faidx ' + reference
 87 |         run_process(command)
 88 |     except:
 89 |         pass
 90 | 
 91 | 
 92 |     # Run Freebayes
 93 |     print(str(datetime.datetime.now().replace(microsecond=0)) + '\tRunning Freebayes...', file=output)
 94 |     command = 'freebayes-parallel <(fasta_generate_regions.py ' + reference + '.fai 100000) ' + \
 95 |             str(args.threads) + ' -f ' + reference + ' ' + \
 96 |             snp_indel_outdir + '/alignment.rg.sorted.bam -p 1 > ' + \
 97 |             freebayes_outdir + '/freebayes.vcf'
 98 |     run_process(command)
 99 | 
100 |     command = 'vcffilter -f "QUAL > ' + str(args.minqual_snp) + '" ' + freebayes_outdir + '/freebayes.vcf > ' + \
101 |         freebayes_outdir + '/freebayes.filt.vcf'
102 |     run_process(command)
103 | 
104 |     command = 'bgzip -c -@ ' + str(args.threads) + ' ' + \
105 |         freebayes_outdir + '/freebayes.filt.vcf > ' + \
106 |         freebayes_outdir + '/freebayes.filt.vcf.gz'
107 |     run_process(command)
108 | 
109 |     command = 'tabix -p vcf ' + freebayes_outdir + '/freebayes.filt.vcf.gz'
110 |     run_process(command)
111 | 
112 |     # Run HaplotypeCaller
113 |     print(str(datetime.datetime.now().replace(microsecond=0)) + '\tRunning HaplotypeCaller...', file=output)
114 |     total_memory_gb = psutil.virtual_memory().total / (1024 ** 3)
115 |     ninety_percent_memory_gb = 0.9 * total_memory_gb
116 |     xmx_value = f"-Xmx{int(ninety_percent_memory_gb)}G"
117 |     xss_value = f"-Xss{int(ninety_percent_memory_gb // 10)}M"  # Assuming we use 10% of Xmx for Xss, and convert to MB
118 |     if xss_value == "-Xss0M":
119 |         xss_value = "-Xss1M"
120 |     command = f'gatk HaplotypeCaller --java-options "{xmx_value} {xss_value}" -R ' + reference + \
121 |         ' -I ' + snp_indel_outdir + '/alignment.rg.sorted.bam' + \
122 |         ' -O ' + haplotypecaller_outdir + '/haplotypecaller.vcf' + \
123 |         ' -ploidy 1'
124 |     run_process(command)
125 |     command = 'vcffilter -f "QD > ' + str(args.minqual_snp) + '" ' + haplotypecaller_outdir + '/haplotypecaller.vcf > ' + \
126 |         haplotypecaller_outdir + '/haplotypecaller.filt.vcf'
127 |     run_process(command)
128 | 
129 |     command = 'bgzip -c -@ ' + str(args.threads) + ' ' + \
130 |         haplotypecaller_outdir + '/haplotypecaller.filt.vcf > ' + \
131 |         haplotypecaller_outdir + '/haplotypecaller.filt.vcf.gz'
132 |     run_process(command)
133 | 
134 |     command = 'tabix -p vcf ' + haplotypecaller_outdir + '/haplotypecaller.filt.vcf.gz'
135 |     run_process(command)
136 | 
137 |     # Run Clair3
138 |     print(str(datetime.datetime.now().replace(microsecond=0)) + '\tRunning Clair3...', file=output)
139 |     #model_path = pkg_resources.resource_filename('variantdetective', 'clair3_models/ilmn')
140 |     
141 |     if args.custom_clair3_model is not None:
142 |         model_path = args.custom_clair3_model
143 | 
144 |     else:
145 |         command = "dirname $(which run_clair3.sh)"
146 |         # Run the command and capture the output
147 |         try:
148 |             bin_output = subprocess.check_output(command, shell=True, universal_newlines=True).strip()
149 |         except subprocess.CalledProcessError as e:
150 |             print("Error:", e)
151 |         model_path = bin_output + "/models/ilmn"
152 | 
153 |     command = 'run_clair3.sh -f ' + reference + \
154 |         ' -b ' + snp_indel_outdir + '/alignment.rg.sorted.bam' + \
155 |         ' -o ' + clair3_outdir + \
156 |         ' -p "ilmn" -m ' + model_path + ' --include_all_ctgs ' + \
157 |         ' --no_phasing_for_fa --haploid_precise -t ' + str(args.threads)
158 |     run_process(command)
159 |     
160 |     command = 'mv ' + clair3_outdir + '/merge_output.vcf.gz ' + clair3_outdir + '/clair3.vcf.gz'
161 |     run_process(command)
162 | 
163 |     command = 'gunzip -f ' + clair3_outdir + '/clair3.vcf.gz'
164 |     run_process(command)
165 | 
166 |     command = 'vcffilter -f "QUAL > ' + str(args.minqual_snp) + ' & FILTER = PASS" ' + clair3_outdir + '/clair3.vcf > ' + \
167 |         clair3_outdir + '/clair3.filt.vcf'
168 |     run_process(command)
169 | 
170 |     command = 'bgzip -c -@ ' + str(args.threads) + ' ' + \
171 |         clair3_outdir + '/clair3.filt.vcf > ' + \
172 |         clair3_outdir + '/clair3.filt.vcf.gz'
173 |     run_process(command)
174 | 
175 |     command = 'tabix -p vcf ' + clair3_outdir + '/clair3.filt.vcf.gz'
176 |     run_process(command)
177 | 
178 |     # Combine Variants
179 |     print(str(datetime.datetime.now().replace(microsecond=0)) + '\tCombining variants...', file=output)
180 |     def has_variants(vcf_file):
181 |         """Check if a VCF file has variants."""
182 |         command = f"zcat {vcf_file} | grep -v '#' | wc -l"
183 |         try:
184 |             result = subprocess.check_output(command, shell=True, universal_newlines=True).strip()
185 |             return int(result) > 0
186 |         except subprocess.CalledProcessError as e:
187 |             print("Error:", e)
188 |             return False
189 | 
190 |     def create_vcf_if_not_exists(source_file, dest_file, snp_indel_outdir):
191 |         if not os.path.exists(dest_file):
192 |             command = f"zcat {snp_indel_outdir}/{source_file} | grep '#' | bgzip -c > {snp_indel_outdir}/{dest_file}"
193 |             run_process(command)
194 |     
195 |     def run_tabix(vcf_file, snp_indel_outdir):
196 |         command = 'tabix -p vcf ' + snp_indel_outdir + '/' + vcf_file
197 |         run_process(command)
198 | 
199 |     vcf_files = [
200 |         freebayes_outdir + '/freebayes.filt.vcf.gz',
201 |         haplotypecaller_outdir + '/haplotypecaller.filt.vcf.gz',
202 |         clair3_outdir + '/clair3.filt.vcf.gz'
203 |     ]
204 |     
205 |     # Filter out files with no variants
206 |     valid_vcf_files = [file for file in vcf_files if has_variants(file)]
207 | 
208 |     if len(valid_vcf_files) > 0:
209 |         command = 'vcf-isec -p ' + snp_indel_outdir + '/snp_ ' + ' '.join(valid_vcf_files)
210 |         run_process(command)
211 |         if len(valid_vcf_files) == 3:
212 |             source_vcf = 'snp_0_1_2.vcf.gz'
213 |             dest_vcfs = ['snp_0_1.vcf.gz', 'snp_0_2.vcf.gz', 'snp_1_2.vcf.gz',
214 |                         'snp_0.vcf.gz', 'snp_1.vcf.gz', 'snp_2.vcf.gz']
215 |             run_tabix(source_vcf, snp_indel_outdir)
216 |             for dest_vcf in dest_vcfs:
217 |                 create_vcf_if_not_exists(source_vcf, dest_vcf, snp_indel_outdir)
218 |                 run_tabix(dest_vcf, snp_indel_outdir)
219 |         elif len(valid_vcf_files) == 2:
220 |             source_vcf = 'snp_0_1.vcf.gz'
221 |             dest_vcfs = ['snp_0_1_2.vcf.gz', 'snp_0_2.vcf.gz', 'snp_1_2.vcf.gz',
222 |                         'snp_0.vcf.gz', 'snp_1.vcf.gz', 'snp_2.vcf.gz']
223 |             run_tabix(source_vcf, snp_indel_outdir)
224 |             for dest_vcf in dest_vcfs:
225 |                 create_vcf_if_not_exists(source_vcf, dest_vcf, snp_indel_outdir)
226 |                 run_tabix(dest_vcf, snp_indel_outdir) 
227 |         elif len(valid_vcf_files) == 1:
228 |             source_vcf = 'snp_0.vcf.gz'
229 |             dest_vcfs = ['snp_0_1_2.vcf.gz', 'snp_0_1.vcf.gz', 'snp_0_2.vcf.gz',
230 |                         'snp_1_2.vcf.gz', 'snp_1.vcf.gz', 'snp_2.vcf.gz']
231 |             run_tabix(source_vcf, snp_indel_outdir)
232 |             for dest_vcf in dest_vcfs:
233 |                 create_vcf_if_not_exists(source_vcf, dest_vcf, snp_indel_outdir)
234 |                 run_tabix(dest_vcf, snp_indel_outdir)
235 |         if args.snp_consensus == 3:
236 |             command = 'gunzip -c ' + snp_indel_outdir + '/snp_0_1_2.vcf.gz > ' + \
237 |                 snp_indel_outdir + '/snp_final.vcf' 
238 |             run_process(command)
239 |         elif args.snp_consensus == 2:
240 |             command = 'bcftools concat -a ' + snp_indel_outdir + '/snp_0_1_2.vcf.gz ' + \
241 |                 snp_indel_outdir + '/snp_0_1.vcf.gz ' + \
242 |                 snp_indel_outdir + '/snp_0_2.vcf.gz ' + \
243 |                 snp_indel_outdir + '/snp_1_2.vcf.gz ' + \
244 |                 '-o ' + snp_indel_outdir + '/snp_final.vcf'
245 |             run_process(command)
246 |         elif args.snp_consensus == 1:
247 |             command = 'bcftools concat -a ' + snp_indel_outdir + '/snp_0_1_2.vcf.gz ' + \
248 |                 snp_indel_outdir + '/snp_0_1.vcf.gz ' + \
249 |                 snp_indel_outdir + '/snp_0_2.vcf.gz ' + \
250 |                 snp_indel_outdir + '/snp_1_2.vcf.gz ' + \
251 |                 snp_indel_outdir + '/snp_0.vcf.gz ' + \
252 |                 snp_indel_outdir + '/snp_1.vcf.gz ' + \
253 |                 snp_indel_outdir + '/snp_2.vcf.gz ' + \
254 |                 '-o ' + snp_indel_outdir + '/snp_final.vcf'
255 |             run_process(command)
256 |         if len(valid_vcf_files) == 3:    
257 |             command = 'mv ' + snp_indel_outdir + '/snp_0.vcf.gz ' + snp_indel_outdir + '/freebayes.unique.vcf.gz ; ' + \
258 |                     'mv ' + snp_indel_outdir + '/snp_1.vcf.gz ' + snp_indel_outdir + '/haplotypecaller.unique.vcf.gz ; ' + \
259 |                     'mv ' + snp_indel_outdir + '/snp_2.vcf.gz ' + snp_indel_outdir + '/clair3.unique.vcf.gz ; ' + \
260 |                     'mv ' + snp_indel_outdir + '/snp_0_1.vcf.gz ' + snp_indel_outdir + '/freebayes.haplotypecaller.vcf.gz ; ' + \
261 |                     'mv ' + snp_indel_outdir + '/snp_0_2.vcf.gz ' + snp_indel_outdir + '/freebayes.clair3.vcf.gz ; ' + \
262 |                     'mv ' + snp_indel_outdir + '/snp_1_2.vcf.gz ' + snp_indel_outdir + '/haplotypecaller.clair3.vcf.gz ; ' + \
263 |                     'mv ' + snp_indel_outdir + '/snp_0_1_2.vcf.gz ' + snp_indel_outdir + '/freebayes.haplotypecaller.clair3.vcf.gz ; ' + \
264 |                     'rm ' + snp_indel_outdir + '/*.tbi ' + snp_indel_outdir + '/snp__README'
265 |         elif len(valid_vcf_files) == 2:
266 |             missing_vcf = list(set(vcf_files) - set(valid_vcf_files))
267 |             if freebayes_outdir + '/freebayes.filt.vcf.gz' in missing_vcf:
268 |                 command = 'mv ' + snp_indel_outdir + '/snp_0.vcf.gz ' + snp_indel_outdir + '/haplotypecaller.unique.vcf.gz ; ' + \
269 |                     'mv ' + snp_indel_outdir + '/snp_1.vcf.gz ' + snp_indel_outdir + '/clair3.unique.vcf.gz ; ' + \
270 |                     'mv ' + snp_indel_outdir + '/snp_2.vcf.gz ' + snp_indel_outdir + '/freebayes.unique.vcf.gz ; ' + \
271 |                     'mv ' + snp_indel_outdir + '/snp_0_1.vcf.gz ' + snp_indel_outdir + '/haplotypecaller.clair3.vcf.gz ; ' + \
272 |                     'mv ' + snp_indel_outdir + '/snp_0_2.vcf.gz ' + snp_indel_outdir + '/freebayes.haplotypecaller.vcf.gz ; ' + \
273 |                     'mv ' + snp_indel_outdir + '/snp_1_2.vcf.gz ' + snp_indel_outdir + '/freebayes.clair3.vcf.gz ; ' + \
274 |                     'mv ' + snp_indel_outdir + '/snp_0_1_2.vcf.gz ' + snp_indel_outdir + '/freebayes.haplotypecaller.clair3.vcf.gz ; ' + \
275 |                     'rm ' + snp_indel_outdir + '/*.tbi ' + snp_indel_outdir + '/snp__README'
276 |             elif haplotypecaller_outdir + '/haplotypecaller.filt.vcf.gz' in missing_vcf:
277 |                 command = 'mv ' + snp_indel_outdir + '/snp_0.vcf.gz ' + snp_indel_outdir + '/freebayes.unique.vcf.gz ; ' + \
278 |                     'mv ' + snp_indel_outdir + '/snp_1.vcf.gz ' + snp_indel_outdir + '/clair3.unique.vcf.gz ; ' + \
279 |                     'mv ' + snp_indel_outdir + '/snp_2.vcf.gz ' + snp_indel_outdir + '/haplotypecaller.unique.vcf.gz ; ' + \
280 |                     'mv ' + snp_indel_outdir + '/snp_0_1.vcf.gz ' + snp_indel_outdir + '/freebayes.clair3.vcf.gz ; ' + \
281 |                     'mv ' + snp_indel_outdir + '/snp_0_2.vcf.gz ' + snp_indel_outdir + '/freebayes.haplotypecaller.vcf.gz ; ' + \
282 |                     'mv ' + snp_indel_outdir + '/snp_1_2.vcf.gz ' + snp_indel_outdir + '/haplotypecaller.clair3.vcf.gz ; ' + \
283 |                     'mv ' + snp_indel_outdir + '/snp_0_1_2.vcf.gz ' + snp_indel_outdir + '/freebayes.haplotypecaller.clair3.vcf.gz ; ' + \
284 |                     'rm ' + snp_indel_outdir + '/*.tbi ' + snp_indel_outdir + '/snp__README'
285 |             elif clair3_outdir + '/clair3.filt.vcf.gz'in missing_vcf:
286 |                 command = 'mv ' + snp_indel_outdir + '/snp_0.vcf.gz ' + snp_indel_outdir + '/freebayes.unique.vcf.gz ; ' + \
287 |                     'mv ' + snp_indel_outdir + '/snp_1.vcf.gz ' + snp_indel_outdir + '/haplotypecaller.unique.vcf.gz ; ' + \
288 |                     'mv ' + snp_indel_outdir + '/snp_2.vcf.gz ' + snp_indel_outdir + '/clair3.unique.vcf.gz ; ' + \
289 |                     'mv ' + snp_indel_outdir + '/snp_0_1.vcf.gz ' + snp_indel_outdir + '/freebayes.haplotypecaller.vcf.gz ; ' + \
290 |                     'mv ' + snp_indel_outdir + '/snp_0_2.vcf.gz ' + snp_indel_outdir + '/freebayes.clair3.vcf.gz ; ' + \
291 |                     'mv ' + snp_indel_outdir + '/snp_1_2.vcf.gz ' + snp_indel_outdir + '/haplotypecaller.clair3.vcf.gz ; ' + \
292 |                     'mv ' + snp_indel_outdir + '/snp_0_1_2.vcf.gz ' + snp_indel_outdir + '/freebayes.haplotypecaller.clair3.vcf.gz ; ' + \
293 |                     'rm ' + snp_indel_outdir + '/*.tbi ' + snp_indel_outdir + '/snp__README'
294 |         elif len(valid_vcf_files) == 1:
295 |             if freebayes_outdir + '/freebayes.filt.vcf.gz' in valid_vcf_files:
296 |                 command = 'mv ' + snp_indel_outdir + '/snp_0.vcf.gz ' + snp_indel_outdir + '/freebayes.unique.vcf.gz ; ' + \
297 |                     'mv ' + snp_indel_outdir + '/snp_1.vcf.gz ' + snp_indel_outdir + '/haplotypecaller.unique.vcf.gz ; ' + \
298 |                     'mv ' + snp_indel_outdir + '/snp_2.vcf.gz ' + snp_indel_outdir + '/clair3.unique.vcf.gz ; ' + \
299 |                     'mv ' + snp_indel_outdir + '/snp_0_1.vcf.gz ' + snp_indel_outdir + '/freebayes.haplotypecaller.vcf.gz ; ' + \
300 |                     'mv ' + snp_indel_outdir + '/snp_0_2.vcf.gz ' + snp_indel_outdir + '/freebayes.clair3.vcf.gz ; ' + \
301 |                     'mv ' + snp_indel_outdir + '/snp_1_2.vcf.gz ' + snp_indel_outdir + '/haplotypecaller.clair3.vcf.gz ; ' + \
302 |                     'mv ' + snp_indel_outdir + '/snp_0_1_2.vcf.gz ' + snp_indel_outdir + '/freebayes.haplotypecaller.clair3.vcf.gz ; ' + \
303 |                     'rm ' + snp_indel_outdir + '/*.tbi ' + snp_indel_outdir + '/snp__README'
304 |             elif haplotypecaller_outdir + '/haplotypecaller.filt.vcf.gz' in valid_vcf_files:
305 |                 command = 'mv ' + snp_indel_outdir + '/snp_0.vcf.gz ' + snp_indel_outdir + '/haplotypecaller.unique.vcf.gz ; ' + \
306 |                     'mv ' + snp_indel_outdir + '/snp_1.vcf.gz ' + snp_indel_outdir + '/freebayes.unique.vcf.gz ; ' + \
307 |                     'mv ' + snp_indel_outdir + '/snp_2.vcf.gz ' + snp_indel_outdir + '/clair3.unique.vcf.gz ; ' + \
308 |                     'mv ' + snp_indel_outdir + '/snp_0_1.vcf.gz ' + snp_indel_outdir + '/freebayes.haplotypecaller.vcf.gz ; ' + \
309 |                     'mv ' + snp_indel_outdir + '/snp_0_2.vcf.gz ' + snp_indel_outdir + '/haplotypecaller.clair3.vcf.gz ; ' + \
310 |                     'mv ' + snp_indel_outdir + '/snp_1_2.vcf.gz ' + snp_indel_outdir + '/freebayes.clair3.vcf.gz ; ' + \
311 |                     'mv ' + snp_indel_outdir + '/snp_0_1_2.vcf.gz ' + snp_indel_outdir + '/freebayes.haplotypecaller.clair3.vcf.gz ; ' + \
312 |                     'rm ' + snp_indel_outdir + '/*.tbi ' + snp_indel_outdir + '/snp__README'
313 |             elif clair3_outdir + '/clair3.filt.vcf.gz'in valid_vcf_files:
314 |                 command = 'mv ' + snp_indel_outdir + '/snp_0.vcf.gz ' + snp_indel_outdir + '/clair3.unique.vcf.gz ; ' + \
315 |                     'mv ' + snp_indel_outdir + '/snp_1.vcf.gz ' + snp_indel_outdir + '/freebayes.unique.vcf.gz ; ' + \
316 |                     'mv ' + snp_indel_outdir + '/snp_2.vcf.gz ' + snp_indel_outdir + '/haplotypecaller.unique.vcf.gz ; ' + \
317 |                     'mv ' + snp_indel_outdir + '/snp_0_1.vcf.gz ' + snp_indel_outdir + '/freebayes.clair3.vcf.gz ; ' + \
318 |                     'mv ' + snp_indel_outdir + '/snp_0_2.vcf.gz ' + snp_indel_outdir + '/haplotypecaller.clair3.vcf.gz ; ' + \
319 |                     'mv ' + snp_indel_outdir + '/snp_1_2.vcf.gz ' + snp_indel_outdir + '/freebayes.haplotypecaller.vcf.gz ; ' + \
320 |                     'mv ' + snp_indel_outdir + '/snp_0_1_2.vcf.gz ' + snp_indel_outdir + '/freebayes.haplotypecaller.clair3.vcf.gz ; ' + \
321 |                     'rm ' + snp_indel_outdir + '/*.tbi ' + snp_indel_outdir + '/snp__README'
322 |         run_process(command)
323 |     else:
324 |         command = 'gunzip -c ' + vcf_files[0] + ' > ' + snp_indel_outdir + '/snp_final.vcf' 
325 |         run_process(command)
326 |     
327 |     generate_tab_csv_snp_summary(read_vcf(snp_indel_outdir + '/snp_final.vcf'), snp_indel_outdir)


--------------------------------------------------------------------------------
/variantdetective/structural_variant.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This file contains code for the structural_variant subcommand.
  3 | 
  4 | Copyright (C) 2024 Phil Charron (phil.charron@inspection.gc.ca)
  5 | https://github.com/OLF-Bioinformatics/VariantDetective
  6 | """
  7 | 
  8 | import os
  9 | import sys
 10 | import io
 11 | import pandas as pd
 12 | import datetime
 13 | from .tools import get_new_filename, run_process, read_vcf, generate_tab_csv_sv_summary
 14 | 
 15 | def structural_variant(args, input_reads, output=sys.stderr):
 16 |     print(str(datetime.datetime.now().replace(microsecond=0)) + '\tStarting structural_variant tool', file=output)
 17 |     reference = get_new_filename(args.reference, args.out)   
 18 |     structural_variant_outdir = os.path.join(args.out, 'structural_variant')
 19 |     nanovar_outdir = os.path.join(structural_variant_outdir, 'nanovar')
 20 |     nanosv_outdir = os.path.join(structural_variant_outdir, 'nanosv')
 21 |     svim_outdir = os.path.join(structural_variant_outdir, 'svim')
 22 |     cutesv_outdir = os.path.join(structural_variant_outdir, 'cutesv')
 23 |     
 24 |     if not os.path.isdir(structural_variant_outdir):
 25 |         os.makedirs(structural_variant_outdir)
 26 |     if not os.path.isdir(nanovar_outdir):
 27 |         os.makedirs(nanovar_outdir)
 28 |     if not os.path.isdir(nanosv_outdir):
 29 |         os.makedirs(nanosv_outdir)
 30 |     if not os.path.isdir(svim_outdir):
 31 |         os.makedirs(svim_outdir)
 32 |     if not os.path.isdir(cutesv_outdir):
 33 |         os.makedirs(cutesv_outdir)
 34 | 
 35 |     # Run NanoVar 
 36 |     print(str(datetime.datetime.now().replace(microsecond=0)) + '\tRunning NanoVar...', file=output)
 37 |     if args.long is not None and args.data_type_sv == "pacbio":
 38 |         command = 'nanovar -t ' + str(args.threads) +  ' ' + \
 39 |             input_reads  + ' ' + \
 40 |             reference + ' ' + \
 41 |             nanovar_outdir + \
 42 |             ' -x pacbio-clr' + \
 43 |             ' -c ' + str(args.mincov_sv) + \
 44 |             ' -l ' + str(args.minlen_sv)
 45 |     else:
 46 |         command = 'nanovar -t ' + str(args.threads) +  ' ' + \
 47 |             input_reads  + ' ' + \
 48 |             reference + ' ' + \
 49 |             nanovar_outdir + \
 50 |             ' -c ' + str(args.mincov_sv) + \
 51 |             ' -l ' + str(args.minlen_sv)
 52 |     run_process(command)
 53 | 
 54 |     # Sort bam file
 55 |     print(str(datetime.datetime.now().replace(microsecond=0)) + '\tSorting BAM file...', file=output)
 56 |     command = 'samtools sort -@ ' + str(args.threads) + ' ' + \
 57 |             nanovar_outdir + '/*-mm.bam > ' + \
 58 |             structural_variant_outdir + '/alignment.sorted.bam'
 59 |     run_process(command)
 60 |     
 61 |     print(str(datetime.datetime.now().replace(microsecond=0)) + '\tIndexing sorted BAM file...', file=output)
 62 |     command = 'samtools index ' + structural_variant_outdir + '/alignment.sorted.bam'
 63 |     run_process(command)
 64 | 
 65 |     # Run NanoSV
 66 |     print(str(datetime.datetime.now().replace(microsecond=0)) + '\tRunning NanoSV...', file=output)
 67 |     command = 'samtools faidx ' + reference
 68 |     run_process(command)
 69 |     
 70 |     command = 'cut -f 1,2 ' + reference + '.fai > ' + nanosv_outdir + '/chrom.sizes'
 71 |     run_process(command)
 72 | 
 73 |     command = 'bedtools random -l 1 -g ' + nanosv_outdir + '/chrom.sizes > ' + nanosv_outdir + '/reference.bed'
 74 |     run_process(command)
 75 | 
 76 |     command = 'NanoSV -t ' + str(args.threads) + \
 77 |             ' -o ' + nanosv_outdir + '/variants.vcf ' + \
 78 |             ' -s samtools' + \
 79 |             ' -b ' + nanosv_outdir + '/reference.bed ' + \
 80 |             structural_variant_outdir + '/alignment.sorted.bam' 
 81 |     run_process(command)
 82 |     
 83 |     # Run SVIM
 84 |     print(str(datetime.datetime.now().replace(microsecond=0)) + '\tRunning SVIM...', file=output)
 85 |     command = 'svim alignment ' + svim_outdir + ' ' + \
 86 |             structural_variant_outdir + '/alignment.sorted.bam ' + \
 87 |             reference + \
 88 |             ' --min_sv_size ' + str(args.minlen_sv)
 89 |     run_process(command)
 90 |     
 91 |     command = "bcftools view -i 'QUAL >= " + str(args.minqual_sv) + "' " + \
 92 |             svim_outdir + '/variants.vcf > ' + \
 93 |             svim_outdir + '/variants.filt.vcf'
 94 |     run_process(command)
 95 | 
 96 |     # Run CuteSV
 97 |     print(str(datetime.datetime.now().replace(microsecond=0)) + '\tRunning CuteSV...', file=output)
 98 |     command = 'cuteSV ' + structural_variant_outdir + '/alignment.sorted.bam ' + \
 99 |             reference + ' ' + \
100 |             cutesv_outdir + '/variants.vcf ' + \
101 |             cutesv_outdir + \
102 |             ' -t ' + str(args.threads) + \
103 |             ' -s ' + str(args.mincov_sv) + \
104 |             ' -l ' + str(args.minlen_sv) + \
105 |             ' -L -1'  
106 |     run_process(command)
107 |     
108 |     # Run SURVIVOR
109 |     print(str(datetime.datetime.now().replace(microsecond=0)) + '\tRunning SURVIVOR...', file=output)
110 |     command = 'ls ' + nanovar_outdir + '/*pass.vcf ' + \
111 |             cutesv_outdir + '/variants.vcf ' + \
112 |             nanosv_outdir + '/variants.vcf ' + \
113 |             svim_outdir + '/variants.filt.vcf > ' + \
114 |             structural_variant_outdir + '/vcf_list'
115 |     run_process(command)
116 | 
117 |     command = 'SURVIVOR merge ' + structural_variant_outdir + \
118 |             '/vcf_list 1000 ' + str(args.sv_consensus) + ' 1 1 0 ' + str(args.minlen_sv) + ' ' \
119 |             + structural_variant_outdir + '/combined_sv.vcf'
120 |     run_process(command)
121 | 
122 |     generate_tab_csv_sv_summary(read_vcf(structural_variant_outdir + '/combined_sv.vcf'), structural_variant_outdir)
123 |     command = 'rm ' + structural_variant_outdir + '/vcf_list'
124 |     run_process(command)
125 |     
126 |     return structural_variant_outdir + '/alignment.sorted.bam'
127 | 


--------------------------------------------------------------------------------
/variantdetective/tools.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This file contains miscellaneous tools that are used in various
  3 | components of VariantDetective.
  4 | 
  5 | Copyright (C) 2024 Phil Charron (phil.charron@inspection.gc.ca)
  6 | https://github.com/OLF-Bioinformatics/VariantDetective
  7 | """
  8 | 
  9 | import gzip
 10 | import io
 11 | import os
 12 | import pandas as pd
 13 | import statistics
 14 | from subprocess import Popen, PIPE, STDOUT
 15 | 
 16 | def get_fasta_info(open_func, file):
 17 |     count = 0 
 18 |     with open_func(file, 'rt') as seq_file:
 19 |         for line in seq_file:
 20 |             if line.startswith('>'):
 21 |                 count += 1
 22 |     return count
 23 | 
 24 | 
 25 | def get_fastq_info(open_func, file):
 26 |     num_lines = sum(1 for i in open_func(file, 'rb'))
 27 |     if num_lines % 4 != 0:
 28 |         raise ValueError('File might be corrupted, unexpected number of lines was found')
 29 |     else:
 30 |         with open_func(file, 'rt') as seq_file:
 31 |             length_list = []
 32 |             for i in range(int(num_lines/4)):
 33 |                 next(seq_file)
 34 |                 length_list.append(len(next(seq_file).strip()))
 35 |                 next(seq_file)
 36 |                 next(seq_file)
 37 |         count = int(num_lines/4)
 38 |         min_value = min(length_list)
 39 |         max_value = max(length_list)
 40 |         median = int(statistics.median(length_list))
 41 |         average = int(statistics.mean(length_list))
 42 |         return count, min_value, max_value, median, average
 43 | 
 44 | 
 45 | def get_input_type(open_func, file):
 46 |     with open_func(file, 'rt') as seq_file:
 47 |         try:
 48 |             first_char = seq_file.read(1)
 49 |         except UnicodeDecodeError:
 50 |             first_char = ''
 51 |     if first_char == '>':
 52 |         return 'FASTA'
 53 |     elif first_char == '@':
 54 |         return 'FASTQ'
 55 |     else:
 56 |         raise ValueError('File is not FASTA or FASTQ')
 57 | 
 58 | def get_new_filename(file, out):
 59 |     basename = os.path.basename(file)
 60 |     new_filename = os.path.join(out, basename)
 61 |     return new_filename
 62 | 
 63 | 
 64 | def get_open_function(file_extension):
 65 |     if file_extension == ".gz":
 66 |         open_func = gzip.open
 67 |     else:
 68 |         open_func = open
 69 |     return open_func
 70 | 
 71 | def run_process(command):
 72 |     process = Popen([command],
 73 |                     universal_newlines=True, stdout=PIPE, stderr=PIPE, shell=True, executable='/bin/bash')
 74 |     output, error = process.communicate()
 75 |     
 76 |     if process.returncode != 0:
 77 |         raise Exception(error)
 78 | 
 79 | def read_vcf(path):
 80 |     with open(path, 'r') as f:
 81 |         lines = [l for l in f if not l.startswith('##')]
 82 |     return pd.read_csv(
 83 |         io.StringIO(''.join(lines)),
 84 |         dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
 85 |             'QUAL': str, 'FILTER': str, 'INFO': str},
 86 |         sep='\t'
 87 |     ).rename(columns={'#CHROM': 'CHROM'})
 88 | 
def generate_tab_csv_snp_summary(vcf, output_dir):
    """Write per-variant and summary tables for a SNP/indel VCF DataFrame.

    Three files are written into ``output_dir``: ``snp_final.csv`` and
    ``snp_final.tab`` (columns CHROM, POS, TYPE, REF, ALT, SUPPORT) and
    ``snp_final_summary.txt`` (count per variant type plus TOTAL). An
    empty ``vcf`` produces header-only tables and an all-zero summary.

    Columns of ``vcf`` are addressed by position: 0=CHROM, 1=POS, 3=REF,
    4=ALT, 7=INFO, 8=FORMAT keys, 9=sample values.
    NOTE(review): presumably ``vcf`` comes from ``read_vcf`` with a default
    0..n-1 index, since rows are addressed both by label and by position
    with the same ``i`` - confirm against callers.
    """
    if len(vcf) > 0:
        CHROM = vcf.iloc[:,0]
        POS = vcf.iloc[:,1]
        # FORMAT keys and the matching sample values, split on ':' so that
        # the same column index addresses a key and its value.
        FORMAT_ID = vcf.iloc[:,8].str.split(':', expand=True)
        FORMAT = vcf.iloc[:,9].str.split(':', expand=True)
        INFO = vcf.iloc[:,7].str.split(';', expand=True)
        REF = vcf.iloc[:,3]
        ALT = vcf.iloc[:,4]
        SUPPORT = pd.Series([None] * len(vcf), name='SUPPORT')
        # The variant type is read from the value of the 41st INFO field.
        # NOTE(review): assumes that field is always "TYPE=..." - confirm
        # against the tool that produces this VCF.
        TYPE = INFO.iloc[:,40].str.split('=',expand=True).iloc[:,1]
        TYPE.name = 'TYPE'
        for i, v in TYPE.items():
            try:
                # Preferred source of read support: the AD value, whose
                # first two comma-separated entries are reported as REF/ALT.
                AD_index =list(FORMAT_ID.iloc[i]).index('AD')
                RAW_SUPPORT = FORMAT.iloc[i,AD_index].split(",")
                SUPPORT[i] = "REF=" + RAW_SUPPORT[0] + ";ALT=" + RAW_SUPPORT[1]
            except ValueError:
                # 'AD' is not among the FORMAT keys (list.index raised):
                # derive the counts from AF and DP instead.
                AF_index =list(FORMAT_ID.iloc[i]).index('AF')
                DP_index =list(FORMAT_ID.iloc[i]).index('DP')
                REF_COUNT = round(int(FORMAT.iloc[i,DP_index]) - (float(FORMAT.iloc[i,AF_index]) * int(FORMAT.iloc[i,DP_index])))
                ALT_COUNT = round(float(FORMAT.iloc[i,AF_index]) * int(FORMAT.iloc[i,DP_index]))
                SUPPORT[i] = "REF=" + str(REF_COUNT) + ";ALT=" + str(ALT_COUNT)
            # No type value was found for this row: infer it from the
            # REF/ALT lengths.
            if (v == None):
                if (len(REF[i]) < len(ALT[i])):
                    TYPE[i] = 'ins'
                elif (len(REF[i]) > len(ALT[i])):
                    TYPE[i] = 'del'
                elif (len(REF[i]) == 1):
                    TYPE[i] = 'snp'
                else:
                    TYPE[i] = 'complex'
        TAB_DATA = pd.concat([CHROM, POS, TYPE, REF, ALT, SUPPORT], axis=1)
        # Summary counts compare against lowercase type names; TOTAL is the
        # number of rows.
        VARIANT_TYPES = pd.Series(['SNP', 'DEL', 'INS', 'MNP', 'COMPLEX','TOTAL'], name = 'TYPE')
        VARIANT_DATA = pd.Series([(TYPE=='snp').sum(), (TYPE=='del').sum(), (TYPE=='ins').sum(), (TYPE=='mnp').sum(), (TYPE=='complex').sum(), len(TYPE)], name = 'COUNT')
        SUMMARY_DATA = pd.concat([VARIANT_TYPES, VARIANT_DATA], axis=1)
        TAB_DATA.to_csv(output_dir + '/snp_final.csv', index=False)
        TAB_DATA.to_csv(output_dir + '/snp_final.tab', sep='\t', index=False)
        SUMMARY_DATA.to_csv(output_dir + '/snp_final_summary.txt', sep='\t', index=False)
    else:
        # No variants: still emit all three files so downstream consumers
        # find header-only tables and a zero-filled summary.
        column_names_str= "CHROM POS TYPE REF ALT SUPPORT"
        column_names = column_names_str.split()
        TAB_DATA = pd.DataFrame(columns=column_names)
        TAB_DATA.to_csv(output_dir + '/snp_final.csv', index=False)
        TAB_DATA.to_csv(output_dir + '/snp_final.tab', sep='\t', index=False)
        data = {"TYPE": ["SNP", "DEL", "INS", "MNP", "COMPLEX", "TOTAL"],
                "COUNT": [0, 0, 0, 0, 0, 0]}
        SUMMARY_DATA = pd.DataFrame(data)
        SUMMARY_DATA.to_csv(output_dir + '/snp_final_summary.txt', sep='\t', index=False)
138 | 
139 | 
def _translocation_mate(alt):
    """Return the bracket-delimited mate locus of a breakend ALT, or ''."""
    # ']' is checked before '[' to keep the original precedence.
    for bracket in (']', '['):
        if bracket in alt:
            return bracket + alt.split(bracket)[1] + bracket
    return ''


def generate_tab_csv_sv_summary(vcf, output_dir):
    """Write per-variant and summary tables for a structural-variant VCF.

    Three files are written into ``output_dir``: ``combined_sv.csv`` and
    ``combined_sv.tab`` (columns REF_CHROM, REF_START, REF_STOP, SIZE,
    TYPE, INFO) and ``combined_sv_summary.txt`` (count per SV type plus
    TOTAL). An empty ``vcf`` produces header-only tables and an all-zero
    summary instead of failing on the positional INFO lookups.

    Columns of ``vcf`` are addressed by position: 0=chromosome, 1=start,
    4=ALT, 7=INFO. INFO is split on ';' and read positionally: field 2
    minus its first 6 characters is the size, field 3 minus its first 7
    characters is the type and field 6 minus its first 4 characters is the
    end position.
    NOTE(review): this presumably matches the INFO layout of the merged
    VCF (``SVLEN=``, ``SVTYPE=``, ``END=``) - confirm against its producer.

    The INFO output column is only filled for rows of type ``TRA``, where
    it holds the bracketed mate locus taken from ALT.
    """
    tab_columns = ['REF_CHROM', 'REF_START', 'REF_STOP', 'SIZE', 'TYPE', 'INFO']
    VARIANT_TYPES = pd.Series(['TRANSLOCATION', 'INVERSION', 'DELETION', 'INSERTION', 'DUPLICATION','TOTAL'], name = 'TYPE')

    if len(vcf) == 0:
        # Splitting an empty INFO column yields no fields, so the positional
        # lookups below would raise IndexError.
        TAB_DATA = pd.DataFrame(columns=tab_columns)
        VARIANT_DATA = pd.Series([0, 0, 0, 0, 0, 0], name = 'COUNT')
    else:
        # Split INFO once and reuse the fields for END, SIZE and TYPE.
        info_fields = vcf.iloc[:,7].str.split(';', expand=True)
        CHROM = vcf.iloc[:,0].rename('REF_CHROM')
        START = vcf.iloc[:,1].rename('REF_START')
        END = info_fields.iloc[:,6].str[4:].rename('REF_STOP')
        SIZE = info_fields.iloc[:,2].str[6:].rename('SIZE')
        TYPE = info_fields.iloc[:,3].str[7:].rename('TYPE')
        mates = [_translocation_mate(alt) if sv_type == "TRA" else ''
                 for alt, sv_type in zip(vcf.iloc[:,4], TYPE)]
        INFO = pd.Series(mates, index=vcf.index, name='INFO')

        TAB_DATA = pd.concat([CHROM, START, END, SIZE, TYPE, INFO], axis=1)
        VARIANT_DATA = pd.Series([(TYPE=='TRA').sum(), (TYPE=='INV').sum(), (TYPE=='DEL').sum(), (TYPE=='INS').sum(), (TYPE=='DUP').sum(), len(TYPE)], name = 'COUNT')

    SUMMARY_DATA = pd.concat([VARIANT_TYPES, VARIANT_DATA], axis=1)

    TAB_DATA.to_csv(output_dir + '/combined_sv.csv', index=False)
    TAB_DATA.to_csv(output_dir + '/combined_sv.tab', sep='\t', index=False)
    SUMMARY_DATA.to_csv(output_dir + '/combined_sv_summary.txt', sep='\t', index=False)


--------------------------------------------------------------------------------
/variantdetective/validate_inputs.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This file contains code needed to validate the inputs used
  3 | to ensure they are in the right format for VariantDetective.
  4 | 
  5 | Copyright (C) 2024 Phil Charron (phil.charron@inspection.gc.ca)
  6 | https://github.com/OLF-Bioinformatics/VariantDetective
  7 | """
  8 | 
  9 | import datetime
 10 | import os
 11 | import sys
 12 | 
 13 | from .combine_variants import combine_variants
 14 | from .simulate import simulate
 15 | from .snp_indel import snp_indel
 16 | from .structural_variant import structural_variant
 17 | from .tools import get_fasta_info, get_fastq_info, get_input_type, get_new_filename, get_open_function
 18 | 
 19 | def validate_inputs(args, output=sys.stderr):
 20 |     input_file_type = []
 21 |     snp_vcf_list = []
 22 |     sv_vcf_list = []
 23 |     #actual_file_type = []
 24 |    
 25 |     if 'genome' in args and args.genome is not None:
 26 |         genome_file = get_new_filename(args.genome, args.out)
 27 |         input_file_type.append("Genomic FASTA")
 28 |         #actual_file_type.append(check_input(genome_file, output))
 29 |         #actual_file_type.append("Genomic FASTA")
 30 |     if 'long' in args and args.long is not None:
 31 |         long_file = get_new_filename(args.long, args.out)
 32 |         input_file_type.append("Long-read FASTQ")
 33 |         #actual_file_type.append(check_input(long_file, output))
 34 |         #actual_file_type.append("Long-read FASTQ")
 35 |     if 'short1' in args and args.short1 is not None:
 36 |         short1_file = get_new_filename(args.short1, args.out)
 37 |         input_file_type.append("Short-read FASTQ")
 38 |         #actual_file_type.append(check_input(short1_file, output))
 39 |         #actual_file_type.append("Short-read FASTQ")
 40 |     if 'short2' in args and args.short2 is not None:
 41 |         short2_file = get_new_filename(args.short2, args.out)
 42 |         input_file_type.append("Short-read FASTQ")
 43 |         #actual_file_type.append(check_input(short2_file, output))
 44 |         #actual_file_type.append("Short-read FASTQ")
 45 |     if 'snp_vcf' in args and args.snp_vcf is not None:
 46 |         for vcf_file in args.snp_vcf:
 47 |             snp_vcf_list.append(get_new_filename(vcf_file, args.out))
 48 |         input_file_type.append("SNP VCF")
 49 |     if 'sv_vcf' in args and args.sv_vcf is not None:
 50 |         for vcf_file in args.sv_vcf:
 51 |             sv_vcf_list.append(get_new_filename(vcf_file, args.out))
 52 |         input_file_type.append("SV VCF")
 53 | 
 54 | 
 55 |     #if input_file_type == actual_file_type:
 56 |     if 'Genomic FASTA' in input_file_type:
 57 |         print(str(datetime.datetime.now().replace(microsecond=0)) + '\tStarting genome pipeline')
 58 |         sim_file = simulate(args, genome_file, output=sys.stderr)
 59 |         if args.subparser_name == "structural_variant":
 60 |             long_bam_file = structural_variant(args, sim_file, output=sys.stderr)
 61 |         if args.subparser_name == "snp_indel":
 62 |             snp_indel(args, sim_file, output=sys.stderr)
 63 |         if args.subparser_name == "all_variants":
 64 |             long_bam_file = structural_variant(args, sim_file, output=sys.stderr)
 65 |             snp_indel(args, long_bam_file, output=sys.stderr)
 66 |     elif 'Long-read FASTQ' in input_file_type and 'Short-read FASTQ' in input_file_type:
 67 |         print(str(datetime.datetime.now().replace(microsecond=0)) + '\tStarting short and long read pipeline')
 68 |         short_inputs = [short1_file, short2_file]
 69 |         long_bam_file = structural_variant(args, long_file, output=sys.stderr)
 70 |         snp_indel(args, short_inputs, output=sys.stderr)
 71 |     elif 'Long-read FASTQ' in input_file_type:
 72 |         print(str(datetime.datetime.now().replace(microsecond=0)) + '\tStarting long read pipeline')
 73 |         long_bam_file = structural_variant(args, long_file, output=sys.stderr)
 74 |     elif 'Short-read FASTQ' in input_file_type:
 75 |         print(str(datetime.datetime.now().replace(microsecond=0)) + '\tStarting short read pipeline')
 76 |         short_inputs = [short1_file, short2_file]
 77 |         snp_indel(args, short_inputs, output=sys.stderr)
 78 |     elif 'SNP VCF' or 'SV VCF' in input_file_type:
 79 |         print(str(datetime.datetime.now().replace(microsecond=0)) + '\tStarting combine variants tool')
 80 |         vcf_lists = [snp_vcf_list, sv_vcf_list]
 81 |         combine_variants(args, vcf_lists, output=sys.stderr)
 82 |     
 83 |     #else:
 84 |     #    for i in range(len(input_file_type)):
 85 |     #        if input_file_type[i] == "Long-read FASTQ":
 86 |     #            if actual_file_type[i] == "Genomic FASTA":
 87 |     #                message = 'Input file was supposed to be long-read FASTQ but genomic FASTA was detected.'
 88 |     #            elif actual_file_type[i] == "Short-read FASTQ":
 89 |     #                message = 'Input file was supposed to be long-read FASTQ but short-read FASTQ was detected.'
 90 |     #        elif input_file_type[i] == "Genomic FASTA":
 91 |     #            if actual_file_type[i] == "Long-read FASTQ":
 92 |     #                message = 'Input file was supposed to be genomic FASTA but long-read FASTQ was detected.'
 93 |     #            elif actual_file_type[i] == "Short-read FASTQ":
 94 |     #                message = 'Input file was supposed to be genomic FASTA but short-read FASTQ was detected.'
 95 |     #        elif input_file_type[i] == "Short-read FASTQ":
 96 |     #            if actual_file_type[i] == "Long-read FASTQ":
 97 |     #                message = 'Input file was supposed to be short-read FASTQ but long-read FASTQ was detected.'
 98 |     #            elif actual_file_type[i] == "Genomic FASTA":
 99 |     #                message = 'Input file was supposed to be short-read FASTQ but genomic FASTA was detected.'
100 |     #        message = message + ' Please verify inputs or use appropriate tool and parameters.'
101 |     #        raise Exception(message)
102 | 
def check_input(file, output=sys.stderr):
    """Inspect ``file`` and report which kind of input it contains.

    The opener is chosen from the file extension and the format from the
    first character of the file. FASTQ files are labelled long-read when
    the average read length exceeds 301 and short-read otherwise; the
    label and the number of reads are printed to ``output``. FASTA files
    are labelled as genomic.

    Returns:
        ``"Long-read FASTQ"``, ``"Short-read FASTQ"`` or ``"Genomic FASTA"``.

    Raises:
        Exception: if the average read length of a FASTQ file is 0.
    """
    extension = os.path.splitext(file)[1]
    open_func = get_open_function(extension)
    file_type = get_input_type(open_func, file)

    if file_type == 'FASTA':
        # The record count is computed but not used.
        count = get_fasta_info(open_func, file)
        actual_file_type = "Genomic FASTA"
    elif file_type == 'FASTQ':
        fastq_stats = get_fastq_info(open_func, file)
        count = fastq_stats[0]
        average = fastq_stats[4]
        if not average > 0:
            raise Exception('Average length of reads is 0')
        if average > 301:
            actual_file_type = "Long-read FASTQ"
            print("Input file type:\tLong-read FASTQ", file=output)
        else:
            actual_file_type = "Short-read FASTQ"
            print("Input file type:\tShort-read FASTQ", file=output)
        print("Number of reads:\t{}".format(count), file=output)

    return actual_file_type
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 
131 | 


--------------------------------------------------------------------------------
/variantdetective/version.py:
--------------------------------------------------------------------------------
1 | __version__ = '1.0.1'
2 | 


--------------------------------------------------------------------------------