├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── build.sh ├── build_jar.sh ├── build_no_iris.sh ├── igv_jasmine ├── jasmine ├── jasmine.jar ├── jasmine_igv.jar ├── jasmine_iris.jar ├── jasmine_split.jar ├── pipeline ├── README.md ├── align_single.snakefile ├── call_svs_sniffles_single.snakefile ├── cut_regions_bam.snakefile ├── data.yaml ├── fix_sam.py ├── jasmine_pre.snakefile ├── main_chr_filter.py ├── marcc_config.yaml ├── pipeline.snakefile ├── pipelineoverview.svg ├── rockfish_config.yaml ├── sv_sizes.py ├── sv_supports.py ├── tools.yaml └── utils.py ├── plot_merges.py ├── run.sh ├── smalltest.sh ├── split_jasmine ├── src ├── AddGenotypes.java ├── BndVcfEntry.java ├── ChrNameNormalization.java ├── DuplicationsToInsertions.java ├── Forest.java ├── GenomeQuery.java ├── IgvScreenshotMaker.java ├── InsertionsToDuplications.java ├── KDTree.java ├── Main.java ├── MarkSpecificCalls.java ├── NormalizeTypes.java ├── Overlap.java ├── ParallelMerger.java ├── PipelineManager.java ├── PreSplit.java ├── Settings.java ├── StringUtils.java ├── TestKDTree.java ├── Variant.java ├── VariantInput.java ├── VariantMergeTest.java ├── VariantMerger.java ├── VariantOutput.java ├── VcfEntry.java ├── VcfHeader.java ├── VcfHeaderTest.java └── VisualizationPrep.java └── test_data ├── a.vcf ├── b.vcf └── c.vcf /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | .classpath 3 | .project 4 | .settings/ 5 | *.class 6 | *.vcf 7 | *.vcf.graph 8 | *.bed 9 | *.txt 10 | # IDEA setup 11 | .idea/ 12 | *.iml 13 | src/FixStrands.java 14 | data/ 15 | output/ 16 | out.log 17 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "Iris"] 2 | path = Iris 3 | url = https://github.com/mkirsche/Iris.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Melanie Kirsche 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/jasminesv/README.html) 2 | [![European Galaxy server](https://img.shields.io/badge/usegalaxy-.eu-brightgreen?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAASCAYAAABB7B6eAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAACXBIWXMAAAsTAAALEwEAmpwYAAACC2lUWHRYTUw6Y29tLmFkb2JlLnhtcAAAAAAAPHg6eG1wbWV0YSB4bWxuczp4PSJhZG9iZTpuczptZXRhLyIgeDp4bXB0az0iWE1QIENvcmUgNS40LjAiPgogICA8cmRmOlJERiB4bWxuczpyZGY9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyMiPgogICAgICA8cmRmOkRlc2NyaXB0aW9uIHJkZjphYm91dD0iIgogICAgICAgICAgICB4bWxuczp0aWZmPSJodHRwOi8vbnMuYWRvYmUuY29tL3RpZmYvMS4wLyI+CiAgICAgICAgIDx0aWZmOlJlc29sdXRpb25Vbml0PjI8L3RpZmY6UmVzb2x1dGlvblVuaXQ+CiAgICAgICAgIDx0aWZmOkNvbXByZXNzaW9uPjE8L3RpZmY6Q29tcHJlc3Npb24+CiAgICAgICAgIDx0aWZmOk9yaWVudGF0aW9uPjE8L3RpZmY6T3JpZW50YXRpb24+CiAgICAgICAgIDx0aWZmOlBob3RvbWV0cmljSW50ZXJwcmV0YXRpb24+MjwvdGlmZjpQaG90b21ldHJpY0ludGVycHJldGF0aW9uPgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KD0UqkwAAAn9JREFUOBGlVEuLE0EQruqZiftwDz4QYT1IYM8eFkHFw/4HYX+GB3/B4l/YP+CP8OBNTwpCwFMQXAQPKtnsg5nJZpKdni6/6kzHvAYDFtRUT71f3UwAEbkLch9ogQxcBwRKMfAnM1/CBwgrbxkgPAYqlBOy1jfovlaPsEiWPROZmqmZKKzOYCJb/AbdYLso9/9B6GppBRqCrjSYYaquZq20EUKAzVpjo1FzWRDVrNay6C/HDxT92wXrAVCH3ASqq5VqEtv1WZ13Mdwf8LFyyKECNbgHHAObWhScf4Wnj9CbQpPzWYU3UFoX3qkhlG8AY2BTQt5/EA7qaEPQsgGLWied0A8VKrHAsCC1eJ6EFoUd1v6GoPOaRAtDPViUr/wPzkIFV9AaAZGtYB568VyJfijV+ZBzlVZJ3W7XHB2RESGe4opXIGzRTdjcAupOK09RA6kzr1NTrTj7V1ugM4VgPGWEw+e39CxO6JUw5XhhKihmaDacU2GiR0Ohcc4cZ+Kq3AjlEnEeRSazLs6/9b/kh4eTC+hngE3QQD7Yyclxsrf3cpxsPXn+cFdenF9aqlBXMXaDiEyfyfawBz2RqC/O9WF1ysacOpytlUSoqNrtfbS642+4D4CS9V3xb4u8P/ACI4O810efRu6KsC0QnjHJGaq4IOGUjWTo/YDZDB3xSIxcGyNlWcTucb4T3in/3IaueNrZyX0lGOrWndstOr+w21UlVFokILjJLFhPukbVY8OmwNQ3nZgNJNmKDccusSb4UIe+gtkI+9/bSLJDjqn763f5CQ5TLApmICkqwR0QnUPKZFIUnoozWcQuRbC0Km02knj0tPYx63furGs3x/iPnz83zJDVNtdP3QAAAABJRU5ErkJggg==)](https://usegalaxy.eu/root?tool_id=jasminesv) 3 | 4 | 5 | 6 | 7 | # Jasmine 8 | 9 | JASMINE: Jointly Accurate Sv Merging with Intersample Network Edges 10 | 11 | Version 1.1.5 12 | 13 | This tool is used to merge structural variants (SVs) across samples. Each sample has a number of SV calls, consisting of position information (chromosome, start, end, length), type and strand information, and a number of other values. Jasmine represents the set of all SVs across samples as a network, and uses a modified minimum spanning forest algorithm to determine the best way of merging the variants such that each merged variant represents a set of analogous variants occurring in different samples. 14 | 15 | 16 | ## Conda Installation 17 | 18 | The recommended installation method is through [bioconda](https://bioconda.github.io/). 19 | 20 | Conda installation command (typically takes under a minute to install): 21 | 22 | ``` 23 | conda config --add channels bioconda 24 | conda config --add channels conda-forge 25 | conda install jasminesv 26 | ``` 27 | 28 | 29 | ## Instructions for building from source 30 | 31 | When running Jasmine, one of the preprocessing options is to run Iris, a tool which refines the sequences and breakpoints of insertions in datasets with high-error reads.
Iris depends on samtools, minimap2, and racon by default, which can be installed separately and either added to your path or pointed to with the `iris_args` parameter. Once these dependencies are installed (or if running Jasmine without Iris preprocessing), Jasmine can be built with the following command: 32 | 33 | ``` 34 | path_to_jasmine_repo/build_jar.sh 35 | ``` 36 | 37 | 38 | ## Instructions for running 39 | 40 | After building the jar file, Jasmine can be run with the executable file `jasmine`, which will be in the main folder of this repository if building from source, or in the condabin folder if installed through conda. Running it with no parameters will print a usage menu describing the required and optional arguments. 41 | 42 | 43 | ## Demo Dataset 44 | To run Jasmine on HiFi data from the HG002 trio, run the following commands (typically takes about a minute to download and under five minutes to run on a modern desktop): 45 | ``` 46 | wget http://data.schatz-lab.org/jasmine/HG002Trio/UnmergedVCFs/HG002vGRCh38_wm_50md_PBCCS_sniffles.s2l20.refined.nSVtypes.ism.vcf.gz 47 | wget http://data.schatz-lab.org/jasmine/HG002Trio/UnmergedVCFs/HG003vGRCh38_wm_50md_PBCCS_sniffles.s2l20.refined.nSVtypes.ism.vcf.gz 48 | wget http://data.schatz-lab.org/jasmine/HG002Trio/UnmergedVCFs/HG004vGRCh38_wm_50md_PBCCS_sniffles.s2l20.refined.nSVtypes.ism.vcf.gz 49 | wget http://data.schatz-lab.org/jasmine/HG002Trio/HG002Trio_HiFi.merged.vcf.gz 50 | gunzip * 51 | ls *vGRCh38_wm_50md_PBCCS_sniffles.s2l20.refined.nSVtypes.ism.vcf > filelist.txt 52 | jasmine file_list=filelist.txt out_file=merged.vcf 53 | jasmine --dup_to_ins --postprocess_only out_file=merged.vcf 54 | ``` 55 | 56 | The output file merged.vcf should then exactly match the contents of HG002Trio_HiFi.merged.vcf. 57 | 58 | 59 | ## Optimized SV Inference Pipeline 60 | 61 | Jasmine is offered as standalone software and will accurately merge SV calls from any SV caller, including short-read callers. However, for best results when calling SVs from genomic long reads (PacBio CLR, PacBio HiFi, or Oxford Nanopore), we recommend using the following optimized pipeline to obtain population-scale SV calls from FASTQ files. This pipeline is provided as a [Snakemake pipeline](https://github.com/mkirsche/Jasmine/tree/master/pipeline). 62 | 63 | 64 | ![Jasmine SV Inference Pipeline](https://github.com/mkirsche/Jasmine/blob/master/pipeline/pipelineoverview.svg) 65 | 66 | 67 | 68 | ## IGV visualization module 69 | 70 | Jasmine also includes a module for automating the creation of [IGV](http://software.broadinstitute.org/software/igv/) screenshots of variants of interest. It can be run through the `igv_jasmine` executable file. Running it with no parameters will print a usage menu describing the required and optional arguments; at minimum it requires the following: 71 | - BAM files from which variants were called in each sample 72 | - The reference genome 73 | - The merged VCF file, or a BED file with regions of interest 74 | 75 | Running this module creates a folder which will store IGV screenshots for each variant (optionally filtered based on the command line parameters), and populates that folder with a .bat file, a script which can be run through IGV by selecting Tools -> Run Batch Script and navigating to the file. After running this script, the folder containing the .bat file will also include images of the regions surrounding each variant of interest.
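For reference, an IGV batch script is just a plain-text list of line-based IGV commands (`new`, `genome`, `load`, `snapshotDirectory`, `goto`, `snapshot`). The Python sketch below, using hypothetical file names and a deliberately simplified subset of the options the module supports, illustrates how such a .bat file can be generated for a few regions of interest:

```
# Minimal sketch of writing an IGV batch script (hypothetical inputs;
# simplified relative to what IgvScreenshotMaker actually emits).
regions = [("chr1", 1000000, 1000500), ("chr2", 5000000, 5002000)]
bams = ["HG002.bam", "HG003.bam"]  # one BAM per sample
padding = 100  # bases of context shown around each variant

with open("igv_screenshots.bat", "w") as bat:
    print("new", file=bat)                           # reset the IGV session
    print("genome GRCh38.fa", file=bat)              # load the reference genome
    for bam in bams:
        print(f"load {bam}", file=bat)               # one alignment track per sample
    print("snapshotDirectory igv_screenshots", file=bat)
    for chrom, start, end in regions:
        print(f"goto {chrom}:{start - padding}-{end + padding}", file=bat)
        print(f"snapshot {chrom}_{start}_{end}.png", file=bat)
```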
76 | 77 | 78 | ## User Manual 79 | 80 | The user manual with detailed information about input/output files and command line arguments can be found here: https://github.com/mkirsche/Jasmine/wiki/Jasmine-User-Manual 81 | 82 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | if [ "$(uname -s)" = 'Linux' ]; then 2 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 3 | else 4 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 5 | fi 6 | 7 | WORKINGDIR=`pwd` 8 | 9 | cd $BINDIR 10 | git submodule update --init --recursive 11 | cd $WORKINGDIR 12 | $BINDIR/Iris/build.sh 13 | $BINDIR/Iris/rebuild_default_external.sh 14 | 15 | javac -cp $BINDIR/Iris/src $BINDIR/src/*.java 16 | -------------------------------------------------------------------------------- /build_jar.sh: -------------------------------------------------------------------------------- 1 | if [ "$(uname -s)" = 'Linux' ]; then 2 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 3 | else 4 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 5 | fi 6 | 7 | git submodule update $BINDIR/Iris 8 | 9 | irisjar='' 10 | if [ $# -eq 1 ] 11 | then 12 | echo "Iris jar: " $1 13 | irisjar=$1 14 | else 15 | $BINDIR/Iris/build_jar.sh 16 | irisjar=$BINDIR/Iris/iris.jar 17 | fi 18 | 19 | cp $irisjar $BINDIR/jasmine_iris.jar 20 | 21 | cd $BINDIR/src 22 | javac -cp $BINDIR/jasmine_iris.jar *.java 23 | jar -c -e Main -f jasmine.jar *.class 24 | jar -c -e PreSplit -f jasmine_split.jar *.class 25 | jar -c -e IgvScreenshotMaker -f jasmine_igv.jar *.class 26 | mv jasmine.jar $BINDIR 27 | mv jasmine_igv.jar $BINDIR 28 | mv jasmine_split.jar $BINDIR 29 | cd $BINDIR 30 | -------------------------------------------------------------------------------- /build_no_iris.sh: -------------------------------------------------------------------------------- 1 | if [ "$(uname -s)" = 'Linux' ]; then 2 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 3 | else 4 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 5 | fi 6 | 7 | if [ ! 
-d $BINDIR/Iris/src ] 8 | then 9 | git submodule update --init --remote Iris 10 | fi 11 | 12 | javac -cp $BINDIR/Iris/src $BINDIR/src/*.java 13 | 14 | -------------------------------------------------------------------------------- /igv_jasmine: -------------------------------------------------------------------------------- 1 | # Script for running Jasmine's IgvScreenshotMaker 2 | if [ "$(uname -s)" = 'Linux' ]; then 3 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 4 | else 5 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 6 | fi 7 | 8 | java -jar $BINDIR/jasmine_igv.jar "${@:1}" 9 | 10 | -------------------------------------------------------------------------------- /jasmine: -------------------------------------------------------------------------------- 1 | # Script for running Jasmine 2 | if [ "$(uname -s)" = 'Linux' ]; then 3 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 4 | else 5 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 6 | fi 7 | 8 | java -cp $BINDIR/jasmine_iris.jar:$BINDIR/jasmine.jar Main iris_args=samtools_path=samtools,racon_path=racon,minimap_path=minimap2 "${@:1}" 9 | 10 | -------------------------------------------------------------------------------- /jasmine.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkirsche/Jasmine/a4f6988eeb191c1f8cf2e8d73869b6a5ffc14665/jasmine.jar -------------------------------------------------------------------------------- /jasmine_igv.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkirsche/Jasmine/a4f6988eeb191c1f8cf2e8d73869b6a5ffc14665/jasmine_igv.jar -------------------------------------------------------------------------------- /jasmine_iris.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkirsche/Jasmine/a4f6988eeb191c1f8cf2e8d73869b6a5ffc14665/jasmine_iris.jar -------------------------------------------------------------------------------- /jasmine_split.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkirsche/Jasmine/a4f6988eeb191c1f8cf2e8d73869b6a5ffc14665/jasmine_split.jar -------------------------------------------------------------------------------- /pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Automated pipeline for alignment and SV calling in long-read datasets 2 | 3 | ## Installation 4 | Ensure that `snakemake` is installed. 5 | Clone the repository via `git clone https://github.com/mkirsche/Jasmine.git` into a general location (e.g., `/path/to/pipelines`) on a computing cluster. 6 | 7 | ## Experiment run 8 | Choose and `cd` into a dedicated experiment folder (e.g., `experiment`). 9 | Create symlinks to all `snakefile` and `py` files from the `Jasmine/pipeline` master folder: 10 | ```bash 11 | ln -s /path/to/pipelines/Jasmine/pipeline/*snakefile . 12 | ln -s /path/to/pipelines/Jasmine/pipeline/*py . 13 | ``` 14 | Copy configuration `yaml` files: 15 | ```bash 16 | cp /path/to/pipelines/Jasmine/pipeline/*yaml . 17 | ``` 18 | Set up the targeted dataset inside the `data.yaml` file.
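Before launching the pipeline, it can be useful to sanity-check the configuration. The following is a minimal sketch (assuming PyYAML is installed; the checks in the pipeline's `utils.py` are the authoritative ones and also accept additional PacBio spellings such as `PACBIO` or `PBCCS`):

```python
# Quick sanity check of data.yaml before running snakemake.
# Assumes PyYAML; loosely mirrors the validation in pipeline/utils.py.
import yaml

with open("data.yaml") as f:
    config = yaml.safe_load(f)

assert isinstance(config.get("samples"), list) and config["samples"], "at least one sample is required"
assert "ref" in config, "a reference fasta must be specified under the 'ref' key"
for sample in config["samples"]:
    assert sample.get("tech", "").upper() in {"ONT", "PB"}, f"unsupported tech for sample {sample.get('sample')}"
    for path in sample.get("reads_paths", []):
        print(f"{sample['sample']}: {path}")
```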
19 | 20 | Run snakemake (dry run) to preview the automated pipeline: 21 | ```bash 22 | snakemake -s pipeline.snakefile -npr 23 | ``` 24 | If the dry run produces satisfactory results, run `snakemake` with production settings, including a possible cluster setup, multithreading, etc. 25 | 26 | ### Example SLURM run 27 | ```bash 28 | snakemake -s pipeline.snakefile --latency-wait 200 -pr -j 20 --rerun-incomplete --cluster "sbatch --account={cluster.account} --partition={cluster.partition} --job-name={cluster.name} --nodes={cluster.nodes} --cpus-per-task={cluster.nCPUs} --time={cluster.time} --out={cluster.out} --err={cluster.err} --mem={cluster.mem_mb}M" 29 | ``` 30 | which ensures that 31 | * no more than 20 jobs (`-j`) are submitted at a time 32 | * any incomplete results from possible previous failed runs are regenerated (`--rerun-incomplete`) 33 | * a SLURM submission setup is used, which in turn requests a single node of the `parallel` partition per job with: 34 | * a 3-day time limit 35 | * 24G of RAM per node 36 | 37 | ## Pipeline Overview 38 | 39 | 1. Align reads in each sample with [Winnowmap](https://github.com/marbl/Winnowmap) using the recommended parameters for the type of sequencing data being used 40 | 2. Call SVs in each sample with [sniffles](https://github.com/fritzsedlazeck/Sniffles) using sensitive parameters and reporting all supporting reads: `sniffles -m <bam> -v <vcf> --threads <threads> --min_support 2 --max_distance 50 --min_length 20 --num_reads_report -1` 41 | 3. Convert duplications to insertions temporarily for breakpoint refinement and better cross-sample comparison: `jasmine --dup_to_ins --preprocess_only vcf_filelist=<comma-separated vcf list> --comma_filelist` 42 | 4. Refine SVs in each sample with [Iris](https://github.com/mkirsche/Iris/) 43 | 5. Normalize SV types in each sample: `jasmine --preprocess_only --pre_normalize --comma_filelist file_list=<comma-separated vcf list>` 44 | 6. Mark the high-confidence (high-specificity) callset in each sample: `jasmine file_list=<comma-separated vcf list> --comma_filelist --preprocess_only --mark_specific spec_reads=<min support count> spec_len=30` 45 | 7. Remove duplicate calls in each sample: `jasmine file_list=<comma-separated vcf list> max_dist=200 --allow_intrasample out_file=<output vcf> --comma_filelist --nonlinear_dist` 46 | 8. Generate a list of all finalized per-sample VCF files (txt file, one per line) 47 | 9. Merge SVs across samples: `jasmine file_list=<vcf list file> out_file=<merged vcf>` 48 | 10. Convert insertions back to duplications: `jasmine --dup_to_ins --postprocess_only out_file=<merged vcf>` 49 | 11.
Remove low-confidence or imprecise calls: `cat <merged vcf> | grep -v 'IMPRECISE;' | grep -v 'IS_SPECIFIC=0'` 50 | -------------------------------------------------------------------------------- /pipeline/call_svs_sniffles_single.snakefile: -------------------------------------------------------------------------------- 1 | import os 2 | import utils 3 | 4 | if os.path.exists("data.yaml"): 5 | configfile: "data.yaml" 6 | if os.path.exists("tools.yaml"): 7 | configfile: "tools.yaml" 8 | 9 | output_dir = config.get(utils.OUTPUT_DIR, "") 10 | alignment_output_dir = os.path.join(output_dir, utils.ALIGNMENTS) 11 | svs_output_dir = os.path.join(output_dir, utils.SVS) 12 | raw_svs_output_dir = os.path.join(svs_output_dir, utils.RAW) 13 | 14 | utils.ensure_samples_correctness(config) 15 | sample_to_reads_paths = utils.get_samples_to_reads_paths(config) 16 | utils.ensure_ref_correctness(config) 17 | 18 | sniffles_sens_suffix = utils.get_sniffles_sens_suffix(config) 19 | samples_regex = utils.get_samples_regex(sample_to_reads_paths) 20 | 21 | sniffles_config = config.get(utils.TOOLS, {}).get(utils.SNIFFLES, {}) 22 | jasmine_config=config.get(utils.TOOLS, {}).get(utils.JASMINE, {}) 23 | iris_config=config.get(utils.TOOLS, {}).get(utils.IRIS, {}) 24 | tech_regex = utils.get_tech_regex(config) 25 | java_config=config.get(utils.TOOLS, {}).get(utils.JAVA, {}) 26 | sv_sizes_config=config.get(utils.TOOLS, {}).get(utils.SV_SIZES, {}) 27 | 28 | rule raw_sv_tally: 29 | input: os.path.join(raw_svs_output_dir, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + "{file_suffix}") 30 | output: os.path.join(raw_svs_output_dir, utils.STATS, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + "{file_suffix}.stats.sizes.txt") 31 | log: os.path.join(raw_svs_output_dir, utils.LOG, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + "{file_suffix}.stats.sizes.txt.log") 32 | resources: 33 | mem_mb=utils.DEFAULT_CLUSTER_MEM_MB 34 | params: 35 | python=config.get(utils.TOOLS, {}).get(utils.PYTHON, {}).get(utils.PATH, "python3"), 36 | script_path=sv_sizes_config.get(utils.PATH, "sv_sizes.py"), 37 | bins=sv_sizes_config.get(utils.BINS, "1,30,50,100,150,200,350,300,500,750,1000,2000,5000,10000,50000,100000,500000"), 38 | types=sv_sizes_config.get(utils.TYPES, "INS,DEL,DUP,INV,TRA"), 39 | abs_length="" if sv_sizes_config.get(utils.ABS_LENGTH, True) else "--no-abs-length", 40 | info_len_field=sv_sizes_config.get(utils.INFO_LENGTH_FIELD, "SVLEN") 41 | shell: 42 | "{params.python} {params.script_path} {input} -o {output} --bins {params.bins} --types {params.types} {params.abs_length} --info-len-field {params.info_len_field} &> {log}" 43 | 44 | 45 | rule get_raw_specific: 46 | output: os.path.join(raw_svs_output_dir, "{sample," + samples_regex + "}_{tech," + tech_regex + "}_sniffles." + sniffles_sens_suffix + ".specific.vcf") 47 | input: vcf=os.path.join(raw_svs_output_dir, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + "_markedSpec.vcf"), 48 | vcf_list=os.path.join(raw_svs_output_dir, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + ".vcf_list_markedSpec.txt") 49 | resources: 50 | mem_mb=utils.DEFAULT_CLUSTER_MEM_MB 51 | shell: 52 | "awk '($0 ~/^#/ || $0 ~/IS_SPECIFIC=1/)' {input.vcf} > {output}" 53 | 54 | rule mark_specific_in_raw: 55 | output: vcf=temp(os.path.join(raw_svs_output_dir, "{sample," + samples_regex + "}_{tech," + tech_regex + "}_sniffles." + sniffles_sens_suffix + "_markedSpec.vcf")), 56 | vcf_file_list=temp(os.path.join(raw_svs_output_dir, "{sample," + samples_regex + "}_{tech," + tech_regex + "}_sniffles."
+ sniffles_sens_suffix + ".vcf_list_markedSpec.txt")) 57 | input: vcf=os.path.join(raw_svs_output_dir, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + ".vcf"), 58 | coverage=os.path.join(alignment_output_dir, utils.STATS, "{sample}_{tech}.coverage.txt"), 59 | vcf_file_list=os.path.join(raw_svs_output_dir, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + ".vcf_list.txt") 60 | threads: lambda wc: min(cluster_config.get("sensitive_ins_to_dup_conversion", {}).get(utils.NCPUS, utils.DEFAULT_THREAD_CNT), jasmine_config.get(utils.THREADS, utils.DEFAULT_THREAD_CNT)) 61 | log: os.path.join(raw_svs_output_dir, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + "_markedSpec.vcf.log") 62 | resources: 63 | mem_mb=lambda wildcards, threads: jasmine_config.get(utils.MEM_MB_CORE, 20000) + jasmine_config.get(utils.MEM_MB_PER_THREAD, 1000) * threads 64 | params: 65 | output_dir=raw_svs_output_dir, 66 | min_support_fixed=jasmine_config.get(utils.SPECIFIC_MARKED, {}).get(utils.SPEC_READS_FIXED, 10), 67 | min_support_fraction=jasmine_config.get(utils.SPECIFIC_MARKED, {}).get(utils.SPEC_READS_FRACTION, 0.25), 68 | min_length=jasmine_config.get(utils.SPECIFIC_MARKED, {}).get(utils.SPEC_LEN, 30), 69 | java_src=":".join(x for x in [jasmine_config.get(utils.SRC_PATH, ""), iris_config.get(utils.SRC_PATH, "")] if len(x) > 0), 70 | java=java_config.get(utils.PATH, "java"), 71 | run: 72 | min_support=utils.get_min_support(input.coverage, params.min_support_fixed, params.min_support_fraction) 73 | shell("{params.java} -cp {params.java_src} Main file_list={input.vcf_file_list} --preprocess_only --mark_specific out_dir={params.output_dir} spec_reads=" + str(min_support) + " spec_len={params.min_length} out_file=test.vcf &> {log}") 74 | 75 | rule raw_vcf_files_list: 76 | input: os.path.join(raw_svs_output_dir, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + ".vcf") 77 | output: temp(os.path.join(raw_svs_output_dir, "{sample," + samples_regex + "}_{tech," + tech_regex + "}_sniffles." + sniffles_sens_suffix + ".vcf_list.txt")) 78 | resources: 79 | mem_mb=utils.DEFAULT_CLUSTER_MEM_MB 80 | run: 81 | dirname = os.path.dirname(output[0]) 82 | os.makedirs(dirname,exist_ok=True) 83 | with open(output[0], "wt") as dest: 84 | print(input[0], file=dest) 85 | 86 | def get_sniffles_parameter(parameter, sample=None, tech=None, default=None): 87 | if sample is None or tech is None: 88 | return default 89 | result = default 90 | result = sniffles_config.get(parameter, result) 91 | result = sniffles_config.get(tech, {}).get(parameter, result) 92 | for sample_data in config["samples"]: 93 | if sample_data["sample"] == sample and sample_data["tech"] == tech: 94 | result = sample_data.get(utils.SNIFFLES, {}).get(parameter, result) 95 | break 96 | return result 97 | 98 | 99 | rule sensitive_svs_sniffles: 100 | input: os.path.join(alignment_output_dir, "{sample}_{tech}.sort.bam") 101 | output: os.path.join(raw_svs_output_dir, "{sample," + samples_regex + "}_{tech," + tech_regex + "}_sniffles." + sniffles_sens_suffix + ".vcf") 102 | threads: lambda wildcards: min(cluster_config.get("sensitive_svs_sniffles", {}).get(utils.NCPUS, utils.DEFAULT_THREAD_CNT), sniffles_config.get(utils.THREADS, utils.DEFAULT_THREAD_CNT)) 103 | log: os.path.join(raw_svs_output_dir, utils.LOG, "{sample}_{tech}_sniffles." 
+ sniffles_sens_suffix + ".vcf.log") 104 | resources: 105 | mem_mb = lambda wildcards, threads: sniffles_config.get(utils.MEM_MB_CORE, 25000) + sniffles_config.get(utils.MEM_MB_PER_THREAD, 1000) * threads 106 | params: 107 | sniffles = sniffles_config.get(utils.PATH, "sniffles"), 108 | min_length = lambda wc: get_sniffles_parameter(utils.MIN_LENGTH, sample=wc.sample, tech=wc.tech, default=20), 109 | min_support = lambda wc: get_sniffles_parameter(utils.MIN_SUPPORT, sample=wc.sample, tech=wc.tech, default=2), 110 | max_num_splits = lambda wc: get_sniffles_parameter(utils.MAX_NUM_SPLIT_READS, sample=wc.sample, tech=wc.tech, default=10), 111 | max_distance = lambda wc: get_sniffles_parameter(utils.MAX_DISTANCE, sample=wc.sample, tech=wc.tech, default=50), 112 | num_reads_report = lambda wc: get_sniffles_parameter(utils.NUM_READS_REPORT, sample=wc.sample, tech=wc.tech, default=-1), 113 | min_seq_size = lambda wc: get_sniffles_parameter(utils.MIN_SEQ_SIZE, sample=wc.sample, tech=wc.tech, default=1000), 114 | shell: 115 | "{params.sniffles} -m {input} -v {output} --threads {threads} --min_support {params.min_support} --max_distance {params.max_distance} --max_num_splits {params.max_num_splits} --min_length {params.min_length} --num_reads_report {params.num_reads_report} --min_seq_size {params.min_seq_size} &> {log}" 116 | 117 | localrules: raw_vcf_files_list, get_raw_specific 118 | 119 | include: "align_single.snakefile" -------------------------------------------------------------------------------- /pipeline/cut_regions_bam.snakefile: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | files = [] 4 | out_dir = config["out_dir"] 5 | suffix = config.get("name", "regions") 6 | input_by_base = {} 7 | for path in config["bams"]: 8 | basename = os.path.basename(path) 9 | base = os.path.splitext(basename)[0] 10 | input_by_base[base] = path 11 | files.append(os.path.join(out_dir, f"{base}.{suffix}.sort.bam")) 12 | files.append(os.path.join(out_dir, f"{base}.{suffix}.sort.bam.bai")) 13 | 14 | regions = [] 15 | with open(config["regions"], "rt") as source: 16 | for line in source: 17 | data = line.strip().split("\t") 18 | regions.append(f"{data[0]}:{data[1]}-{data[2]}") 19 | regions = " ".join(regions) 20 | 21 | 22 | rule all: 23 | input: files 24 | 25 | 26 | rule index_bam: 27 | output: os.path.join(out_dir, "{base}.{suffix," + suffix + "}.sort.bam.bai") 28 | input: os.path.join(out_dir, "{base}.{suffix}.sort.bam") 29 | shell: 30 | "samtools index {input}" 31 | 32 | rule sort_cut_bam: 33 | output: os.path.join(out_dir, "{base}.{suffix," + suffix+ "}.sort.bam") 34 | input: os.path.join(out_dir, "{base}.{suffix}.bam") 35 | shell: 36 | "samtools sort -@ 4 -O bam -o {output} {input}" 37 | 38 | rule create_cut_bam: 39 | output: temp(os.path.join(out_dir, "{base}.{suffix," + suffix + "}.bam")) 40 | input: bam=lambda wc: input_by_base[wc.base] 41 | params: 42 | regions=regions, 43 | shell: 44 | "samtools view -O bam -o {output} {input} {params.regions}" -------------------------------------------------------------------------------- /pipeline/data.yaml: -------------------------------------------------------------------------------- 1 | samples: 2 | - sample: sample_1 3 | # list of paths for fastq reads. Can be with fastq, fq, fastq.gz, or fq.gz extensions. At least one entry is required. 4 | reads_paths: 5 | - "path1/reads.fq" 6 | - "path2/reads.fq" 7 | - "path3/reads.fq" 8 | # ONT or PB. Required. 
9 | tech: "ONT" 10 | 11 | - sample: sample_2 12 | reads_paths: 13 | - "path1/reads.fq" 14 | existing_alignments: 15 | - "" 16 | tech: "ONT" 17 | 18 | # full path to the reference. Required. 19 | ref: "GRCh38.fa" 20 | 21 | # not required. `alignments`, `svs`, etc subdirs will be created in it. Default is "." 22 | output_dir: "" -------------------------------------------------------------------------------- /pipeline/fix_sam.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | import pysam 5 | 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("sam", type=argparse.FileType("rt"), default=sys.stdin) 10 | parser.add_argument("-o", "--output", type=argparse.FileType("rt"), default=sys.stdout) 11 | args = parser.parse_args() 12 | source = pysam.AlignmentFile(args.sam, "r") 13 | dest = pysam.AlignmentFile(args.output, "w", template=source) 14 | counter = 0 15 | while True: 16 | try: 17 | record = next(source) 18 | dest.write(record) 19 | except OSError as oe: 20 | print(oe, file=sys.stderr) 21 | counter += 1 22 | except StopIteration: 23 | break 24 | print(f"{counter} overall skipped records", file=sys.stderr) 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /pipeline/main_chr_filter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | 5 | def execute(command: str, dry: bool = True): 6 | if dry: 7 | print(command) 8 | else: 9 | os.system(command) 10 | 11 | 12 | def main(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("VCF") 15 | parser.add_argument("--chr_bed", default="/work-zfs/mschatz1/pipelines/resources/ref/human/GRCh38.main_chr.bed") 16 | parser.add_argument("--bad-chr-FLs", default="LKG") 17 | parser.add_argument("-n", "--dry-run", dest="dry", action="store_true") 18 | args = parser.parse_args() 19 | basename = os.path.basename(args.VCF) 20 | new_vcf_name = basename.split(".")[0] + ".all_chr." 
+ ".".join(basename.split(".")[1:]) 21 | execute(f'mv {args.VCF} {new_vcf_name}', dry=args.dry) 22 | execute(f'grep "#" {new_vcf_name} > {args.VCF}', dry=args.dry) 23 | execute(f'bedtools intersect -a {new_vcf_name} -b {args.chr_bed} -u | awk \'{{if ($8 !~/SVTYPE=BND/ || $5 !~ /[\\[\\]][{args.bad_chr_FLs}]/) print $0}}\' >> {args.VCF}', 24 | dry=args.dry) 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /pipeline/marcc_config.yaml: -------------------------------------------------------------------------------- 1 | "__default__": 2 | account: "FILL" 3 | time: "24:0:0" 4 | nCPUs: 2 5 | nodes: 1 6 | partition: "shared,parallel,lrgmem" 7 | mem_mb: "2000" 8 | name: "JOBNAME.{rule}.{wildcards}" 9 | out: "logs/cluster/{rule}.{wildcards}.out" 10 | err: "logs/cluster/{rule}.{wildcards}.err" 11 | 12 | single_alignment: 13 | time: "72:0:0" 14 | partition: "parallel,lrgmem,shared" 15 | nCPUs: 24 16 | mem_mb: "{resources.mem_mb}" 17 | name: "{rule}.{wildcards}" 18 | 19 | single_sam_to_sort_bam: 20 | time: "24:0:0" 21 | partition: "parallel,shared,lrgmem" 22 | nCPUs: 24 23 | mem_mb: "{resources.mem_mb}" 24 | name: "samtools.{rule}.{wildcards}" 25 | 26 | merge_sorted: 27 | time: "24:0:0" 28 | partition: "shared,parallel,lrgmem" 29 | nCPUs: 1 30 | mem_mb: "{resources.mem_mb}" 31 | name: "samtools.{rule}.{wildcards}" 32 | 33 | merged_average_coverage_samtools: 34 | time: "24:0:0" 35 | partition: "shared,parallel,lrgmem" 36 | nCPUs: 1 37 | name: "samtools.{rule}.{wildcards}" 38 | 39 | merged_coverage_mosdepth: 40 | time: "24:0:0" 41 | partition: "shared,parallel,lrgmem" 42 | nCPUs: 24 43 | name: "samtools.{rule}.{wildcards}" 44 | 45 | sensitive_svs_sniffles: 46 | time: "24:0:0" 47 | partition: "shared,parallel,lrgmem" 48 | nCPUs: 5 49 | mem_mb: "{resources.mem_mb}" 50 | name: "sv_inference.{rule}.{wildcards}" 51 | 52 | sensitive_ins_to_dup_conversion: 53 | time: "24:0:0" 54 | partition: "shared,parallel,lrgmem" 55 | nCPUs: 2 56 | mem_mb: "{resources.mem_mb}" 57 | name: "jasmine_pre.{rule}.{wildcards}" 58 | 59 | refined_sensitive_new_sv_types: 60 | time: "72:0:0" 61 | partition: "parallel,lrgmem,shared" 62 | nCPUs: 24 63 | mem_md: "{resources.mem_mb}" 64 | name: "jasmine_pre.{rule}.{wildcards}" 65 | 66 | intra_sample_merging: 67 | time: "24:0:0" 68 | partition: "parallel,lrgmem,shared" 69 | nCPUs: 24 70 | mem_md: "{resources.mem_mb}" 71 | name: "jasmine_pre.{rule}.{wildcards}" 72 | -------------------------------------------------------------------------------- /pipeline/pipeline.snakefile: -------------------------------------------------------------------------------- 1 | import os 2 | import utils 3 | 4 | if os.path.exists("data.yaml"): 5 | configfile: "data.yaml" 6 | if os.path.exists("tools.yaml"): 7 | configfile: "tools.yaml" 8 | 9 | output_dir = config.get(utils.OUTPUT_DIR, "") 10 | alignment_output_dir = os.path.join(output_dir, utils.ALIGNMENTS) 11 | svs_output_dir = os.path.join(output_dir, utils.SVS) 12 | raw_svs_output_dir = os.path.join(svs_output_dir, utils.RAW) 13 | refined_svs_output_dir = os.path.join(svs_output_dir, utils.REFINED) 14 | ins_to_dup_output_dir = os.path.join(refined_svs_output_dir, utils.INS_TO_DUP) 15 | iris_refined_output_dir = os.path.join(refined_svs_output_dir, utils.IRIS_REFINED) 16 | specific_marked_output_dir = os.path.join(refined_svs_output_dir, utils.SPECIFIC_MARKED) 17 | 18 | utils.ensure_samples_correctness(config) 19 | sample_to_reads_paths = 
utils.get_samples_to_reads_paths(config) 20 | utils.ensure_ref_correctness(config) 21 | utils.ensure_enabled_sv_tools(config) 22 | 23 | 24 | # during development this thing guarantees that only the latest supported part of pipeline produces results 25 | overall_expected_files = [] 26 | # print(sample_to_reads_paths) 27 | for (sample, tech) in sample_to_reads_paths.keys(): 28 | if config.get(utils.ENABLE_ALIGNMENT_STATS, True): 29 | overall_expected_files.append(os.path.join(alignment_output_dir, utils.STATS, f"{sample}_{tech}.coverage.txt")) 30 | overall_expected_files.append(os.path.join(alignment_output_dir, utils.STATS, f"{sample}_{tech}.samtools.stats.txt")) 31 | overall_expected_files.append(os.path.join(alignment_output_dir, utils.STATS, f"{sample}_{tech}.alignment.yield.txt")) 32 | overall_expected_files.append(os.path.join(alignment_output_dir, utils.STATS, f"{sample}_{tech}.mosdepth.global.dist.txt")) 33 | if config.get(utils.ENABLE_SV_INFERENCE, True): 34 | for sv_tool in config[utils.SV_TOOLS_ENABLED]: 35 | if sv_tool == "sniffles": 36 | suffix = utils.get_sniffles_sens_suffix(config) + "." 37 | else: 38 | suffix = "" 39 | overall_expected_files.append(os.path.join(raw_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}vcf")) 40 | overall_expected_files.append(os.path.join(raw_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}specific.vcf")) 41 | overall_expected_files.append(os.path.join(raw_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}vcf.stats.sizes.txt")) 42 | overall_expected_files.append(os.path.join(raw_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}specific.vcf.stats.sizes.txt")) 43 | if config.get(utils.ENABLE_SV_REFINEMENT, True): 44 | overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.specific.vcf")) 45 | overall_expected_files.append(os.path.join(refined_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.specific.vcf.stats.sizes.txt")) 46 | overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.vcf")) 47 | overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.norm.vcf")) 48 | overall_expected_files.append(os.path.join(refined_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.vcf.stats.sizes.txt")) 49 | overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.specific.vcf")) 50 | overall_expected_files.append(os.path.join(refined_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}refined.specific.vcf.stats.sizes.txt")) 51 | overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.vcf")) 52 | overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.norm.vcf")) 53 | overall_expected_files.append(os.path.join(refined_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}refined.vcf.stats.sizes.txt")) 54 | if config.get(utils.ENABLE_IS_MERGING, True): 55 | overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.ism.vcf")) 56 | overall_expected_files.append(os.path.join(refined_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.ism.vcf.stats.sizes.txt")) 57 | 
overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.ism.specific.vcf")) 58 | overall_expected_files.append(os.path.join(refined_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.ism.specific.vcf.stats.sizes.txt")) 59 | 60 | rule main: 61 | input: overall_expected_files 62 | 63 | include: "call_svs_sniffles_single.snakefile" 64 | include: "align_single.snakefile" 65 | include: "jasmine_pre.snakefile" -------------------------------------------------------------------------------- /pipeline/rockfish_config.yaml: -------------------------------------------------------------------------------- 1 | "__default__": 2 | account: "FILL" 3 | time: "24:0:0" 4 | nCPUs: 2 5 | nodes: 1 6 | partition: "defq,lrgmem" 7 | mem_mb: "2000" 8 | name: "JOBNAME.{rule}.{wildcards}" 9 | out: "logs/cluster/{rule}.{wildcards}.out" 10 | err: "logs/cluster/{rule}.{wildcards}.err" 11 | 12 | single_alignment: 13 | time: "48:0:0" 14 | partition: "defq,lrgmem" 15 | nCPUs: 24 16 | mem_mb: "{resources.mem_mb}" 17 | name: "alignment.{rule}.{wildcards}" 18 | 19 | single_sam_to_sort_bam: 20 | time: "24:0:0" 21 | partition: "defq,lrgmem" 22 | nCPUs: 24 23 | mem_mb: "{resources.mem_mb}" 24 | name: "samtools_sort.{rule}.{wildcards}" 25 | 26 | merge_sorted: 27 | time: "24:0:0" 28 | partition: "defq,lrgmem" 29 | nCPUs: 1 30 | mem_mb: "{resources.mem_mb}" 31 | name: "samtools_merge.{rule}.{wildcards}" 32 | 33 | merged_average_coverage_samtools: 34 | time: "24:0:0" 35 | partition: "defq,lrgmem" 36 | nCPUs: 1 37 | name: "samtools_cov.{rule}.{wildcards}" 38 | 39 | merged_coverage_mosdepth: 40 | time: "24:0:0" 41 | partition: "defq,lrgmem" 42 | nCPUs: 24 43 | name: "mosdepth.{rule}.{wildcards}" 44 | 45 | sensitive_svs_sniffles: 46 | time: "24:0:0" 47 | partition: "defq,lrgmem" 48 | nCPUs: 5 49 | mem_mb: "{resources.mem_mb}" 50 | name: "sniffles.{rule}.{wildcards}" 51 | 52 | sensitive_ins_to_dup_conversion: 53 | time: "24:0:0" 54 | partition: "defq,lrgmem" 55 | nCPUs: 2 56 | mem_mb: "{resources.mem_mb}" 57 | name: "jasmine_pre_ins_dup.{rule}.{wildcards}" 58 | 59 | refined_sensitive_new_sv_types: 60 | time: "72:0:0" 61 | partition: "defq,lrgmem" 62 | nCPUs: 24 63 | mem_mb: "{resources.mem_mb}" 64 | name: "jasmine_iris_refine.{rule}.{wildcards}" 65 | 66 | intra_sample_merging: 67 | time: "24:0:0" 68 | partition: "defq,lrgmem" 69 | nCPUs: 24 70 | mem_mb: "{resources.mem_mb}" 71 | name: "jasmine_intra.{rule}.{wildcards}" 72 | -------------------------------------------------------------------------------- /pipeline/sv_sizes.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import bisect 3 | import logging 4 | import sys 5 | import cyvcf2 6 | 7 | import utils 8 | 9 | 10 | def bin_string_repr(bin_value: int) -> str: 11 | suffixes = ["", "K", "M", "G"] 12 | suffix_index = 0 13 | while abs(bin_value) >= 1000: 14 | suffix_index += 1 15 | bin_value /= 1000 16 | if bin_value == int(bin_value): 17 | bin_value = int(bin_value) 18 | return f"{bin_value:,}{suffixes[suffix_index]}" 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("VCF", type=str) 24 | parser.add_argument("--bins", default="1,30,50,100,150,200,350,300,500,750,1000,2000,5000,10000,50000,100000,500000") 25 | parser.add_argument("-o", "--output", default=sys.stdout, type=argparse.FileType("wt")) 26 | parser.add_argument("--no-out-header", dest="out_header", action="store_false") 27 |
parser.add_argument("--no-out-total-t", dest="out_total_type", action="store_false") 28 | parser.add_argument("--no-out-total-s", dest="out_total_size", action="store_false") 29 | parser.add_argument("--no-out-indiv-bins", dest="out_indiv_bins", action="store_false") 30 | parser.add_argument("--types", type=str, default="INS,DEL,DUP,INV,TRA") 31 | parser.add_argument("--no-abs-length", dest="abs_length", action="store_false") 32 | parser.add_argument("--info-len-field", type=str, default="SVLEN") 33 | parser.add_argument("--info-type-field", type=str, default="SVTYPE") 34 | args = parser.parse_args() 35 | logger = logging.getLogger("SV-stats") 36 | logger.setLevel(logging.DEBUG) 37 | ch = logging.StreamHandler() 38 | ch.setLevel(logging.DEBUG) 39 | logger.addHandler(ch) 40 | supplied_bins = sorted(set(map(int, args.bins.split(",")))) 41 | bins = [-3000000000] + supplied_bins + [3000000000] 42 | logger.debug(f"bins: [{','.join(map(str, bins))}]") 43 | types = [utils.SVType.from_str(string=s) for s in args.types.split(",")] 44 | if not set(types).issubset({x for x in utils.SVType}): 45 | logger.critical(f"Supplied type list {','.join(map(str, types))} is not a subset of a standardized 5 types.") 46 | exit(1) 47 | bin_counts = {bin_value: {sv_type: 0 for sv_type in types} for bin_value in bins} 48 | reader = cyvcf2.VCF(args.VCF) 49 | for cnt, record in enumerate(reader): 50 | sv_type = utils.get_sv_type(vcf_record=record, info_type_field=args.info_type_field, info_len_field=args.info_len_field, logger=logger) 51 | sv_length = utils.get_sv_length(record, sv_type=sv_type, abs_value=args.abs_length, info_len_field=args.info_len_field, info_type_field=args.info_type_field) 52 | bin_index = bisect.bisect_right(bins, sv_length) 53 | if bin_index < 1: 54 | logger.error(f"Something is wrong with length bin determination for record {str(record)} with type {str(sv_type)}") 55 | if sv_type not in bin_counts[bins[bin_index - 1]]: 56 | continue 57 | bin_counts[bins[bin_index - 1]][sv_type] += 1 58 | type_totals = {sv_type: sum(bin_counts[bin_v][sv_type] for bin_v in bins) for sv_type in types} 59 | header = ["bin"] + types 60 | if args.out_total_size: 61 | header += ["total"] 62 | if args.out_header: 63 | print(",".join(map(str, header)), file=args.output) 64 | if args.out_indiv_bins: 65 | for lv, rv in zip(bins[:-1], bins[1:]): 66 | bin_str_value = f"[{bin_string_repr(lv)} - {bin_string_repr(rv)})" 67 | sv_type_values = [bin_counts[lv][sv_type] for sv_type in types] 68 | bin_total = sum(sv_type_values) 69 | result = f"{bin_str_value}," + ",".join(map(str, sv_type_values)) 70 | if args.out_total_size: 71 | result += f",{bin_total}" 72 | print(result, file=args.output) 73 | if args.out_total_type: 74 | print("total," + ",".join(map(str, (type_totals[sv_type] for sv_type in types))), file=args.output) 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /pipeline/sv_supports.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import bisect 3 | import logging 4 | import sys 5 | 6 | import cyvcf2 7 | 8 | import utils 9 | 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("VCF", type=str) 14 | parser.add_argument("--supports", default="0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20") 15 | parser.add_argument("-o", "--output", default=sys.stdout, type=argparse.FileType("wt")) 16 | parser.add_argument("--no-out-header", 
dest="out_header", action="store_false") 17 | parser.add_argument("--no-out-total-b", dest="out_total_bins", action="store_false") 18 | parser.add_argument("--no-out-type", dest="out_indiv_types", action="store_false") 19 | parser.add_argument("--types", type=str, default="INS,DEL,DUP,INV,TRA") 20 | parser.add_argument("--info-support-field", type=str, default="RE") 21 | parser.add_argument("--info-reads-field", type=str, default="RNAMES") 22 | parser.add_argument("--info-type-field", type=str, default="SVTYPE") 23 | parser.add_argument("--info-len-field", type=str, default="SVLEN") 24 | args = parser.parse_args() 25 | logger = logging.getLogger("SV-stats") 26 | logger.setLevel(logging.DEBUG) 27 | ch = logging.StreamHandler() 28 | ch.setLevel(logging.DEBUG) 29 | logger.addHandler(ch) 30 | bins = sorted(set(map(int, args.supports.split(",")))) 31 | logger.debug(f"bins: [{','.join(map(str, bins))}]") 32 | types = [utils.SVType.from_str(string=s) for s in args.types.split(",")] 33 | if not set(types).issubset({x for x in utils.SVType}): 34 | logger.critical(f"Supplied type list {','.join(map(str, types))} is not a subset of a standardized 5 types.") 35 | exit(1) 36 | if not all(map(lambda x: x >= 0, bins)): 37 | logger.warning(f"Some bins were of negative values. Only non-negative values are permitted. Removing all negative values.") 38 | bins = [x for x in bins if x >= 0] 39 | if 0 not in bins: 40 | logger.warning(f"0 value is not in bins. Adding 0.") 41 | bins = [0] + bins 42 | bin_counts = {bin_value: {sv_type: 0 for sv_type in types} for bin_value in bins} 43 | reader = cyvcf2.VCF(args.VCF) 44 | for cnt, record in enumerate(reader): 45 | sv_type = utils.get_sv_type(vcf_record=record, info_type_field=args.info_type_field, info_len_field=args.info_len_field, logger=logger) 46 | sv_support = utils.get_sv_support_cnt(vcf_record=record, info_re_field=args.info_support_field, info_reads_field=args.info_reads_field) 47 | bin_index = bisect.bisect_right(bins, sv_support) 48 | assert bin_index > 0 49 | if sv_type not in bin_counts[bins[bin_index - 1]]: 50 | continue 51 | bin_counts[bins[bin_index - 1]][sv_type] += 1 52 | header = ["bin"] 53 | if args.out_indiv_types: 54 | header += types 55 | if args.out_total_bins: 56 | header += ["total"] 57 | if args.out_header: 58 | print(",".join(map(str, header)), file=args.output) 59 | type_totals = {sv_type: sum(bin_counts[bin_v][sv_type] for bin_v in bins) for sv_type in types} 60 | for bin_value in bins: 61 | sv_type_values = [] 62 | for sv_type in types: 63 | sv_type_values.append(type_totals[sv_type]) 64 | type_totals[sv_type] -= bin_counts[bin_value][sv_type] 65 | bin_total = sum(sv_type_values) 66 | result = f"{bin_value}" 67 | if args.out_indiv_types: 68 | result += "," + ",".join(map(str, sv_type_values)) 69 | if args.out_total_bins: 70 | result += f",{bin_total}" 71 | print(result, file=args.output) 72 | assert all(map(lambda x: x == 0, type_totals.values())) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /pipeline/tools.yaml: -------------------------------------------------------------------------------- 1 | sv_tools_enabled: 2 | - "sniffles" 3 | 4 | aligner: "ngmlr" 5 | 6 | tools: 7 | tmp_dir: "" 8 | samtools: 9 | path: "samtools" 10 | threads: 100 11 | mem_mb_per_thread: 1000 12 | mem_mb_core: 2000 13 | ngmlr: 14 | reads_cnt_per_run: 200000 15 | path: "ngmlr" 16 | threads: 100 17 | time: "72:0:0" 18 | mem_mb_per_thread: 1000 19 | mem_mb_core: 6000 20 | 
sniffles: 21 | path: "sniffles" 22 | threads: 5 23 | min_support: 2 24 | min_length: 20 25 | max_num_splits: 10 26 | max_distance: 50 27 | num_reads_report: -1 28 | min_seq_size: 1000 29 | time: "24:0:0" 30 | mem_mb_per_thread: 1000 31 | mem_mb_core: 25000 32 | java: 33 | path: "java" 34 | jasmine: 35 | src_path: "" 36 | threads: 2 37 | mem_mb_core: 20000 38 | mem_mb_per_thread: 4000 39 | ins_to_dup: 40 | max_dup_length: 10000 41 | script_name: "InsertionsToDuplications" 42 | specific_marked: 43 | spec_reads_fixed: 10 44 | spec_reads_fraction: 0.25 45 | spec_len: 30 46 | is_merging: 47 | normalize_types: True 48 | use_types: True 49 | use_strands: True 50 | use_edit_distance: False 51 | use_end: False 52 | max_distance: 100 53 | min_distance: -1 54 | threads: 100 55 | strategy: "default" 56 | kd_tree_norm: 2 57 | min_seq_id: 0 58 | max_distance_linear: 0 59 | k_jaccard: 9 60 | iris: 61 | src_path: "" 62 | threads: 100 63 | mem_mb_core: 20000 64 | mem_mb_per_thread: 1000 65 | min_ins_length: 30 66 | max_out_length: 100000 67 | max_ins_dist: 100 68 | max_length_change: 0.25 69 | minimap2: 70 | reads_cnt_per_run: 800000 71 | mem_mb_per_thread: 1000 72 | mem_mb_core: 6000 73 | path: "minimap2" 74 | threads: 100 75 | time: "72:0:0" 76 | racon: 77 | path: "racon" 78 | sv_sizes: 79 | path: "sv_sizes.py" 80 | bins: "1,30,50,100,150,200,350,300,500,750,1000,2000,5000,10000,50000,100000,500000" 81 | types: "INS,DEL,DUP,INV,TRA" 82 | abs_length: True 83 | info_length_field: "SVLEN" 84 | sam_fix: 85 | path: "fix_sam.py" 86 | seqtk: 87 | path: "seqtk" 88 | mosdepth: 89 | path: "mosdepth" 90 | mem_mb_per_thread: 1000 91 | mem_mb_core: 2000 92 | threads: 100 93 | per_base: False 94 | fast_mode: True 95 | window_size: 500 96 | meryl: 97 | path: "meryl" 98 | distinct: 0.9998 99 | k: 15 100 | winnowmap: 101 | reads_cnt_per_run: 800000 102 | mem_mb_per_thread: 1000 103 | mem_mb_core: 6000 104 | path: "winnowmap" 105 | threads: 100 106 | time: "72:0:0" 107 | -------------------------------------------------------------------------------- /pipeline/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from collections import defaultdict 4 | from enum import Enum 5 | import logging 6 | from typing import Optional 7 | 8 | DEFAULT_THREAD_CNT = 99 9 | DEFAULT_CLUSTER_MEM_MB = 4000 10 | 11 | OUTPUT_DIR = "output_dir" 12 | ALIGNMENTS = "alignments" 13 | READS_PATHS = "reads_paths" 14 | SAMPLES = "samples" 15 | LOG = "log" 16 | TOOLS = "tools" 17 | THREADS = "threads" 18 | SAMTOOLS = "samtools" 19 | ALIGNER = "aligner" 20 | MOSDEPTH = "mosdepth" 21 | WINDOW_SIZE = "window_size" 22 | PER_BASE = "per_base" 23 | FAST_MODE = "fast_mode" 24 | PYTHON = "python" 25 | 26 | WINNOWMAP = "winnowmap" 27 | MERYL = "meryl" 28 | DISTINCT = "distinct" 29 | K = "k" 30 | 31 | AWK = "awk" 32 | SEQTK = "seqtk" 33 | NGMLR = "ngmlr" 34 | PATH = "path" 35 | TMP_DIR = "tmp_dir" 36 | TECH = "tech" 37 | REFERENCE = "ref" 38 | READS_CNT_PER_RUN = "reads_cnt_per_run" 39 | 40 | 41 | RAW = "raw" 42 | SVS = "svs" 43 | REFINED = "refined" 44 | INS_TO_DUP = "ins_to_dup" 45 | IRIS_REFINED = "iris_refined" 46 | NORM_SV = "norm_sv" 47 | SPECIFIC_MARKED = "specific_marked" 48 | SPEC_READS_FIXED = "spec_reads_fixed" 49 | SPEC_READS_FRACTION = "spec_reads_fraction" 50 | SPEC_LEN = "spec_len" 51 | MAX_DUP_LENGTH = "max_dup_length" 52 | 53 | SNIFFLES = "sniffles" 54 | MIN_SUPPORT = "min_support" 55 | MIN_LENGTH = "min_length" 56 | MAX_NUM_SPLIT_READS = "max_num_splits" 57 | MAX_DISTANCE
= "max_distance" 58 | NUM_READS_REPORT = "num_reads_report" 59 | MIN_SEQ_SIZE = "min_seq_size" 60 | 61 | SV_TOOLS_ENABLED = "sv_tools_enabled" 62 | 63 | JAVA = "java" 64 | JASMINE = "jasmine" 65 | IRIS = "iris" 66 | SRC_PATH = "src_path" 67 | MINIMAP2 = "minimap2" 68 | RACON = "racon" 69 | SCRIPT_NAME = "script_name" 70 | MIN_INS_LENGTH = "min_ins_length" 71 | MAX_OUT_LENGTH = "max_out_length" 72 | MAX_INS_DIST = "max_ins_dist" 73 | MAX_LENGTH_CHANGE = "max_length_change" 74 | IS_MERGING = "is_merging" 75 | NORMALIZE_TYPES = "normalize_types" 76 | USE_STRANDS = "use_strands" 77 | USE_TYPES = "use_types" 78 | USE_EDIT_DISTANCE = "use_edit_distance" 79 | USE_END = "use_end" 80 | STRATEGY = "strategy" 81 | KD_TREE_NORM = "kd_tree_norm" 82 | MAX_DISTANCE_LINEAR = "max_distance_linear" 83 | MIN_DISTANCE = "min_distance" 84 | MIN_SEQ_ID = "min_seq_id" 85 | K_JACCARD = "k_jaccard" 86 | 87 | SV_SIZES = "sv_sizes" 88 | BINS = "bins" 89 | TYPES = "types" 90 | ABS_LENGTH = "abs_length" 91 | INFO_LENGTH_FIELD = "info_length_field" 92 | 93 | MEM_MB_PER_THREAD = "mem_mb_per_thread" 94 | MEM_MB_CORE = "mem_mb_core" 95 | 96 | NCPUS = "nCPUs" 97 | 98 | STATS = "stats" 99 | 100 | 101 | ENABLE_SV_INFERENCE = "enable_sv_inference" 102 | ENABLE_SV_REFINEMENT = "enable_sv_refinement" 103 | ENABLE_IS_MERGING = "enable_is_merging" 104 | ENABLE_ALIGNMENT_STATS = "enable_alignment_stats" 105 | 106 | 107 | EXISTING_ALIGNMENTS = "existing_alignments" 108 | BGZIP = "bgzip" 109 | 110 | ########## 111 | # 112 | # snakemake data preparation utils 113 | # 114 | ########## 115 | 116 | 117 | def ensure_samples_correctness(config): 118 | if SAMPLES not in config or not isinstance(config[SAMPLES], list) or len(config[SAMPLES]) < 1: 119 | raise ValueError("Configuration data file is missing information about samples or the setup is not dictionary-like") 120 | 121 | 122 | def get_samples_to_reads_paths(config): 123 | samples_to_reads_paths = defaultdict(list) 124 | for sample_data in config["samples"]: 125 | sample_name = sample_data["sample"] 126 | if TECH not in sample_data or sample_data[TECH].lower() not in ["ont", "pb", "pacbio", "pbccs", "pacbioccs"]: 127 | raise ValueError( 128 | f"incorrect or missing tech {sample_data[TECH]} specified for sample {sample_name} in data.yaml. Only ONT or PB are supported, and tech specification is required") 129 | tech = sample_data[TECH].upper() 130 | has_alignment = os.path.exists(os.path.join(config.get(OUTPUT_DIR, ""), ALIGNMENTS, f"{sample_name}_{tech}.sort.bam")) 131 | if not has_alignment: 132 | if READS_PATHS not in sample_data or not isinstance(sample_data[READS_PATHS], list) or len(sample_data[READS_PATHS]) < 1: 133 | raise ValueError( 134 | f"Error when parsing reads paths for sample {sample_name} sample. Make sure the entries are formatted as a list of strings under the {READS_PATHS} key") 135 | if (sample_name, tech) in samples_to_reads_paths: 136 | warning_message = f"sample {sample_name} with read tech {tech} is specified in input data multiple times." 137 | if not config.get("allow_dup_st_entries", False): 138 | raise ValueError(f"Error! {warning_message}") 139 | else: 140 | print(f"WARNING! {warning_message} Proceeding because `allow_dup_st_entries` is set to True", file=sys.stderr) 141 | for read_path in sample_data[READS_PATHS]: 142 | if not read_path.endswith(("fastq", "fq", "fastq.gz", "fq.gz", "fasta", "fasta.gz", "fa", "fa.gz")): 143 | raise ValueError(f"Unsupported input format for read path {read_path}. 
Only 'fastq', 'fq', 'fastq.gz', 'fq.gz', 'fasta', 'fasta.gz', 'fa', and 'fa.gz' are supported") 144 | samples_to_reads_paths[(sample_name, tech)].append(read_path) 145 | if len(samples_to_reads_paths[(sample_name, tech)]) != len(set(samples_to_reads_paths[(sample_name, tech)])): 146 | warning_message = f"sample {sample_name} with read tech {tech} has some read file paths specified multiple times." 147 | if not config.get("allow_dup_reads_entries", False): 148 | raise ValueError(f"Error! {warning_message}") 149 | else: 150 | print(f"WARNING! {warning_message} Proceeding because `allow_dup_reads_entries` is set to True", file=sys.stderr) 151 | else: 152 | samples_to_reads_paths[(sample_name, tech)].append("") 153 | return samples_to_reads_paths 154 | 155 | 156 | def ensure_aligner(config): 157 | if config['aligner'] not in {"ngmlr", "minimap2", "winnowmap"}: 158 | raise ValueError(f'unsupported aligner option {config["aligner"]}, only ngmlr, minimap2, and winnowmap are supported') 159 | 160 | 161 | def get_extra_alignments_paths(config): 162 | samples_to_reads_paths = defaultdict(list) 163 | for sample_data in config["samples"]: 164 | sample_name = sample_data["sample"] 165 | if TECH not in sample_data or sample_data[TECH].lower() not in ["ont", "pb", "pacbio", "pbccs", "pacbioccs"]: 166 | raise ValueError( 167 | f"incorrect or missing tech {sample_data.get(TECH)} specified for sample {sample_name} in data.yaml. Only ONT or PB are supported, and tech specification is required") 168 | tech = sample_data[TECH].upper() 169 | if EXISTING_ALIGNMENTS not in sample_data or not isinstance(sample_data[EXISTING_ALIGNMENTS], list) or len(sample_data[EXISTING_ALIGNMENTS]) < 1: 170 | samples_to_reads_paths[(sample_name, tech)] = [] 171 | continue 172 | for alignment_path in sample_data[EXISTING_ALIGNMENTS]: 173 | if not alignment_path.endswith(("bam")): 174 | raise ValueError( 175 | f"Unsupported extra alignment format for alignment {alignment_path}. Only 'bam' is supported") 176 | samples_to_reads_paths[(sample_name, tech)].append(alignment_path) 177 | return samples_to_reads_paths 178 | 179 | 180 | def get_samples_regex(samples_to_reads_paths): 181 | return f"({'|'.join(x[0] for x in samples_to_reads_paths.keys())})" 182 | 183 | 184 | def get_reads_paths_regex(samples_to_reads_paths): 185 | bases = set() 186 | for (sample_name, tech), reads_paths in samples_to_reads_paths.items(): 187 | for read_path in reads_paths: 188 | bases.add(os.path.basename(read_path)) 189 | return f"({'|'.join(bases)})" 190 | 191 | 192 | def get_tech_regex(config): 193 | techs = set() 194 | for sample_data in config[SAMPLES]: 195 | techs.add(sample_data[TECH]) 196 | return f"({'|'.join(techs)})" 197 | 198 | 199 | def ensure_ref_correctness(config): 200 | if REFERENCE not in config: 201 | raise ValueError(f"No reference fasta file specified under 'ref' key in data.yaml. Reference is required.") 202 | 203 | 204 | def get_sniffles_sens_suffix(config): 205 | min_support = config.get(TOOLS, {}).get(SNIFFLES, {}).get(MIN_SUPPORT, 2) 206 | min_length = config.get(TOOLS, {}).get(SNIFFLES, {}).get(MIN_LENGTH, 20) 207 | return f"s{min_support}l{min_length}" 208 | 209 | 210 | SUPPORTED_SV_TOOLS = {"sniffles"} 211 | 212 | 213 | def ensure_enabled_sv_tools(config): 214 | for tool in config[SV_TOOLS_ENABLED]: 215 | if tool.lower() not in SUPPORTED_SV_TOOLS: 216 | raise ValueError(f"Attempt to enable unsupported SV inference tool {tool}.
Only {','.join(SUPPORTED_SV_TOOLS)} are supported") 217 | 218 | 219 | def get_min_support(coverage_file, min_support_fixed_cnt, min_support_fraction): 220 | coverage = 100 221 | with open(coverage_file, "rt") as source: 222 | for line in source: 223 | coverage = int(float(line.strip().split("=")[1].strip())) 224 | print(f"extracted coverage of {coverage} from file {coverage_file}") 225 | break 226 | result = min(int(min_support_fixed_cnt), int(coverage * min_support_fraction)) 227 | print(f"target min support cnt {result} with min support fixed cnt = {min_support_fixed_cnt} and min_support_fraction = {min_support_fraction}") 228 | return result 229 | 230 | ########## 231 | # 232 | # SV type and length utils 233 | # 234 | ########## 235 | 236 | 237 | class SVType(Enum): 238 | INS = "INS" 239 | DEL = "DEL" 240 | DUP = "DUP" 241 | INV = "INV" 242 | TRA = "TRA" 243 | 244 | def __str__(self) -> str: 245 | return str(self.value) 246 | 247 | def __repr__(self): 248 | return str(self) 249 | 250 | @classmethod 251 | def from_str(cls, string: str) -> "SVType": 252 | for entry in cls: 253 | if string.lower() == entry.value.lower(): 254 | return entry 255 | raise ValueError(f"Could not determine SVType from its supplied str version {string}") 256 | 257 | 258 | def get_chr_from_alt_bnd_record(bnd_string, default: str = "XXX") -> str: 259 | splitter = "[" if "[" in bnd_string else "]" 260 | chr_entry = [x for x in bnd_string.split(splitter) if ":" in x] 261 | if len(chr_entry) < 1: 262 | return default 263 | return chr_entry[0].split(":")[0] 264 | 265 | 266 | def get_sv_type(vcf_record, info_type_field: str = "SVTYPE", info_len_field: str = "SVLEN", logger: Optional[logging.Logger] = None) -> SVType: 267 | logger = logger if logger else logging.getLogger("Dummy") 268 | strands = vcf_record.INFO.get("STRANDS", "??") 269 | chr1 = str(vcf_record.CHROM) 270 | chr2 = str(vcf_record.INFO.get("CHR2", get_chr_from_alt_bnd_record(bnd_string=vcf_record.ALT[0], default=chr1))) 271 | if chr1 != chr2: 272 | return SVType.TRA 273 | if strands in ["--", "++"]: 274 | return SVType.INV 275 | if strands == "-+": 276 | return SVType.DUP 277 | info_svtype = vcf_record.INFO.get(info_type_field, None) 278 | if info_svtype is not None: 279 | if "INS" in info_svtype: 280 | return SVType.INS 281 | if "DEL" in info_svtype: 282 | return SVType.DEL 283 | coord_length = get_sv_length_from_coordinates(vcf_record) 284 | if coord_length in [0, 1]: 285 | return SVType.INS 286 | info_length = vcf_record.INFO.get(info_len_field, None) 287 | if info_length is not None and int(float(info_length)) < 0: 288 | return SVType.DEL 289 | logger.warning(f"Can't determine the SV type for VCF record {str(vcf_record)}. 
Defaulting to DEL") 290 | return SVType.DEL 291 | 292 | 293 | def get_sv_length_from_coordinates(vcf_record) -> Optional[int]: 294 | try: 295 | return abs(int(vcf_record.POS) - vcf_record.INFO["END"]) 296 | except KeyError: 297 | print(f"No END field in VCF record {str(vcf_record)}") 298 | return None 299 | 300 | def get_sv_length_from_ref_alt(vcf_record) -> int: 301 | return abs(len(vcf_record.ALT[0]) - len(vcf_record.REF)) 302 | 303 | 304 | def get_sv_length(vcf_record, abs_value: bool = True, sv_type: Optional[SVType] = None, info_len_field: str = "SVLEN", info_type_field: str = "SVTYPE") -> int: 305 | """ 306 | 0 value is reserved for TRA SVs 307 | """ 308 | sv_type = sv_type if sv_type else get_sv_type(vcf_record=vcf_record, info_type_field=info_type_field) 309 | result = 0 310 | if sv_type == SVType.TRA: 311 | result = 0 312 | elif sv_type in [SVType.DUP, SVType.INV]: 313 | result = int(float(vcf_record.INFO.get(info_len_field, get_sv_length_from_coordinates(vcf_record)))) 314 | elif sv_type == SVType.INS: 315 | result = int(float(vcf_record.INFO.get(info_len_field, get_sv_length_from_ref_alt(vcf_record)))) 316 | elif sv_type == SVType.DEL: 317 | result = int(float(vcf_record.INFO.get(info_len_field, get_sv_length_from_coordinates(vcf_record)))) 318 | if result > 0: 319 | result *= -1 320 | if abs_value: 321 | result = abs(result) 322 | return result 323 | 324 | 325 | def get_sv_support_cnt(vcf_record, info_re_field: str = "RE", info_reads_field: str = "RNAMES") -> int: 326 | re_value = int(vcf_record.INFO.get(info_re_field, 0)) 327 | if re_value != 0: 328 | return re_value 329 | reads = vcf_record.INFO.get(info_reads_field, "").split(",") 330 | if len(reads) > 1 or len(reads[0]) > 0: 331 | return len(reads) 332 | return 0 333 | -------------------------------------------------------------------------------- /plot_merges.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A simple visualization for comparing Jasmine results to those of SURVIVOR on human chr1 3 | It assumes that the exact lines and points to plot, along with their colors, have already 4 | been determined with the companion program src/VisualizationPrep.java. 
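Example invocation (the input file name here is hypothetical):
    python plot_merges.py chr1_vis.txt
where, matching the parsing loop below, each line of the file is "x y [color]" for a point or "x1 y1 x2 y2 [color]" for a merged pair.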
5 | 6 | This program takes in a single command line argument - the name of the file with points/lines to plot 7 | ''' 8 | 9 | # Lots of matplotlib imports - we need Qt5Agg for scrollbar 10 | import matplotlib 11 | matplotlib.use('Qt5Agg') 12 | 13 | # Matplotlib's libraries 14 | import matplotlib.patches as mpatches 15 | import matplotlib.pyplot as plt 16 | from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas 17 | from matplotlib.backends.backend_qt5agg import NavigationToolbar2QT as NavigationToolbar 18 | from matplotlib import collections as mc 19 | from matplotlib.lines import Line2D 20 | 21 | # Other imports 22 | import numpy as np 23 | import pylab as pl 24 | import sys 25 | from PyQt5 import QtWidgets, QtCore 26 | 27 | # No yellow because it's hard to see so use brown instead 28 | colors = ['red', 'orange', 'brown', 'green', 'blue', 'purple', 'pink', 'gray', 'black'] 29 | 30 | xs = [] # x values are genomic positions of variants 31 | ys = [] # y values are the sample ID 32 | cs = [] # Color of each point based on its variant type 33 | xline = [] # x-coordinate pairs for merged variants 34 | yline = [] # y-coordinate pairs for merged variants 35 | linecs = [] # Color of each line segment based on which software merged that pair 36 | 37 | 38 | ''' 39 | Each line will contain either a point (x, y) or a line segment (x1, y1, x2, y2). 40 | Both types of lines have the option of adding a color value afterwards. 41 | ''' 42 | with open(sys.argv[1], "r") as f: 43 | for line in f.readlines(): 44 | tokens = line.split() 45 | if len(tokens) == 2: # Point with no color 46 | xs.append(int(tokens[0])) 47 | ys.append(int(tokens[1])) 48 | cs.append(colors[ys[len(ys)-1]]) 49 | elif len(tokens) == 4: # Line segment with no color 50 | xline.append([int(tokens[0]), int(tokens[2])]) 51 | yline.append([int(tokens[1]), int(tokens[3])]) 52 | linecs.append('black') 53 | elif len(tokens) == 3: # Point with color 54 | xs.append(int(tokens[0])) 55 | ys.append(int(tokens[1])) 56 | cs.append(colors[int(tokens[2])]) 57 | elif len(tokens) == 5: # Line segment with color 58 | xline.append([int(tokens[0]), int(tokens[2])]) 59 | yline.append([int(tokens[1]), int(tokens[3])]) 60 | linecs.append(colors[2*int(tokens[4])]) # Double color value so not too similar 61 | 62 | #plt.scatter(xs, ys, c = cs) 63 | #for i in range(0, len(xline)): 64 | # plt.plot(xline[i], yline[i], c = linecs[i]) 65 | 66 | # A window to show a plot with scrolling along the x-axis enabled 67 | class ScrollableWindow(QtWidgets.QMainWindow): 68 | 69 | # Here step is what proportion of x-axis to show at once 70 | def __init__(self, fig, ax, step=0.01): 71 | plt.close("all") 72 | if not QtWidgets.QApplication.instance(): 73 | self.app = QtWidgets.QApplication(sys.argv) 74 | else: 75 | self.app = QtWidgets.QApplication.instance() 76 | 77 | QtWidgets.QMainWindow.__init__(self) 78 | self.widget = QtWidgets.QWidget() 79 | self.setCentralWidget(self.widget) 80 | self.widget.setLayout(QtWidgets.QVBoxLayout()) 81 | self.widget.layout().setContentsMargins(0,0,0,0) 82 | self.widget.layout().setSpacing(0) 83 | 84 | self.fig = fig 85 | self.ax = ax 86 | self.canvas = FigureCanvas(self.fig) 87 | self.canvas.draw() 88 | self.scroll = QtWidgets.QScrollBar(QtCore.Qt.Horizontal) 89 | self.step = step 90 | self.setupSlider() 91 | self.nav = NavigationToolbar(self.canvas, self.widget) 92 | self.widget.layout().addWidget(self.nav) 93 | self.widget.layout().addWidget(self.canvas) 94 | self.widget.layout().addWidget(self.scroll) 95 | 96 | 
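# Render the assembled figure and hand control to the Qt event loop so the window stays open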
self.canvas.draw() 97 | self.show() 98 | self.app.exec_() 99 | 100 | def setupSlider(self): 101 | self.lims = np.array(self.ax.get_xlim()) 102 | self.scroll.setPageStep(int(self.step*100)) 103 | self.scroll.actionTriggered.connect(self.update) 104 | self.update() 105 | 106 | # Update the window limits based on the scrollbar position 107 | def update(self, evt=None): 108 | r = self.scroll.value()/((1+self.step)*100) 109 | l1 = self.lims[0]+r*np.diff(self.lims) 110 | l2 = l1 + np.diff(self.lims)*self.step 111 | self.ax.set_xlim(l1,l2) 112 | self.fig.canvas.draw_idle() 113 | 114 | fig, ax = plt.subplots() 115 | 116 | # Set the x-axis to go from 0 to the last variant position 117 | plt.xlim(0, max(xs)) 118 | 119 | # Plot the variant points 120 | ax.scatter(xs, ys, c = cs) 121 | 122 | # Add axis labels and title 123 | ax.set_ylabel('Sample ID') 124 | ax.set_xlabel('Position (chr1)') 125 | ax.set_yticks(np.arange(0, max(ys)+1)) 126 | ax.set_title('chr1') 127 | 128 | # Plot the line segments 129 | for i in range(0, len(xline)): 130 | if linecs[i] == colors[4]: 131 | ax.plot(xline[i], yline[i], c = linecs[i], linestyle='dotted') 132 | else: 133 | ax.plot(xline[i], yline[i], c = linecs[i]) 134 | custom_lines = [Line2D([0], [0], color=colors[2], lw=4), 135 | Line2D([0], [0], color=colors[4], lw=4), 136 | Line2D([0], [0], color=colors[6], lw=4)] 137 | 138 | # Add legend for merging software colors 139 | legend1 = plt.legend(custom_lines, ['Jasmine', 'SURVIVOR', 'BOTH'], bbox_to_anchor=(.3, 1.05), ncol = 3) 140 | ax.add_artist(legend1) 141 | 142 | # Add legend for variant type colors 143 | patches = [mpatches.Patch(color=colors[0], label='INS'), mpatches.Patch(color=colors[1], label='DEL'), 144 | mpatches.Patch(color=colors[2], label='DUP'), mpatches.Patch(color=colors[3], label='INV')] 145 | legend2 = plt.legend(handles=patches, bbox_to_anchor=(.9, 1.05), ncol=len(patches)) 146 | ax.add_artist(legend2) 147 | 148 | # Generate the plot with a scrolling window 149 | a = ScrollableWindow(fig,ax) 150 | 151 | 152 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | # Script for running Jasmine 2 | if [ "$(uname -s)" = 'Linux' ]; then 3 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 4 | else 5 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 6 | fi 7 | 8 | java -cp $BINDIR/src:$BINDIR/Iris/src Main "${@:1}" 9 | -------------------------------------------------------------------------------- /smalltest.sh: -------------------------------------------------------------------------------- 1 | if [ "$(uname -s)" = 'Linux' ]; then 2 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 3 | else 4 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 5 | fi 6 | 7 | $BINDIR/build_no_iris.sh 8 | $BINDIR/run.sh file_list=$BINDIR/test_data/a.vcf,$BINDIR/test_data/b.vcf out_file=$BINDIR/test_data/merged.vcf --comma_filelist 9 | 10 | myout=$BINDIR/test_data/merged.vcf 11 | correctout=$BINDIR/test_data/c.vcf 12 | 13 | diff -w $myout $correctout >/dev/null;REPLY=$? 
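# REPLY records diff's exit status: 0 means the merged output matches the expected c.vcf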
14 | echo '' 15 | if [ ${REPLY} -eq 0 ] 16 | then 17 | echo '### TEST SUCCEEDED ###' 18 | else 19 | echo '### TEST FAILED ###' 20 | diff -w $myout $correctout 21 | fi 22 | -------------------------------------------------------------------------------- /split_jasmine: -------------------------------------------------------------------------------- 1 | # Script for running Jasmine's Splitting utility script 2 | if [ "$(uname -s)" = 'Linux' ]; then 3 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 4 | else 5 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 6 | fi 7 | 8 | java -jar $BINDIR/jasmine_split.jar "${@:1}" 9 | -------------------------------------------------------------------------------- /src/AddGenotypes.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Adds genotype information to a merged VCF file based on the genotypes of the original variants 3 | */ 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.PrintWriter; 7 | import java.util.ArrayList; 8 | import java.util.Arrays; 9 | import java.util.HashMap; 10 | import java.util.HashSet; 11 | import java.util.Scanner; 12 | 13 | public class AddGenotypes { 14 | 15 | public static void main(String[] args) throws Exception 16 | { 17 | if(args.length == 3) 18 | { 19 | String inputFile = args[0]; 20 | String fileList = args[1]; 21 | String outFile = args[2]; 22 | addGenotypes(inputFile, fileList, outFile); 23 | } 24 | else 25 | { 26 | System.out.println("Usage: java AddGenotypes <input_vcf> <vcf_filelist> <out_file>"); 27 | return; 28 | } 29 | } 30 | 31 | /* 32 | * To add other FORMAT fields, add their details here and add the logic to initialize them in reformatVariantFormat 33 | */ 34 | static String[] newFieldNames = {"GT", "IS", "OT", "DV", "DR"}; 35 | static String[] newFieldNums = {"1", "1", "1", "1", "1"}; 36 | static String[] newFieldTypes = {"String", "String", "String", "String", "String"}; 37 | static String[] newFieldDescs = new String[] { 38 | "The genotype of the variant", 39 | "Whether or not the variant call was marked as specific due to high read support and length", 40 | "The original type of the variant", 41 | "The number of reads supporting the variant sequence", 42 | "The number of reads supporting the reference sequence" 43 | }; 44 | 45 | /* 46 | * Adds FORMAT fields, including per-sample genotypes, to the variants in a merged VCF file 47 | */ 48 | static void addGenotypes(String inputFile, String fileList, String outputFile) throws Exception 49 | { 50 | // FORMAT fields of all per-file variant calls 51 | ArrayList<FileFormatField> inputFormats = new ArrayList<FileFormatField>(); 52 | 53 | // The names of the samples present across all input files 54 | ArrayList<String> allSampleNamesList = new ArrayList<String>(); 55 | 56 | ArrayList<String> vcfFiles = PipelineManager.getFilesFromList(fileList); 57 | for(String vcfFile : vcfFiles) 58 | { 59 | FileFormatField fileFormats = new FileFormatField(vcfFile, true); 60 | for(String sampleName : fileFormats.sampleNames) 61 | { 62 | allSampleNamesList.add(inputFormats.size() + "_" + sampleName); 63 | } 64 | inputFormats.add(fileFormats); 65 | } 66 | 67 | // Get the number of samples per file to know how much to skip in samples where a variant is absent 68 | int[] sampleCounts = new int[inputFormats.size()]; 69 | for(int i = 0; i= 9) 112 | { 113 | StringBuilder newLastLine = new StringBuilder(""); 114 | for(int i = 0; i<9; i++) 115 | { 116 | newLastLine.append(lastHeaderLineTokens[i]); 117 | if(i < 8) 118 | { 119 | 
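/* Tab-separate the 9 fixed VCF header columns; the per-sample names are appended with their own tabs below */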
newLastLine.append("\t"); 120 | } 121 | } 122 | for(String sampleName : allSampleNames) 123 | { 124 | newLastLine.append("\t" + sampleName); 125 | } 126 | header.lines.set(header.lines.size() - 1, newLastLine.toString()); 127 | } 128 | 129 | header.print(out); 130 | } 131 | 132 | // This is the per-variant merging and printing logic 133 | VcfEntry entry = new VcfEntry(line); 134 | String suppVec = entry.getInfo("SUPP_VEC"); 135 | if(suppVec.length() == 0) 136 | { 137 | // If there is no support vector field, just leave the entry as-is 138 | out.println(entry); 139 | } 140 | else 141 | { 142 | // The list of format fields of all variants merged into this one 143 | ArrayList<VariantFormatField> toMerge = new ArrayList<VariantFormatField>(); 144 | String[] ids = entry.getInfo("IDLIST").split(","); 145 | for(int i = 0; i list, int[] sampleCounts, String suppVec) 178 | { 179 | int numSamples = 0; 180 | for(int count : sampleCounts) 181 | { 182 | numSamples += count; 183 | } 184 | // Initialize empty format field data structure big enough for all of the samples 185 | VariantFormatField res = new VariantFormatField(numSamples, newFieldNames); 186 | 187 | // Update one field at a time 188 | for(int i = 0; i 0) 207 | { 208 | res.sampleFieldValues[sampleIndex][res.getFieldIndex(fieldName)] = val; 209 | } 210 | else 211 | { 212 | res.sampleFieldValues[sampleIndex][res.getFieldIndex(fieldName)] = "."; 213 | } 214 | } 215 | else 216 | { 217 | // Fill fields with "NA" but use "./." or "0|0" for genotype 218 | String val = "NA"; 219 | if(fieldName.equals("GT")) 220 | { 221 | if(Settings.DEFAULT_ZERO_GENOTYPE) 222 | { 223 | val = "0|0"; 224 | } 225 | else 226 | { 227 | val = "./."; 228 | } 229 | } 230 | res.sampleFieldValues[sampleIndex][res.getFieldIndex(fieldName)] = val; 231 | } 232 | sampleIndex++; 233 | } 234 | if(include) listIndex++; 235 | } 236 | } 237 | 238 | return res; 239 | 240 | } 241 | 242 | /* 243 | * Reformats a variant's format fields to match what we want 244 | */ 245 | static VariantFormatField reformatVariantFormat(VariantFormatField oldVariant, VcfEntry entry) throws Exception 246 | { 247 | int numSamples = oldVariant.numSamples(); 248 | VariantFormatField res = new VariantFormatField(numSamples, newFieldNames); 249 | 250 | for(int i = 0; i 0) 259 | { 260 | res.sampleFieldValues[j][i] = oldGt; 261 | } 262 | else 263 | { 264 | if(Settings.DEFAULT_ZERO_GENOTYPE) 265 | { 266 | res.sampleFieldValues[j][i] = "0|0"; 267 | } 268 | else res.sampleFieldValues[j][i] = "./."; 269 | } 270 | } 271 | else if(field.equals("IS")) 272 | { 273 | if(entry.hasInfoField("IS_SPECIFIC")) 274 | { 275 | res.sampleFieldValues[j][i] = entry.getInfo("IS_SPECIFIC"); 276 | } 277 | else 278 | { 279 | res.sampleFieldValues[j][i] = "."; 280 | } 281 | } 282 | else if(field.equals("OT")) 283 | { 284 | if(entry.hasInfoField("OLDTYPE")) 285 | { 286 | res.sampleFieldValues[j][i] = entry.getInfo("OLDTYPE"); 287 | } 288 | else 289 | { 290 | String type = entry.getType(); 291 | if(type.length() > 0) 292 | { 293 | res.sampleFieldValues[j][i] = type; 294 | } 295 | else 296 | { 297 | res.sampleFieldValues[j][i] = "."; 298 | } 299 | } 300 | } 301 | else if(field.equals("DV")) 302 | { 303 | String oldDv = oldVariant.getValue(j, "DV"); 304 | if(oldDv.length() > 0) 305 | { 306 | res.sampleFieldValues[j][i] = oldDv; 307 | } 308 | else 309 | { 310 | res.sampleFieldValues[j][i] = entry.getReadSupport() + ""; 311 | } 312 | } 313 | else if(field.equals("DR")) 314 | { 315 | String oldDr = oldVariant.getValue(j, "DR"); 316 | if(oldDr.length() > 0) 317 | { 318 | 
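/* Keep the original caller's reference-supporting read count when one was provided */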
res.sampleFieldValues[j][i] = oldDr; 319 | } 320 | else 321 | { 322 | res.sampleFieldValues[j][i] = "."; 323 | } 324 | } 325 | } 326 | } 327 | 328 | return res; 329 | } 330 | 331 | /* 332 | * The values of FORMAT fields for an entire VCF file, including the sample names in the header 333 | */ 334 | static class FileFormatField 335 | { 336 | // FORMAT field names and value for each individual variant 337 | ArrayList variantFormats; 338 | 339 | // Names of samples which are present in the file 340 | String[] sampleNames; 341 | 342 | // Map from variant ID to index in variantFormats for fast lookup of particular variants 343 | HashMap idToVariantIndex; 344 | 345 | // The header of the VCF file 346 | VcfHeader header; 347 | 348 | FileFormatField(String fileName, boolean reformat) throws Exception 349 | { 350 | HashSet ids = new HashSet(); 351 | variantFormats = new ArrayList(); 352 | idToVariantIndex = new HashMap(); 353 | header = new VcfHeader(); 354 | Scanner input = new Scanner(new FileInputStream(new File(fileName))); 355 | boolean extractedSampleNames = false; 356 | while(input.hasNext()) 357 | { 358 | String line = input.nextLine(); 359 | if(line.length() == 0) 360 | { 361 | continue; 362 | } 363 | if(line.startsWith("#")) 364 | { 365 | header.addLine(line); 366 | } 367 | else 368 | { 369 | // If this is the first variant, we finished the header, so get sample names from the last header line 370 | if(!extractedSampleNames) 371 | { 372 | extractedSampleNames = true; 373 | 374 | // Get the list of sample names from the last header line 375 | String lastLine = header.lines.get(header.lines.size() - 1); 376 | String[] tabTokens = lastLine.split("\t"); 377 | 378 | // Check if there are actually sample names in the header 379 | if(tabTokens.length > 9) 380 | { 381 | sampleNames = new String[tabTokens.length - 9]; 382 | for(int i = 0; i 9) 434 | { 435 | sampleNames = new String[tabTokens.length - 9]; 436 | for(int i = 0; i 8) 482 | { 483 | sampleFieldValues = new String[entry.tabTokens.length - 9][]; 484 | String formatString = entry.tabTokens[8]; 485 | fieldNames = formatString.split(":"); 486 | for(int i = 0; i 0) 59 | { 60 | String tmp = first; 61 | first = second; 62 | second = tmp; 63 | } 64 | String id = first + "_" + second; 65 | if(Settings.USE_TYPE) 66 | { 67 | id += "_" + getType(); 68 | } 69 | if(Settings.USE_STRAND) 70 | { 71 | id += "_" + getStrand(); 72 | } 73 | return id; 74 | } 75 | 76 | /* 77 | * The second chromosome can be found in either the CHR2 INFO field or the ALT field 78 | */ 79 | public String getChr2() throws Exception 80 | { 81 | if(hasInfoField("CHR2")) 82 | { 83 | return Settings.CHR_NAME_MAP.normalize(getInfo("CHR2")); 84 | } 85 | if(altTokens.length == 1) 86 | { 87 | return getChromosome(); 88 | } 89 | String chrPosToken = altTokens[1]; 90 | return Settings.CHR_NAME_MAP.normalize(chrPosToken.substring(0, chrPosToken.lastIndexOf(':'))); 91 | } 92 | 93 | /* 94 | * The strands may need to be inferred from the ALT square bracket format 95 | */ 96 | public String getStrand() throws Exception 97 | { 98 | String res = getInfo("STRANDS"); 99 | if(res.length() == 0) 100 | { 101 | return strandsFromAltFormat(); 102 | } 103 | return res; 104 | } 105 | 106 | /* 107 | * Determine the strands from the ALT square bracket format 108 | */ 109 | public String strandsFromAltFormat() 110 | { 111 | String alt = getAlt(); 112 | if(alt.startsWith("[")) 113 | { 114 | return "+-"; 115 | } 116 | else if(alt.startsWith("]")) 117 | { 118 | return "--"; 119 | } 120 | else 
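/* Otherwise the bracket follows the local sequence (the t[p[ or t]p] breakend forms) */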
if(alt.contains("[")) 121 | { 122 | return "++"; 123 | } 124 | else if(alt.contains("]")) 125 | { 126 | return "-+"; 127 | } 128 | return ""; 129 | } 130 | 131 | /* 132 | * Gets the first coordinate of the variant 133 | */ 134 | public double getFirstCoord() throws Exception 135 | { 136 | String chr = getChromosome(), chr2 = getChr2(); 137 | if(chr.compareTo(chr2) > 0 || (chr.equals(chr2) && getPos() > getEnd())) 138 | { 139 | if(hasInfoField("AVG_END")) 140 | { 141 | return Double.parseDouble(getInfo("AVG_END")); 142 | } 143 | return getEnd(); 144 | } 145 | if(hasInfoField("AVG_START")) 146 | { 147 | return Double.parseDouble(getInfo("AVG_START")); 148 | } 149 | return getPos(); 150 | } 151 | 152 | /* 153 | * Since length is undefined, get the second coord instead 154 | */ 155 | public double getSecondCoord() throws Exception 156 | { 157 | String chr = getChromosome(), chr2 = getChr2(); 158 | if(chr.compareTo(chr2) > 0 || (chr.equals(chr2) && getPos() > getEnd())) 159 | { 160 | if(hasInfoField("AVG_START")) 161 | { 162 | return Double.parseDouble(getInfo("AVG_START")); 163 | } 164 | return getPos(); 165 | } 166 | if(hasInfoField("AVG_END")) 167 | { 168 | return Double.parseDouble(getInfo("AVG_END")); 169 | } 170 | return getEnd(); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/ChrNameNormalization.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Map for normalizing chromosome names 3 | */ 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.util.HashMap; 7 | import java.util.Scanner; 8 | 9 | public class ChrNameNormalization 10 | { 11 | HashMap normMap = new HashMap(); 12 | ChrNameNormalization() throws Exception 13 | { 14 | normMap = new HashMap(); 15 | if(Settings.DEFAULT_CHR_NORM) 16 | { 17 | // Remove "chr" from chromosome names 18 | for(int i = 1; i<=22; i++) 19 | { 20 | normMap.put("chr" + i, i + ""); 21 | } 22 | normMap.put("chrX", "X"); 23 | normMap.put("chrY", "Y"); 24 | normMap.put("chrM", "MT"); 25 | } 26 | else if(Settings.CHR_NORM_FILE.length() > 0) 27 | { 28 | // Read in chromosome name map 29 | Scanner input = new Scanner(new FileInputStream(new File(Settings.CHR_NORM_FILE))); 30 | while(input.hasNext()) 31 | { 32 | String line = input.nextLine(); 33 | if(line.length() == 0) 34 | { 35 | continue; 36 | } 37 | String[] tokens = line.split(" "); 38 | for(int i = 1; i entries = new ArrayList(); 46 | 47 | int countDup = 0; 48 | 49 | while(input.hasNext()) 50 | { 51 | String line = input.nextLine(); 52 | if(line.startsWith("#")) 53 | { 54 | header.addLine(line); 55 | } 56 | else if(line.length() > 0) 57 | { 58 | VcfEntry ve = new VcfEntry(line); 59 | if(ve.getType().equals("DUP") && ve.getLength() < Settings.MAX_DUP_LEN) 60 | { 61 | countDup++; 62 | 63 | long start = ve.getPos(), end = Long.parseLong(ve.getInfo("END")); 64 | int length = ve.getLength(); 65 | if(end <= start) 66 | { 67 | System.err.printf("Duplication with ID %s has end (%d) <= start (%d), so was not converted\n", 68 | ve.getId(), start, end); 69 | entries.add(ve); 70 | continue; 71 | } 72 | long nstart = start + length - 1, nend = nstart; 73 | 74 | if(ve.getAlt().equals("")) 75 | { 76 | String seq = gq.genomeSubstring(ve.getChromosome(), start, end-1); 77 | 78 | if(length < 100000) 79 | { 80 | ve.setRef(seq.charAt(seq.length()-1)+""); 81 | ve.setAlt(seq.charAt(seq.length()-1)+seq); 82 | } 83 | else 84 | { 85 | ve.setRef("."); 86 | ve.setAlt(""); 87 | } 88 | ve.setInfo("END", 
nend+""); 89 | ve.setInfo("STRANDS", "+-"); 90 | ve.setPos(nstart); 91 | ve.setInfo("OLDTYPE", "DUP"); 92 | } 93 | else 94 | { 95 | ve.setInfo("OLDTYPE", "DUP"); 96 | } 97 | ve.setType("INS"); 98 | 99 | } 100 | else 101 | { 102 | ve.setInfo("OLDTYPE", ve.getType()); 103 | } 104 | entries.add(ve); 105 | } 106 | } 107 | 108 | System.out.println("Number of duplications converted to insertions: " + countDup + " out of " + entries.size() + " total variants"); 109 | 110 | header.addInfoField("OLDTYPE", "1", "String", ""); 111 | header.addInfoField("STRANDS", "1", "String", ""); 112 | header.print(out); 113 | 114 | for(VcfEntry ve : entries) 115 | { 116 | out.println(ve); 117 | } 118 | 119 | input.close(); 120 | out.close(); 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/Forest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * A representation of a forest using a union-find data structure 3 | * It allows nodes to be merged, checking if their components share 4 | * any variants from the same sample. For now, only up to 64 samples 5 | * are supported, but sampleMask can be replaced with actual bitsets 6 | * at a small cost to runtime. 7 | */ 8 | 9 | import java.util.Arrays; 10 | 11 | public class Forest 12 | { 13 | int[] map; // map[i] is negative if root, more negative means bigger set; if nonnegative, then it indicates the parent 14 | long[][] sampleMask; // For each root node, a bitmask of which samples are present in its component 15 | static int samplesPerMask = 63; 16 | 17 | public Forest(Variant[] data) 18 | { 19 | int n = data.length; 20 | int maxSample = 0; 21 | for(int i = 0; i 1 means the command failed, usually because samtools is not installed or on path 36 | if(seqExit > 1) 37 | { 38 | throw new Exception("samtools produced bad exit code (" + seqExit + ") - check path: " + Settings.SAMTOOLS_PATH); 39 | } 40 | } 41 | 42 | /* 43 | * Queries a genomic substring - runs samtools faidx chr:startPos-endPos 44 | */ 45 | String genomeSubstring(String chr, long startPos, long endPos) throws Exception 46 | { 47 | if(startPos > endPos) 48 | { 49 | return ""; 50 | } 51 | String faidxCommand = String.format("%s faidx %s %s:%d-%d", Settings.SAMTOOLS_PATH, filename, chr, startPos, endPos); 52 | Process child = Runtime.getRuntime().exec(faidxCommand); 53 | InputStream seqStream = child.getInputStream(); 54 | Scanner seqInput = new Scanner(seqStream); 55 | 56 | // Make sure it produced an actual output 57 | if(!seqInput.hasNext()) 58 | { 59 | seqInput.close(); 60 | throw new Exception("samtools faidx did not produce an output: " + faidxCommand); 61 | } 62 | // Read in and ignore sequence name 63 | seqInput.next(); 64 | 65 | // Make sure there's a sequence 66 | if(!seqInput.hasNext()) 67 | { 68 | seqInput.close(); 69 | throw new Exception("samtools faidx produced a sequence name but not an actual sequence: " + faidxCommand); 70 | } 71 | 72 | // Concatenate all lines of the output sequence 73 | StringBuilder res = new StringBuilder(""); 74 | while(seqInput.hasNext()) 75 | { 76 | res.append(seqInput.next()); 77 | } 78 | seqInput.close(); 79 | 80 | return res.toString(); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/IgvScreenshotMaker.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Script for visualizing all variants in a merged VCF file 3 | */ 4 | import java.io.File; 5 | import 
java.io.FileInputStream; 6 | import java.io.PrintWriter; 7 | import java.nio.file.Path; 8 | import java.nio.file.Paths; 9 | import java.util.ArrayList; 10 | import java.util.HashMap; 11 | import java.util.HashSet; 12 | import java.util.Scanner; 13 | 14 | public class IgvScreenshotMaker { 15 | 16 | static String vcfFn = ""; 17 | static String bedFn = ""; 18 | static String bamFilelist = ""; 19 | static String vcfFilelist = ""; 20 | static String genomeFn = ""; 21 | 22 | static int PADDING = 100; 23 | 24 | static String outPrefix = ""; 25 | 26 | static boolean SQUISH = false; 27 | static boolean SVG = false; 28 | static boolean PRECISE = false; 29 | 30 | static HashMap infoFilters; 31 | static HashSet grepFilters; 32 | 33 | static void parseArgs(String[] args) 34 | { 35 | infoFilters = new HashMap(); 36 | grepFilters = new HashSet(); 37 | 38 | for(String arg : args) 39 | { 40 | int equalsIdx = arg.indexOf('='); 41 | if(equalsIdx == -1) 42 | { 43 | if(arg.toLowerCase().endsWith("squish")) 44 | { 45 | SQUISH = true; 46 | } 47 | else if(arg.toLowerCase().endsWith("svg")) 48 | { 49 | SVG = true; 50 | } 51 | else if(arg.toLowerCase().endsWith("normalize_chr_names")) 52 | { 53 | Settings.DEFAULT_CHR_NORM = true; 54 | } 55 | else if(arg.toLowerCase().endsWith("precise")) 56 | { 57 | PRECISE = true; 58 | } 59 | else if(arg.toLowerCase().endsWith("specific")) 60 | { 61 | infoFilters.put("IS_SPECIFIC", "1"); 62 | } 63 | } 64 | else 65 | { 66 | String key = arg.substring(0, equalsIdx); 67 | String val = arg.substring(1 + equalsIdx); 68 | if(key.equalsIgnoreCase("vcf_file")) 69 | { 70 | vcfFn = val; 71 | } 72 | else if(key.equalsIgnoreCase("bed_file")) 73 | { 74 | bedFn = val; 75 | } 76 | else if(key.equalsIgnoreCase("genome_file")) 77 | { 78 | genomeFn = val; 79 | } 80 | else if(key.equalsIgnoreCase("bam_filelist")) 81 | { 82 | bamFilelist = val; 83 | } 84 | else if(key.equalsIgnoreCase("vcf_filelist")) 85 | { 86 | vcfFilelist = val; 87 | } 88 | else if(key.equalsIgnoreCase("out_prefix")) 89 | { 90 | outPrefix = val; 91 | } 92 | else if(key.equalsIgnoreCase("info_filter")) 93 | { 94 | String[] tokens = val.split(","); 95 | infoFilters.put(tokens[0], tokens[1]); 96 | } 97 | else if(key.equalsIgnoreCase("grep_filter")) 98 | { 99 | grepFilters.add(val); 100 | } 101 | else if(key.equalsIgnoreCase("padding")) 102 | { 103 | PADDING = Integer.parseInt(val); 104 | } 105 | } 106 | } 107 | 108 | if((vcfFn.length() == 0 && bedFn.length() == 0) || genomeFn.length() == 0 || bamFilelist.length() == 0 || outPrefix.length() == 0) 109 | { 110 | usage(); 111 | System.exit(0); 112 | } 113 | } 114 | 115 | /* 116 | * Print the usage menu 117 | */ 118 | static void usage() 119 | { 120 | System.out.println(); 121 | System.out.println("Jasmine IGV Screenshot Maker"); 122 | System.out.println("Usage: igv_jasmine [args]"); 123 | System.out.println(" Example: igv_jasmine vcf_file=merged.vcf genome_file=genome.fa" 124 | + " bam_filelist=bams.txt out_prefix=igv"); 125 | System.out.println(); 126 | System.out.println("Required args:"); 127 | System.out.println(" vcf_file (String) - the VCF file with merged SVs"); 128 | System.out.println(" genome_file (String) - the FASTA file with the reference genome"); 129 | System.out.println(" bam_filelist (String) - a comma-separated list of BAM files"); 130 | System.out.println(" out_prefix (String) - the prefix of the output directory and filenames"); 131 | System.out.println(); 132 | System.out.println("Optional args:"); 133 | System.out.println(" info_filter=KEY,VALUE - filter by an INFO 
field value (multiple allowed) e.g., info_filter=SUPP_VEC,101"); 134 | System.out.println(" grep_filter=QUERY - filter to only lines containing a given QUERY"); 135 | System.out.println(" vcf_filelist (String) - the txt file with a list of input VCFs in the same order as BAM files"); 136 | System.out.println(" bed_file (String) - a bed file with a list of ranges (use instead of vcf_file)"); 137 | System.out.println(" --precise - require variant to contain \"PRECISE\" as an INFO field"); 138 | System.out.println(" --specific - shorthand for info_filter=IS_SPECIFIC,1"); 139 | System.out.println(" --squish - squishes tracks to fit more reads"); 140 | System.out.println(" --svg - save as an SVG instead of a PNG"); 141 | System.out.println(" --normalize_chr_names - normalize the VCF chromosome names to strip \"chr\""); 142 | System.out.println(); 143 | } 144 | 145 | public static void main(String[] args) throws Exception 146 | { 147 | Settings.CHR_NAME_MAP = new ChrNameNormalization(); 148 | 149 | parseArgs(args); 150 | 151 | Path currentRelativePath = Paths.get(""); 152 | String outDir = currentRelativePath.toAbsolutePath().toString() + "/" + outPrefix; 153 | File outDirFile = new File(outDir); 154 | if(outDirFile.isDirectory()) 155 | { 156 | final File[] files = outDirFile.listFiles(); 157 | for (File f: files) f.delete(); 158 | outDirFile.delete(); 159 | } 160 | outDirFile.mkdir(); 161 | String ofn = outDir + "/" + outPrefix + ".bat"; 162 | 163 | PrintWriter out = new PrintWriter(new File(ofn)); 164 | 165 | out.println("new"); 166 | out.println("genome " + (genomeFn.startsWith("/") ? 167 | genomeFn : (currentRelativePath.toAbsolutePath().toString() + "/" + genomeFn))); 168 | ArrayList bamFiles = PipelineManager.getFilesFromList(bamFilelist); 169 | ArrayList vcfFiles = new ArrayList(); 170 | if(vcfFilelist.length() > 0) 171 | { 172 | vcfFiles = PipelineManager.getFilesFromList(vcfFilelist); 173 | } 174 | ArrayList bedFiles = new ArrayList(); 175 | for(int i = 0; i 0) 181 | { 182 | String fn = currentRelativePath.toAbsolutePath().toString() + "/" + StringUtils.fileBaseName(bamFn); 183 | fn = fn.substring(0, fn.length() - 4) + ".bed"; 184 | out.println("load " + fn); 185 | bedFiles.add(fn); 186 | PrintWriter curOut = new PrintWriter(new File(fn)); 187 | Scanner curInput = new Scanner(new FileInputStream(new File(vcfFiles.get(i)))); 188 | while(curInput.hasNext()) 189 | { 190 | String line = curInput.nextLine(); 191 | if(line.length() == 0 || line.startsWith("#")) 192 | { 193 | continue; 194 | } 195 | VcfEntry entry = VcfEntry.fromLine(line); 196 | String chr = entry.getChromosome(); 197 | int start = (int)entry.getPos(); 198 | int end = (int)entry.getEnd(); 199 | String id = entry.getId(); 200 | String type = entry.getNormalizedType(); 201 | if(type.equalsIgnoreCase("TRA")) 202 | { 203 | String chr2 = entry.getChr2(); 204 | curOut.printf("%s\t%d\t%d\t%s_%s\n", chr, start, start+1, id, type); 205 | curOut.printf("%s\t%d\t%d\t%s_%s\n", chr2, end, end+1, id, type); 206 | } 207 | else 208 | { 209 | if(end - start <= 100000) 210 | { 211 | curOut.printf("%s\t%d\t%d\t%s_%s\n", chr, start, end+1, id, type); 212 | } 213 | } 214 | } 215 | curInput.close(); 216 | curOut.close(); 217 | } 218 | 219 | } 220 | out.println("snapshotDirectory " + outDir); 221 | 222 | if(vcfFn.length() > 0) 223 | { 224 | Scanner input = new Scanner(new FileInputStream(new File(vcfFn))); 225 | while(input.hasNext()) 226 | { 227 | String line = input.nextLine(); 228 | if(line.length() == 0 || line.startsWith("#")) 229 | { 230 | 
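/* Skip blank lines and VCF header lines */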
continue; 231 | } 232 | VcfEntry entry = new VcfEntry(line); 233 | 234 | // Check that the entry passes grep and INFO filters 235 | boolean passesFilters = true; 236 | 237 | for(String s : grepFilters) 238 | { 239 | if(!line.contains(s)) 240 | { 241 | passesFilters = false; 242 | } 243 | } 244 | for(String s : infoFilters.keySet()) 245 | { 246 | if(!entry.hasInfoField(s) || !entry.getInfo(s).equals(infoFilters.get(s))) 247 | { 248 | passesFilters = false; 249 | } 250 | } 251 | 252 | if(PRECISE && !entry.tabTokens[7].startsWith("PRECISE;") && !entry.tabTokens[7].contains(";PRECISE;")) 253 | { 254 | passesFilters = false; 255 | } 256 | 257 | if(!passesFilters) 258 | { 259 | continue; 260 | } 261 | 262 | long start = entry.getPos() - PADDING; 263 | long end = entry.getEnd() + PADDING; 264 | 265 | // Avoid giving non-positive coords 266 | start = Math.max(start, 1); 267 | end = Math.max(end, 1); 268 | 269 | // Make sure entire insertion is covered 270 | if(entry.getNormalizedType().equals("INS")) 271 | { 272 | end = entry.getPos() + entry.getLength() + PADDING; 273 | } 274 | 275 | if(end > start + 100000) 276 | { 277 | continue; 278 | } 279 | 280 | String chr = entry.getChromosome(); 281 | 282 | out.println("goto " + chr + ":" + start + "-" + end); 283 | out.println("sort position"); 284 | if(SQUISH) 285 | { 286 | for(String bamFile : bamFiles) 287 | { 288 | out.println("squish " + bamFile); 289 | } 290 | } 291 | else 292 | { 293 | for(String bamFile : bamFiles) 294 | { 295 | out.println("collapse " + bamFile); 296 | } 297 | } 298 | for(String bedFile : bedFiles) 299 | { 300 | out.println("expand " + bedFile); 301 | } 302 | out.println("snapshot " + entry.getId() + ".png"); 303 | } 304 | 305 | input.close(); 306 | } 307 | 308 | else if(bedFn.length() > 0) 309 | { 310 | Scanner input = new Scanner(new FileInputStream(new File(bedFn))); 311 | while(input.hasNext()) 312 | { 313 | String line = input.nextLine(); 314 | if(line.length() == 0) 315 | { 316 | continue; 317 | } 318 | 319 | String[] tokens = line.split("\t"); 320 | 321 | long start = Long.parseLong(tokens[1]); 322 | long end = Long.parseLong(tokens[2]); 323 | 324 | String chr = tokens[0]; 325 | 326 | out.println("goto " + chr + ":" + start + "-" + end); 327 | out.println("sort position"); 328 | if(SQUISH) 329 | { 330 | for(String bamFile : bamFiles) 331 | { 332 | out.println("squish " + bamFile); 333 | } 334 | } 335 | else 336 | { 337 | for(String bamFile : bamFiles) 338 | { 339 | out.println("collapse " + bamFile); 340 | } 341 | } 342 | out.println("snapshot " + tokens[3] + ".png"); 343 | } 344 | 345 | input.close(); 346 | } 347 | 348 | out.println("exit"); 349 | 350 | out.close(); 351 | } 352 | } 353 | -------------------------------------------------------------------------------- /src/InsertionsToDuplications.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Converts insertions which were originally duplications back to their original SV calls 3 | * Usage: java InsertionsToDuplications input_vcf output_vcf 4 | */ 5 | 6 | import java.io.File; 7 | import java.io.FileInputStream; 8 | import java.io.PrintWriter; 9 | import java.util.Scanner; 10 | 11 | public class InsertionsToDuplications { 12 | static String inputFile = ""; 13 | static String outputFile = ""; 14 | public static void main(String[] args) throws Exception 15 | { 16 | if(args.length != 2) 17 | { 18 | System.out.println("Usage: java InsertionsToDuplications input_vcf output_vcf"); 19 | return; 20 | } 21 | else 22 | { 23 | 
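/* Positional arguments: the input VCF followed by the output VCF */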
inputFile = args[0]; 24 | outputFile = args[1]; 25 | convertFile(inputFile, outputFile); 26 | } 27 | } 28 | 29 | /* 30 | * Convert any insertions which have OLDTYPE marked as DUP back to duplications 31 | */ 32 | static void convertFile(String inputFile, String outputFile) throws Exception 33 | { 34 | Scanner input = new Scanner(new FileInputStream(new File(inputFile))); 35 | 36 | PrintWriter out = new PrintWriter(new File(outputFile)); 37 | 38 | VcfHeader header = new VcfHeader(); 39 | 40 | int countDup = 0; 41 | 42 | boolean headerPrinted = false; 43 | int totalEntries = 0; 44 | 45 | while(input.hasNext()) 46 | { 47 | String line = input.nextLine(); 48 | if(line.startsWith("#")) 49 | { 50 | header.addLine(line); 51 | continue; 52 | } 53 | 54 | if(!headerPrinted) 55 | { 56 | header.addInfoField("REFINEDALT", "1", "String", "For duplications which were changed to insertions and refined, the refined ALT sequence"); 57 | header.addInfoField("STRANDS", "1", "String", ""); 58 | header.print(out); 59 | headerPrinted = true; 60 | } 61 | 62 | VcfEntry ve = new VcfEntry(line); 63 | 64 | totalEntries++; 65 | 66 | if(line.contains("OLDTYPE=DUP") && ve.getType().equals("INS")) 67 | { 68 | countDup++; 69 | 70 | long start = ve.getPos(); 71 | int length = ve.getLength(); 72 | long nstart = start - length + 1, nend = nstart + length; 73 | String refinedAlt = ve.getAlt(); 74 | ve.setPos(nstart); 75 | ve.setInfo("END", nend+""); 76 | ve.setType("DUP"); 77 | ve.setInfo("REFINEDALT", refinedAlt); 78 | ve.setInfo("STRANDS", "-+"); 79 | ve.setRef("."); 80 | ve.setAlt(""); 81 | out.println(ve); 82 | } 83 | else 84 | { 85 | ve.setInfo("REFINEDALT", "."); 86 | out.println(ve); 87 | } 88 | } 89 | 90 | System.out.println("Number of insertions converted back to duplications: " + countDup + " out of " + totalEntries + " total variants"); 91 | 92 | input.close(); 93 | out.close(); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/KDTree.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Data structure for fast k-nearest neighbor queries in variant sets 3 | * For a given query, the k closest points to it in the dataset will be reported, 4 | * breaking ties by variant ID to ensure deterministic behavior. 5 | * 6 | * We assume variants are 2-D points; nearness is based on Euclidean distance or its generalizations. 7 | * 8 | * Uses algorithm described here: 9 | * https://courses.cs.washington.edu/courses/cse599c1/13wi/slides/lsh-hashkernels-annotated.pdf 10 | */ 11 | 12 | import java.util.ArrayDeque; 13 | import java.util.ArrayList; 14 | import java.util.LinkedList; 15 | import java.util.PriorityQueue; 16 | import java.util.Stack; 17 | 18 | public class KDTree 19 | { 20 | Node root; 21 | Node search; 22 | PriorityQueue best; 23 | int cnt; 24 | int querySize; 25 | int K; 26 | 27 | int n; 28 | 29 | /* 30 | * Initializes a KD-tree from a list of variants 31 | */ 32 | public KDTree(Variant[] p) 33 | { 34 | n = p.length; 35 | K = 2; 36 | LinkedList list = new LinkedList(); 37 | for (Variant q : p) list.add(new Node(q)); 38 | root = build(list, 0);//buildNonrecursive(list, 0).get(0); 39 | } 40 | 41 | public KDTree(Variant[] p, boolean recursive) 42 | { 43 | n = p.length; 44 | K = 2; 45 | LinkedList list = new LinkedList(); 46 | for (Variant q : p) list.add(new Node(q)); 47 | root = recursive ? 
build(list, 0) : buildNonrecursive(list).get(0); 48 | } 49 | 50 | private Node build(LinkedList p, int depth) 51 | { 52 | if (p.size() == 0) return null; 53 | Node pivot = p.remove(); 54 | 55 | // Sort the points into left and right subtrees based on current split dimension 56 | LinkedList left = new LinkedList(); 57 | LinkedList right = new LinkedList(); 58 | while (!p.isEmpty()) 59 | { 60 | if (p.peek().planes[depth % K] < pivot.planes[depth % K]) 61 | left.add(p.remove()); 62 | else 63 | right.add(p.remove()); 64 | } 65 | pivot.children[0] = build(left, depth + 1); 66 | pivot.children[1] = build(right, depth + 1); 67 | 68 | return pivot; 69 | } 70 | 71 | /* 72 | * Build the data structure from a list of points without recursion 73 | * This avoids stack overflow issues caused by larger datasets 74 | */ 75 | private ArrayList buildNonrecursive(LinkedList p) 76 | { 77 | ArrayList nodeList = new ArrayList(); 78 | if(p.size() == 0) 79 | { 80 | return null; 81 | } 82 | 83 | // The stack of node lists to process (in place of recursive calls) 84 | ArrayDeque> toProcess = new ArrayDeque>(); 85 | ArrayDeque parents = new ArrayDeque(); 86 | ArrayDeque depths = new ArrayDeque(); 87 | ArrayDeque parentsides = new ArrayDeque(); 88 | 89 | // Initialize root to null to be filled 90 | //nodeList.add(res); 91 | toProcess.addFirst(p); 92 | parents.addFirst(-1); // This is not actually the parent of the root, but it will be ignored anyways 93 | depths.addFirst(0); 94 | parentsides.addFirst(-1); 95 | 96 | while(!toProcess.isEmpty()) 97 | { 98 | // Get information for processing this node from stacks 99 | LinkedList pcur = toProcess.pollFirst(); 100 | int parentcur = parents.pollFirst(); 101 | int depthcur = depths.pollFirst(); 102 | int parentsidecur = parentsides.pollFirst(); 103 | 104 | // Get pivot as the first point in the list 105 | Node pivot = pcur.remove(); 106 | 107 | // Separate this point into points left of the pivot vs. right of the pivot 108 | LinkedList left = new LinkedList(); 109 | LinkedList right = new LinkedList(); 110 | while (!pcur.isEmpty()) 111 | { 112 | Node check = pcur.pollFirst(); 113 | if (check.planes[depthcur % K] < pivot.planes[depthcur % K]) 114 | left.add(check); 115 | else 116 | right.add(check); 117 | } 118 | 119 | //pcur.clear(); 120 | 121 | // Update this node's parent's child-pointer to this node. 
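/* (the root was pushed with parent side -1, so it has no parent pointer to set) */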
122 | if(parentsidecur != -1) 123 | { 124 | nodeList.get(parentcur).children[parentsidecur] = pivot; 125 | } 126 | 127 | pivot.children[0] = null; 128 | pivot.children[1] = null; 129 | nodeList.add(pivot); 130 | 131 | // Add right child to processing stack 132 | if(right.size() > 0) 133 | { 134 | toProcess.addFirst(right); 135 | parents.addFirst(nodeList.size() - 1); 136 | parentsides.addFirst(1); 137 | depths.addFirst(depthcur + 1); 138 | } 139 | 140 | // Add left child to processing stack 141 | if(left.size() > 0) 142 | { 143 | toProcess.addFirst(left); 144 | parents.addFirst(nodeList.size() - 1); 145 | parentsides.addFirst(0); 146 | depths.addFirst(depthcur + 1); 147 | } 148 | } 149 | 150 | return nodeList; 151 | } 152 | 153 | /* 154 | * Used to make sure two KD-trees are the same 155 | */ 156 | static boolean compare(String pref, Node a, Node b) 157 | { 158 | if(a == null && b != null) 159 | { 160 | System.out.println(pref + " only a is null"); 161 | return true; 162 | } 163 | if(b == null && a != null) 164 | { 165 | System.out.println(pref + " only b is null"); 166 | return true; 167 | } 168 | if(a == null && b == null) 169 | { 170 | return false; 171 | } 172 | if(a.planes[0] != b.planes[0] || a.planes[1] != b.planes[1]) 173 | { 174 | System.out.println(pref + " diff value: " + a.planes[0] + " " + a.planes[1] + " " + b.planes[0] + " " + b.planes[1]); 175 | return true; 176 | } 177 | 178 | boolean leftDiff = compare(pref + "L", a.children[0], b.children[0]); 179 | if(leftDiff) 180 | { 181 | return true; 182 | } 183 | else 184 | { 185 | return compare(pref + "R", a.children[1], b.children[1]); 186 | } 187 | 188 | } 189 | 190 | /* 191 | * Gets the k nearest neighbors for a query variant 192 | */ 193 | public Variant[] kNearestNeighbor(Variant p, int k) { 194 | search = new Node(p); 195 | best = new PriorityQueue(); 196 | querySize = k; 197 | search(root, 0); 198 | Variant[] res = new Variant[best.size()]; 199 | int idx = res.length - 1; 200 | while(!best.isEmpty()) 201 | { 202 | res[idx--] = best.poll().v; 203 | } 204 | return res; 205 | } 206 | 207 | /* 208 | * Search the subtree rooted at cur for candidate points in the set of query's k-nearest neighbors 209 | */ 210 | private void search(Node ocur, int odepth) { 211 | Stack curs = new Stack(); 212 | Stack depths = new Stack(); 213 | Stack processedBest = new Stack(); 214 | curs.add(ocur); 215 | depths.add(odepth); 216 | processedBest.push(false); 217 | while(!curs.isEmpty()) 218 | { 219 | Node cur = curs.pop(); 220 | int depth = depths.pop(); 221 | boolean bestDone = processedBest.pop(); 222 | 223 | if(cur == null) continue; 224 | 225 | int betterChild = (int) Math.signum(search.planes[depth % K] - cur.planes[depth % K]) < 0 ? 
0 : 1; 226 | 227 | if(!bestDone) 228 | { 229 | curs.add(cur); 230 | depths.add(depth); 231 | processedBest.add(true); 232 | curs.add(cur.children[betterChild]); 233 | depths.add(depth+1); 234 | processedBest.add(false); 235 | continue; 236 | } 237 | Candidate toAdd = new Candidate(cur.p, cur.p.distance(search.p)); 238 | if (best == null || best.size() < querySize || toAdd.compareTo(best.peek()) > 0) 239 | { 240 | if(best.size() == querySize) 241 | { 242 | best.poll(); 243 | } 244 | best.add(toAdd); 245 | } 246 | if (best.size() < querySize || Math.abs(search.planes[depth % K] - cur.planes[depth % K]) < best.peek().dist) 247 | { 248 | curs.add(cur.children[1 - betterChild]); 249 | depths.add(depth+1); 250 | processedBest.add(false); 251 | } 252 | } 253 | } 254 | 255 | /* 256 | * A node of the KD tree 257 | * Each node has a variant, storing alongside it its values along the split planes, as well as two (possibly null) children 258 | */ 259 | private class Node { 260 | Node[] children; 261 | Variant p; 262 | double[] planes; 263 | public Node(Variant pp) 264 | { 265 | p = pp; 266 | planes = new double[K]; 267 | planes[0] = p.start; 268 | planes[1] = p.end; // add additional dimensions as necessary 269 | children = new Node[2]; 270 | children[0] = null; 271 | children[1] = null; 272 | } 273 | } 274 | 275 | /* 276 | * Candidate k-nearest neighbor of the current query point 277 | */ 278 | private static class Candidate implements Comparable 279 | { 280 | Variant v; 281 | double dist; 282 | Candidate(Variant v, double dist) 283 | { 284 | this.v = v; 285 | this.dist = dist; 286 | } 287 | public int compareTo(Candidate o) 288 | { 289 | if(Math.abs(dist - o.dist) > 1e-9) return Double.compare(o.dist, dist); 290 | if(v.hash != o.v.hash) return o.v.hash - v.hash; 291 | return o.v.id.compareTo(v.id); 292 | 293 | } 294 | } 295 | } 296 | -------------------------------------------------------------------------------- /src/Main.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Main interface for Jasmine 3 | */ 4 | import java.io.File; 5 | import java.util.ArrayList; 6 | import java.util.TreeMap; 7 | 8 | public class Main { 9 | public static void main(String[] args) throws Exception 10 | { 11 | Settings.parseArgs(args); 12 | 13 | // The input file to SV merging may change based on the steps the user wants to run 14 | String currentInputFile = Settings.FILE_LIST; 15 | 16 | if(Settings.USING_FILE_LIST) 17 | { 18 | File f = new File(currentInputFile); 19 | if(!f.exists()) 20 | { 21 | System.out.println("Warning: Input file list " + currentInputFile + " does not exist."); 22 | } 23 | } 24 | 25 | if(!Settings.POSTPROCESS_ONLY) 26 | { 27 | currentInputFile = preprocess(currentInputFile); 28 | } 29 | 30 | if(!Settings.PREPROCESS_ONLY && !Settings.POSTPROCESS_ONLY) 31 | { 32 | runJasmine(currentInputFile); 33 | } 34 | 35 | if(!Settings.PREPROCESS_ONLY) 36 | { 37 | postprocess(currentInputFile); 38 | } 39 | } 40 | static String preprocess(String currentInputFile) throws Exception 41 | { 42 | // Convert the duplications to insertions if the user wants to 43 | if(Settings.CONVERT_DUPLICATIONS) 44 | { 45 | currentInputFile = PipelineManager.convertDuplicationsToInsertions(currentInputFile); 46 | } 47 | 48 | // Mark calls with strong read support and long length as specific calls 49 | if(Settings.MARK_SPECIFIC) 50 | { 51 | currentInputFile = PipelineManager.markSpecificCalls(currentInputFile); 52 | } 53 | 54 | // Run iris if the user specifies that they want to run it 
55 | if(Settings.RUN_IRIS) 56 | { 57 | currentInputFile = PipelineManager.runIris(currentInputFile); 58 | } 59 | 60 | // Normalize the SV types if the user specifies that they want to 61 | if(Settings.PRE_NORMALIZE) 62 | { 63 | currentInputFile = PipelineManager.normalizeTypes(currentInputFile); 64 | } 65 | 66 | return currentInputFile; 67 | } 68 | static void runJasmine(String currentInputFile) throws Exception 69 | { 70 | // Get the variants and bin them into individual graphs 71 | TreeMap<String, ArrayList<Variant>> allVariants = VariantInput.readAllFiles(currentInputFile); 72 | 73 | // Initialize data structure for outputting merged variants 74 | VariantOutput output = new VariantOutput(); 75 | 76 | // Get the number of samples to know the length of the SUPP_VEC field 77 | int sampleCount = VariantInput.countFiles(currentInputFile); 78 | 79 | // Merge each graph in parallel 80 | ParallelMerger pm = new ParallelMerger(allVariants, output, sampleCount); 81 | pm.run(); 82 | 83 | System.out.println("Merging complete - outputting results"); 84 | 85 | // Print the merged variants to a file if they have enough support 86 | output.writeMergedVariants(currentInputFile, Settings.OUT_FILE); 87 | 88 | System.out.println("Number of sets with multiple variants: " + pm.totalMerged.get()); 89 | } 90 | static void postprocess(String currentInputFile) throws Exception 91 | { 92 | // Convert insertions back to duplications as needed 93 | if(Settings.CONVERT_DUPLICATIONS) 94 | { 95 | PipelineManager.convertInsertionsBackToDuplications(); 96 | } 97 | 98 | // Add genotypes 99 | if(Settings.OUTPUT_GENOTYPES) 100 | { 101 | PipelineManager.addGenotypes(currentInputFile); 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/MarkSpecificCalls.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Program for marking calls which fall in a specific callset, given the sensitive callset 3 | * Takes parameters for read support and SV length required for a variant to be specific 4 | */ 5 | 6 | import java.io.File; 7 | import java.io.FileInputStream; 8 | import java.io.PrintWriter; 9 | import java.util.ArrayList; 10 | import java.util.Scanner; 11 | 12 | public class MarkSpecificCalls { 13 | public static void main(String[] args) throws Exception 14 | { 15 | String fn = ""; 16 | String ofn = ""; 17 | int minReadSupport = 0; 18 | int minLength = 0; 19 | if(args.length == 4) 20 | { 21 | fn = args[0]; 22 | ofn = args[1]; 23 | minReadSupport = Integer.parseInt(args[2]); 24 | minLength = Integer.parseInt(args[3]); 25 | convertFile(fn, ofn, minReadSupport, minLength); 26 | } 27 | else 28 | { 29 | System.out.println("Usage: java MarkSpecificCalls vcffile outfile minreadsupport minlength"); 30 | return; 31 | } 32 | } 33 | 34 | /* 35 | * Marks specific calls in inputFile and outputs updated VCF to outputFile 36 | * A variant will be specific if its number of supporting reads is at least 37 | * minReadSupport (or unspecified) and its length (absolute value) is at least minLength 38 | */ 39 | static void convertFile(String inputFile, String outputFile, int minReadSupport, int minLength) throws Exception 40 | { 41 | Scanner input = new Scanner(new FileInputStream(new File(inputFile))); 42 | PrintWriter out = new PrintWriter(new File(outputFile)); 43 | 44 | VcfHeader header = new VcfHeader(); 45 | ArrayList<VcfEntry> entries = new ArrayList<VcfEntry>(); 46 | 47 | while(input.hasNext()) 48 | { 49 | String line = input.nextLine(); 50 | if(line.length() == 0) 51 | { 52 | continue; 
53 | } 54 | if(line.startsWith("#")) 55 | { 56 | header.addLine(line); 57 | } 58 | else 59 | { 60 | VcfEntry entry = VcfEntry.fromLine(line); 61 | boolean inSpecific = false; 62 | int readSupport = entry.getReadSupport(); 63 | 64 | boolean longEnough = entry.getType().equals("TRA") || entry.getType().equals("BND") || Math.abs(entry.getLength()) >= minLength || entry.getLength() == 0; 65 | 66 | if(readSupport >= minReadSupport && longEnough) 67 | { 68 | inSpecific = true; 69 | } 70 | 71 | entry.setInfo("IS_SPECIFIC", inSpecific ? "1" : "0"); 72 | entries.add(entry); 73 | } 74 | } 75 | 76 | header.addInfoField("IS_SPECIFIC", "1", "String", "Whether or not a variant has enough read support and length to be specific"); 77 | header.print(out); 78 | 79 | for(VcfEntry entry : entries) 80 | { 81 | out.println(entry); 82 | } 83 | 84 | input.close(); 85 | out.close(); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/NormalizeTypes.java: -------------------------------------------------------------------------------- 1 | import java.io.File; 2 | import java.io.FileInputStream; 3 | import java.io.PrintWriter; 4 | import java.util.ArrayList; 5 | import java.util.Scanner; 6 | 7 | public class NormalizeTypes { 8 | static String inputFile = ""; 9 | static String outputFile = ""; 10 | public static void main(String[] args) throws Exception 11 | { 12 | if(args.length != 2) 13 | { 14 | System.out.println("Usage: java NormalizeTypes input_vcf output_vcf"); 15 | return; 16 | } 17 | else 18 | { 19 | inputFile = args[0]; 20 | outputFile = args[1]; 21 | Settings.CHR_NAME_MAP = new ChrNameNormalization(); 22 | convertFile(inputFile, outputFile); 23 | } 24 | } 25 | 26 | /* 27 | * Convert types in inputFiles to their normalized types and outputs them to a new file 28 | */ 29 | static void convertFile(String inputFile, String outputFile) throws Exception 30 | { 31 | Scanner input = new Scanner(new FileInputStream(new File(inputFile))); 32 | 33 | PrintWriter out = new PrintWriter(new File(outputFile)); 34 | 35 | VcfHeader header = new VcfHeader(); 36 | ArrayList entries = new ArrayList(); 37 | 38 | while(input.hasNext()) 39 | { 40 | String line = input.nextLine(); 41 | if(line.startsWith("#")) 42 | { 43 | header.addLine(line); 44 | } 45 | else if(line.length() > 0) 46 | { 47 | VcfEntry ve = VcfEntry.fromLine(line); 48 | ve.normalizeType(); 49 | entries.add(ve); 50 | } 51 | } 52 | 53 | header.print(out); 54 | 55 | for(VcfEntry ve : entries) 56 | { 57 | out.println(ve); 58 | } 59 | 60 | input.close(); 61 | out.close(); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/Overlap.java: -------------------------------------------------------------------------------- 1 | /* 2 | * A program for filtering variants based on their overlap with a list of regions. 
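 * Overlap is found with a plane sweep over sorted start/end events rather than pairwise interval checks.
 * Example (mirroring the usage message below): overlap_jasmine vcf_file=merged.vcf bed_file=regions.bed out_file=filtered.vcf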
3 | */ 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.PrintWriter; 7 | import java.util.ArrayList; 8 | import java.util.Collections; 9 | import java.util.HashMap; 10 | import java.util.HashSet; 11 | import java.util.Scanner; 12 | import java.util.TreeSet; 13 | 14 | public class Overlap 15 | { 16 | static String vcfFn = ""; 17 | static String bedFn = ""; 18 | static String ofn = ""; 19 | static String FILTER_MODE = "CONTAINED_IN_REGION"; 20 | static String REPORT_MODE = "REMOVE"; 21 | static String reportInfo = ""; 22 | 23 | static ChrNameNormalization chrNorm; 24 | static void parseArgs(String[] args) 25 | { 26 | for(String arg : args) 27 | { 28 | int equalsIdx = arg.indexOf('='); 29 | if(equalsIdx == -1) 30 | { 31 | 32 | } 33 | else 34 | { 35 | String key = arg.substring(0, equalsIdx); 36 | String val = arg.substring(1 + equalsIdx); 37 | if(key.equalsIgnoreCase("vcf_file")) 38 | { 39 | vcfFn = val; 40 | } 41 | else if(key.equalsIgnoreCase("bed_file")) 42 | { 43 | bedFn = val; 44 | } 45 | else if(key.equalsIgnoreCase("out_file")) 46 | { 47 | ofn = val; 48 | } 49 | else if(key.equalsIgnoreCase("info_report")) 50 | { 51 | reportInfo = val; 52 | } 53 | } 54 | } 55 | 56 | if(reportInfo.length() > 0) 57 | { 58 | REPORT_MODE = "INFO"; 59 | } 60 | 61 | if(vcfFn.length() == 0 || bedFn.length() == 0 || ofn.length() == 0) 62 | { 63 | usage(); 64 | System.exit(0); 65 | } 66 | } 67 | static void usage() 68 | { 69 | System.out.println(); 70 | System.out.println("Jasmine Overlapping"); 71 | System.out.println("Usage: overlap_jasmine [args]"); 72 | System.out.println(" Example: overlap_jasmine vcf_file=merged.vcf bed_file=regions.bed out_file=filtered.vcf"); 73 | System.out.println(); 74 | System.out.println("Required args:"); 75 | System.out.println(" vcf_file (String) - the VCF file with merged SVs"); 76 | System.out.println(" bed_file (String) - a BED file with regions of interest"); 77 | System.out.println(" out_file (String) - the name of the output VCF filtered by regions of interest"); 78 | System.out.println(); 79 | System.out.println("Optional args:"); 80 | System.out.println(" info_report (String) [] - the INFO field to indicate presence in regions instead of removing non-overlapping variants"); 81 | System.out.println(); 82 | } 83 | 84 | public static void main(String[] args) throws Exception 85 | { 86 | parseArgs(args); 87 | 88 | Settings.DEFAULT_CHR_NORM = true; 89 | chrNorm = new ChrNameNormalization(); 90 | 91 | filterVcf(); 92 | } 93 | 94 | static ArrayList<Event> getBedEvents() throws Exception 95 | { 96 | Scanner input = new Scanner(new FileInputStream(new File(bedFn))); 97 | 98 | ArrayList<Event> events = new ArrayList<Event>(); 99 | int idNum = 0; 100 | while(input.hasNext()) 101 | { 102 | String line = input.nextLine(); 103 | if(line.startsWith("#")) 104 | { 105 | continue; 106 | } 107 | String[] tokens = line.split("\t"); 108 | String chr = tokens[0]; 109 | chr = chrNorm.normalize(chr); 110 | int start = Integer.parseInt(tokens[1]); 111 | int end = Integer.parseInt(tokens[2]); 112 | idNum++; 113 | events.add(new Event(chr, start, 1, idNum + "")); 114 | events.add(new Event(chr, end, -1, idNum + "")); 115 | } 116 | input.close(); 117 | 118 | Collections.sort(events); 119 | 120 | return events; 121 | } 122 | 123 | /* 124 | * Gets the start and end events for variants 125 | * Translocations are a special case and are broken up into two event pairs, one for each breakpoint 126 | */ 127 | static ArrayList<Event> getVcfEvents() throws Exception 128 | { 129 | Scanner input = new
Scanner(new FileInputStream(new File(vcfFn))); 130 | 131 | ArrayList events = new ArrayList(); 132 | while(input.hasNext()) 133 | { 134 | String line = input.nextLine(); 135 | if(line.startsWith("#")) 136 | { 137 | continue; 138 | } 139 | VcfEntry entry = VcfEntry.fromLine(line); 140 | String chr = entry.getChromosome(); 141 | chr = chrNorm.normalize(chr); 142 | int start = (int)(entry.getPos()); 143 | int end = (int)(entry.getEnd()); 144 | String id = entry.getId(); 145 | 146 | if(entry.getNormalizedType().equals("TRA")) 147 | { 148 | events.add(new Event(chr, start, 1, id + "_breakpoint1")); 149 | events.add(new Event(chr, start+1, -1, id + "_breakpoint1")); 150 | String chr2 = entry.getChr2(); 151 | if(chr2.length() != 0) 152 | { 153 | events.add(new Event(chr2, end, 1, id + "_breakpoint2")); 154 | events.add(new Event(chr2, end + 1, -1, id + "_breakpoint2")); 155 | } 156 | } 157 | else 158 | { 159 | events.add(new Event(chr, start, 1, id)); 160 | events.add(new Event(chr, end + 1, -1, id)); 161 | } 162 | } 163 | input.close(); 164 | 165 | Collections.sort(events); 166 | 167 | return events; 168 | } 169 | 170 | /* 171 | * Gets a list of overlaps based on variant and region start and end events 172 | * For each variant, it outputs a list of the IDs of regions with which it overlaps 173 | */ 174 | static HashMap> getOverlaps(ArrayList regions, ArrayList variants) 175 | { 176 | // The next region and variant events to consider 177 | int regionIdx = 0, variantIdx = 0; 178 | 179 | // As we do the plane sweep, the list of regions we are currently inside, if any 180 | TreeSet openRegions = new TreeSet(); 181 | // The list of variant intervals we are currently inside 182 | TreeSet openVariants = new TreeSet(); 183 | 184 | HashMap> overlaps = new HashMap>(); 185 | 186 | // Plane sweep time! 
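// The sweep relies on two invariants: each event list is sorted by (chromosome, position),
// with end events ordered before start events at the same position, and
// openRegions/openVariants always hold exactly the intervals whose start event
// has been processed but whose end event has not.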
187 | while(true) 188 | { 189 | // Stop when there are no more events 190 | if(regionIdx == regions.size() && variantIdx == variants.size()) 191 | { 192 | break; 193 | } 194 | 195 | // Whether or not the next event to process is a region event (as opposed to a variant) 196 | boolean nextEventRegion = false; 197 | 198 | // If we are out of regions, take a variant event 199 | if(regionIdx == regions.size()) 200 | { 201 | nextEventRegion = false; 202 | } 203 | 204 | // If we are out of variants, take a region event 205 | else if(variantIdx == variants.size()) 206 | { 207 | nextEventRegion = true; 208 | } 209 | 210 | // If we have both left, compare positions and choose according to overlap scheme 211 | else 212 | { 213 | Event nextRegion = regions.get(regionIdx); 214 | Event nextVariant = variants.get(variantIdx); 215 | 216 | // If region is on earlier chromosome, take that 217 | if(nextRegion.chr.compareTo(nextVariant.chr) < 0) 218 | { 219 | nextEventRegion = true; 220 | } 221 | 222 | // If region is on later chromosome, take variant 223 | else if(nextRegion.chr.compareTo(nextVariant.chr) > 0) 224 | { 225 | nextEventRegion = false; 226 | } 227 | 228 | // If region is at earlier position on same chromosome, take it 229 | else if(nextRegion.pos < nextVariant.pos) 230 | { 231 | nextEventRegion = true; 232 | } 233 | 234 | // If region is at later position on same chromosome, take variant 235 | else if(nextRegion.pos > nextVariant.pos) 236 | { 237 | nextEventRegion = false; 238 | } 239 | 240 | // Now the case where positions are the same - tie handling depends on overlap mode 241 | else if(FILTER_MODE.equalsIgnoreCase("CONTAINED_IN_REGION")) 242 | { 243 | // Order of priority is variant end, region end, region start, variant start 244 | if(nextVariant.type == 1) 245 | { 246 | nextEventRegion = false; 247 | } 248 | else 249 | { 250 | nextEventRegion = true; 251 | } 252 | } 253 | } 254 | 255 | // After deciding what kind of event to use, process it! 
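// For CONTAINED_IN_REGION, a region should only count for a variant if the variant
// lies entirely inside it: each variant starts out credited with every region that
// is open at its start, and a region that closes before the variant does is revoked
// below, so only regions spanning the whole variant survive in its overlap set.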
256 | 257 | // Case where next event is a region breakpoint 258 | if(nextEventRegion) 259 | { 260 | Event next = regions.get(regionIdx); 261 | // Start of region 262 | if(next.type == 1) 263 | { 264 | // Add to list of open regions 265 | openRegions.add(next.id); 266 | } 267 | // End of region 268 | else 269 | { 270 | // Remove from list of open regions 271 | openRegions.remove(next.id); 272 | 273 | for(String openVariant : openVariants) 274 | { 275 | if(overlaps.get(openVariant).contains(next.id)) 276 | { 277 | overlaps.get(openVariant).remove(next.id); 278 | } 279 | } 280 | } 281 | regionIdx++; 282 | } 283 | 284 | // Case where next event is a variant breakpoint 285 | else 286 | { 287 | Event next = variants.get(variantIdx); 288 | // Start of variant 289 | if(next.type == 1) 290 | { 291 | // Add to list of open variants 292 | openVariants.add(next.id); 293 | 294 | // Initialize list of overlaps to all open regions 295 | overlaps.put(next.id, new HashSet()); 296 | for(String openRegion : openRegions) 297 | { 298 | overlaps.get(next.id).add(openRegion); 299 | } 300 | } 301 | // End of variant 302 | else 303 | { 304 | // Remove from list of open variants 305 | openVariants.remove(next.id); 306 | 307 | // If all overlapping regions were removed, take this out of overlap list 308 | if(overlaps.get(next.id).size() == 0) 309 | { 310 | overlaps.remove(next.id); 311 | } 312 | } 313 | variantIdx++; 314 | } 315 | } 316 | return overlaps; 317 | } 318 | 319 | static void filterVcf() throws Exception 320 | { 321 | System.err.println("Getting regions"); 322 | ArrayList bedEvents = getBedEvents(); 323 | System.err.println("Found " + bedEvents.size() + " region breakpoints"); 324 | System.err.println("Getting variants"); 325 | ArrayList vcfEvents = getVcfEvents(); 326 | System.err.println("Found " + vcfEvents.size() + " variant breakpoints"); 327 | System.err.println("Finding overlaps"); 328 | HashMap> overlaps = getOverlaps(bedEvents, vcfEvents); 329 | System.err.println("Found " + overlaps.size() + " variants with at least one overlap"); 330 | System.err.println("Filtering variants"); 331 | Scanner input = new Scanner(new FileInputStream(new File(vcfFn))); 332 | PrintWriter out = new PrintWriter(new File(ofn)); 333 | VcfHeader header = new VcfHeader(); 334 | boolean printedHeader = false; 335 | while(input.hasNext()) 336 | { 337 | String line = input.nextLine(); 338 | if(line.length() == 0) 339 | { 340 | continue; 341 | } 342 | if(line.startsWith("#")) 343 | { 344 | header.addLine(line); 345 | } 346 | else 347 | { 348 | if(!printedHeader) 349 | { 350 | if(REPORT_MODE.equalsIgnoreCase("INFO")) 351 | { 352 | header.addInfoField(reportInfo, "1", "String", "Whether or not the variant is in the regions of interest listed in " + bedFn); 353 | } 354 | header.print(out); 355 | printedHeader = true; 356 | } 357 | VcfEntry entry = VcfEntry.fromLine(line); 358 | String id = entry.getId(); 359 | HashSet curOverlaps = overlaps.getOrDefault(id, null); 360 | boolean hasOverlap = curOverlaps != null; 361 | if(entry.getNormalizedType().equals("TRA")) 362 | { 363 | String id1 = id + "_breakpoint1", id2 = id + "_breakpoint2"; 364 | HashSet firstOverlap = overlaps.getOrDefault(id1, null); 365 | HashSet secondOverlap = overlaps.getOrDefault(id2, null); 366 | hasOverlap = firstOverlap != null && secondOverlap != null; 367 | } 368 | if(REPORT_MODE.equalsIgnoreCase("REMOVE")) 369 | { 370 | if(hasOverlap) 371 | { 372 | out.println(entry); 373 | } 374 | } 375 | if(REPORT_MODE.equalsIgnoreCase("INFO")) 376 | { 377 | 
if(hasOverlap) 378 | { 379 | entry.setInfo(reportInfo, "1"); 380 | } 381 | else 382 | { 383 | entry.setInfo(reportInfo, "0"); 384 | } 385 | out.println(entry); 386 | } 387 | } 388 | } 389 | input.close(); 390 | out.close(); 391 | } 392 | 393 | static class Event implements Comparable 394 | { 395 | String chr; 396 | int pos; 397 | int type; 398 | String id; 399 | Event(String chr, int pos, int type, String id) 400 | { 401 | this.chr = chr; 402 | this.pos = pos; 403 | this.type = type; 404 | this.id = id; 405 | } 406 | @Override 407 | public int compareTo(Event o) 408 | { 409 | if(!chr.equals(o.chr)) 410 | { 411 | return chr.compareTo(o.chr); 412 | } 413 | if(pos != o.pos) return pos - o.pos; 414 | return type - o.type; // Do ends before starts 415 | } 416 | } 417 | } 418 | -------------------------------------------------------------------------------- /src/ParallelMerger.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Multi-threading support for variant merging 3 | * Since each chromosome (and possibly type and strand) is its own graph, 4 | * the algorithm can be parallelized pretty naturally. 5 | * 6 | * The graphs that need to be processed are stored in a queue, and each thread 7 | * processes one graph at a time, querying the queue for the next graph to process 8 | */ 9 | 10 | import java.util.ArrayList; 11 | import java.util.Collections; 12 | import java.util.TreeMap; 13 | import java.util.concurrent.ConcurrentLinkedQueue; 14 | import java.util.concurrent.atomic.AtomicInteger; 15 | 16 | public class ParallelMerger { 17 | 18 | // IDs of graphs left to process 19 | ConcurrentLinkedQueue todo; 20 | 21 | // the variant graphs on which merging will be performed 22 | TreeMap> allVariants; 23 | 24 | // A data structure for holding merged variants to output 25 | VariantOutput output; 26 | 27 | // The number of threads to use 28 | int numThreads; 29 | 30 | // The total number of samples 31 | int sampleCount; 32 | 33 | AtomicInteger totalMerged = new AtomicInteger(0); 34 | 35 | ParallelMerger(TreeMap> allVariants, VariantOutput output, int sampleCount) 36 | { 37 | this.allVariants = allVariants; 38 | this.output = output; 39 | this.numThreads = Settings.THREADS; 40 | System.out.println("Number of threads: " + numThreads); 41 | this.sampleCount = sampleCount; 42 | todo = new ConcurrentLinkedQueue(); 43 | for(String s : allVariants.keySet()) 44 | { 45 | todo.add(s); 46 | } 47 | } 48 | 49 | /* 50 | * Start merging in parallel, initializing all threads 51 | */ 52 | void run() throws Exception 53 | { 54 | // The last thread in the array is the main thread, so it calls 55 | // run() instead of start() and doesn't get joined below 56 | MyThread[] threads = new MyThread[numThreads]; 57 | for(int i = 0; i variantList = allVariants.get(graphID); 87 | Collections.sort(variantList); 88 | VariantMerger vm = new VariantMerger(variantList); 89 | vm.runMerging(); 90 | ArrayList[] res = vm.getGroups(); 91 | output.addGraph(graphID, res, sampleCount); 92 | int merges = 0; 93 | for(ArrayList list : res) 94 | { 95 | if(list.size() > 1) 96 | { 97 | merges++; 98 | } 99 | } 100 | totalMerged.addAndGet(merges); 101 | } 102 | 103 | } 104 | 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/PipelineManager.java: -------------------------------------------------------------------------------- 1 | /* 2 | * A utility for managing pipeline steps for multiple VCF files 3 | * Most of the pre-processing and 
post-processing steps are done on a per-VCF basis, 4 | * so this manager performs them for all files and updates the filelist to point to a 5 | * list of updated files instead of the original ones 6 | */ 7 | 8 | import java.io.File; 9 | import java.io.FileInputStream; 10 | import java.io.PrintWriter; 11 | import java.nio.file.Files; 12 | import java.nio.file.Paths; 13 | import java.util.ArrayList; 14 | import java.util.HashSet; 15 | import java.util.Scanner; 16 | 17 | public class PipelineManager { 18 | 19 | /* 20 | * Convert duplications to insertions for all VCF files and update filelist 21 | * Returns a path to the new filelist 22 | */ 23 | static String convertDuplicationsToInsertions(String fileList) throws Exception 24 | { 25 | ArrayList vcfFiles = getFilesFromList(fileList); 26 | ArrayList newVcfFiles = new ArrayList(); 27 | 28 | HashSet basenames = new HashSet(); 29 | 30 | for(int i = 0; i vcfFiles = getFilesFromList(fileList); 56 | ArrayList newVcfFiles = new ArrayList(); 57 | 58 | HashSet basenames = new HashSet(); 59 | 60 | for(int i = 0; i vcfFiles = getFilesFromList(fileList), bamFiles = getFilesFromList(Settings.BAM_FILE_LIST); 86 | ArrayList newVcfFiles = new ArrayList(); 87 | 88 | // Get any optional arguments to be passed to Iris that the user specified 89 | String[] optionalArgs = Settings.IRIS_ARGS.split(","); 90 | 91 | HashSet basenames = new HashSet(); 92 | 93 | // Refine one VCF file at a time 94 | for(int i = 0; i vcfFiles = getFilesFromList(fileList); 144 | ArrayList newVcfFiles = new ArrayList(); 145 | 146 | HashSet basenames = new HashSet(); 147 | 148 | for(int i = 0; i newVcfFiles) throws Exception 172 | { 173 | if(Settings.USING_FILE_LIST) 174 | { 175 | String newFileList = Settings.OUT_DIR + "/" + StringUtils.addDescriptor(StringUtils.fileBaseName(oldFileList), suffix); 176 | PrintWriter newFileListOut = new PrintWriter(new File(newFileList)); 177 | for(String newVcfFile : newVcfFiles) 178 | { 179 | newFileListOut.println(newVcfFile); 180 | } 181 | newFileListOut.close(); 182 | return newFileList; 183 | } 184 | else 185 | { 186 | StringBuilder res = new StringBuilder(""); 187 | for(int i = 0; i getFilesFromList(String fileList) throws Exception 235 | { 236 | ArrayList res = new ArrayList(); 237 | 238 | if(!Settings.USING_FILE_LIST) 239 | { 240 | String[] fns = fileList.split(","); 241 | for(String fn : fns) res.add(fn); 242 | return res; 243 | } 244 | 245 | if(new File(fileList).exists()) 246 | { 247 | Scanner vcfListInput = new Scanner(new FileInputStream(new File(fileList))); 248 | 249 | while(vcfListInput.hasNext()) 250 | { 251 | String line = vcfListInput.nextLine(); 252 | if(line.length() > 0) 253 | { 254 | res.add(line); 255 | } 256 | } 257 | vcfListInput.close(); 258 | } 259 | 260 | return res; 261 | } 262 | 263 | } 264 | -------------------------------------------------------------------------------- /src/PreSplit.java: -------------------------------------------------------------------------------- 1 | import java.io.File; 2 | import java.io.FileInputStream; 3 | import java.io.PrintWriter; 4 | import java.nio.file.Path; 5 | import java.nio.file.Paths; 6 | import java.util.ArrayList; 7 | import java.util.HashMap; 8 | import java.util.HashSet; 9 | import java.util.Scanner; 10 | 11 | public class PreSplit 12 | { 13 | static String fileList = ""; 14 | static String outputDir = ""; 15 | static int segmentLength = -1; 16 | static boolean transTogether = false; 17 | 18 | static void usage() 19 | { 20 | System.out.println(); 21 | System.out.println("Usage: 
split_jasmine file_list output_dir [segment_length]"); 22 | System.out.println(" Example: split_jasmine file_list=filelist.txt output_dir=/path/to/split_dir segment_length=10m"); 23 | System.out.println(); 24 | System.out.println("Required args:"); 25 | System.out.println(" file_list (String) - A txt file with a line-separated list of VCFs to be split"); 26 | System.out.println(" output_dir (String) - The directory to write the split files to"); 27 | System.out.println(); 28 | System.out.println("Optional args:"); 29 | System.out.println(" segment_length (int) - length of segments to split chromosomes into (default whole-chromosome)"); 30 | System.out.println(" --ignore_strand - allow variants with different strands to be merged"); 31 | System.out.println(" --ignore_type - allow variants with different types to be merged"); 32 | System.out.println(" --combine_translocations - keep all translocations together to reduce number of groups"); 33 | System.out.println(); 34 | } 35 | 36 | static void parseArgs(String[] args) throws Exception 37 | { 38 | for(int i = 0; i 0 && key.charAt(0) == '-') 60 | { 61 | key = key.substring(1); 62 | } 63 | String val = args[i].substring(1 + equalIdx); 64 | 65 | switch(key) 66 | { 67 | case "segment_length": 68 | segmentLength = Settings.parseInt(val); 69 | break; 70 | case "file_list": 71 | fileList = val; 72 | break; 73 | case "output_dir": 74 | outputDir = val; 75 | break; 76 | default: 77 | break; 78 | } 79 | } 80 | } 81 | if(fileList.length() == 0 || outputDir.length() == 0) 82 | { 83 | usage(); 84 | System.exit(0); 85 | } 86 | 87 | } 88 | 89 | public static void main(String[] args) throws Exception 90 | { 91 | parseArgs(args); 92 | 93 | String[] filelists = convertAll(fileList, outputDir, segmentLength); 94 | for(String s : filelists) 95 | { 96 | System.out.println(s); 97 | } 98 | } 99 | 100 | @SuppressWarnings("unchecked") 101 | static String[] convertAll(String fileList, String outDir, int segmentLength) throws Exception 102 | { 103 | if(!outDir.startsWith("/")) 104 | { 105 | Path currentRelativePath = Paths.get(""); 106 | outDir = currentRelativePath.toAbsolutePath().toString() + "/" + outDir; 107 | } 108 | if(!new File(outDir).isDirectory()) 109 | { 110 | new File(outDir).mkdir(); 111 | } 112 | ArrayList vcfFiles = PipelineManager.getFilesFromList(fileList); 113 | int n = vcfFiles.size(); 114 | HashMap[] splitMaps = new HashMap[n]; 115 | HashSet allKeys = new HashSet(); 116 | for(int i = 0; i convertFile(String inputFile, String outputPrefix, int segmentLength) throws Exception 178 | { 179 | VcfHeader header = new VcfHeader(); 180 | Scanner input = new Scanner(new FileInputStream(new File(inputFile))); 181 | HashMap res = new HashMap(); 182 | HashMap writerMap = new HashMap(); 183 | while(input.hasNext()) 184 | { 185 | String line = input.nextLine(); 186 | if(line.length() == 0) 187 | { 188 | continue; 189 | } 190 | if(line.startsWith("#")) 191 | { 192 | header.addLine(line); 193 | continue; 194 | } 195 | VcfEntry entry = VcfEntry.fromLine(line); 196 | String graphId = VariantInput.fromVcfEntry(entry, 0).graphID; 197 | if(segmentLength != -1 && !entry.getNormalizedType().equals("TRA")) 198 | { 199 | graphId = graphId + "_" + ((entry.getPos() / segmentLength) * segmentLength); 200 | } 201 | if(entry.getNormalizedType().equals("TRA") && transTogether) 202 | { 203 | graphId = "TRA"; 204 | } 205 | if(!res.containsKey(graphId)) 206 | { 207 | String ofn = outputPrefix + "_" + graphId + ".vcf"; 208 | PrintWriter out = new PrintWriter(new File(ofn)); 209 | 
header.print(out); 210 | res.put(graphId, ofn); 211 | writerMap.put(graphId, out); 212 | } 213 | PrintWriter out = writerMap.get(graphId); 214 | out.println(line); 215 | } 216 | for(String key : writerMap.keySet()) 217 | { 218 | writerMap.get(key).close(); 219 | } 220 | return res; 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /src/StringUtils.java: -------------------------------------------------------------------------------- 1 | import java.util.HashMap; 2 | 3 | /* 4 | * A collection of String functions 5 | */ 6 | public class StringUtils { 7 | 8 | /* 9 | * The sequence identity of two strings based on their edit distance 10 | */ 11 | static double editDistanceSimilarity(String s, String t) 12 | { 13 | int n = s.length(), m = t.length(); 14 | int[][] editDistance = new int[n+1][m+1]; 15 | for(int i = 1; i<=m; i++) editDistance[0][i] = i; 16 | for(int i = 1; i<=n; i++) editDistance[i][0] = i; 17 | for(int i = 1; i<=n; i++) 18 | { 19 | for(int j = 1; j<=m; j++) 20 | { 21 | boolean sameChar = s.charAt(i-1) == t.charAt(j-1); 22 | int bestDistance = editDistance[i-1][j-1] + (sameChar ? 0 : 1); 23 | bestDistance = Math.min(bestDistance, 1 + editDistance[i-1][j]); 24 | bestDistance = Math.min(bestDistance, 1 + editDistance[i][j-1]); 25 | editDistance[i][j] = bestDistance; 26 | } 27 | } 28 | return 1.0 - 1.0 * editDistance[n][m] / Math.max(n, m); 29 | } 30 | 31 | /* 32 | * Gets the frequency of each k-mer in a string, skipping over non-base characters 33 | */ 34 | static HashMap<Integer, Integer> countKmers(String s, int k) 35 | { 36 | HashMap<Integer, Integer> kmerCount = new HashMap<Integer, Integer>(); 37 | 38 | // The number of basepair characters (ACGT) we have seen so far 39 | int baseCount = 0; 40 | 41 | // The encoded (2 bits per character) value of the current kmer so far 42 | int kmer = 0; 43 | 44 | // Use a sliding window to get all of the kmer codes and add them to the frequency map 45 | for(int i = 0; i= k) 66 | { 67 | if(kmerCount.containsKey(kmer)) 68 | { 69 | kmerCount.put(kmer, kmerCount.get(kmer)+1); 70 | } 71 | else 72 | { 73 | kmerCount.put(kmer, 1); 74 | } 75 | } 76 | } 77 | } 78 | return kmerCount; 79 | } 80 | 81 | /* 82 | * The sequence identity of two strings based on their kmer Jaccard similarity 83 | */ 84 | static double jaccardSimilarity(String s, String t) 85 | { 86 | int k = Settings.K_JACCARD; 87 | 88 | // Get the frequencies of kmers in s 89 | HashMap<Integer, Integer> sKmerFreq = countKmers(s, k); 90 | if(sKmerFreq.size() <= 0) 91 | { 92 | return 1.0; 93 | } 94 | 95 | // Get the frequencies of kmers in t 96 | HashMap<Integer, Integer> tKmerFreq = countKmers(t, k); 97 | if(tKmerFreq.size() <= 0) 98 | { 99 | return 1.0; 100 | } 101 | 102 | // Compute the min and max count of each kmer to get the intersection and union, respectively 103 | int intersection = 0, union = 0; 104 | 105 | // Iterate over everything in s - this includes both kmers distinct to s and kmers in both 106 | for(int sKmer : sKmerFreq.keySet()) 107 | { 108 | int sFrequency = sKmerFreq.get(sKmer); 109 | int tFrequency = tKmerFreq.getOrDefault(sKmer, 0); 110 | intersection += Math.min(sFrequency, tFrequency); 111 | union += Math.max(sFrequency, tFrequency); 112 | } 113 | 114 | // Add the kmers unique to t to the union 115 | for(int tKmer : tKmerFreq.keySet()) 116 | { 117 | if(!sKmerFreq.containsKey(tKmer)) 118 | { 119 | union += tKmerFreq.get(tKmer); 120 | } 121 | } 122 | 123 | // Compute the Jaccard similarity as the intersection size divided by the union size 124 | return 1.0 * intersection / union; 125 | 126 | } 127 | 128 | /* 129 |
Assumes input is a filename, and adds "_" followed by the descriptor right before the file extension 130 | */ 131 | static String addDescriptor(String input, String desc) 132 | { 133 | int idx = input.lastIndexOf("."); 134 | if(idx == -1) 135 | { 136 | return input + "_" + desc; 137 | } 138 | 139 | String before = input.substring(0, idx); 140 | String after = input.substring(idx); 141 | return before + "_" + desc + after; 142 | } 143 | 144 | /* 145 | * Gets the basename of a file from its path by removing the directory name 146 | */ 147 | static String fileBaseName(String path) 148 | { 149 | int idx = path.lastIndexOf('/'); 150 | if(idx == -1) 151 | { 152 | return path; 153 | } 154 | return path.substring(1 + idx); 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /src/TestKDTree.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Basic test for making sure my KD tree is behaving reasonably 3 | */ 4 | 5 | public class TestKDTree { 6 | public static void main(String[] args) 7 | { 8 | Variant[] data = new Variant[] { 9 | new Variant(0, "var1", 10, 5, "chr1", null), 10 | new Variant(0, "var2", 1, 5, "chr1", null), 11 | new Variant(0, "var3", 18, 5, "chr1", null), 12 | new Variant(0, "var4", 12, 7, "chr1", null), 13 | new Variant(0, "var5", 10, 5, "chr1", null), 14 | new Variant(0, "var6", 30, 30, "chr1", null), 15 | new Variant(0, "var7", 0, 0, "chr1", null) 16 | }; 17 | 18 | KDTree kdt = new KDTree(data); 19 | for(int i = 0; i -------------------------------------------------------------------------------- /src/Variant.java: -------------------------------------------------------------------------------- 5 | public class Variant implements Comparable<Variant> 6 | { 7 | // Which sample number the variant came from 8 | int sample; 9 | 10 | // Variant ID, assumed to be unique for all variants. It has the sample number and "_" added to the beginning to ensure this. 11 | String id; 12 | 13 | // End should be start+length for insertions 14 | double start, end; 15 | 16 | // Store chromosome, and optionally type and strand 17 | String graphID; 18 | 19 | // This is initialized and used internally for bookkeeping and does not come from VCF 20 | int index; 21 | 22 | // For insertions, the sequence being inserted, or null otherwise 23 | String seq; 24 | 25 | // The maximum distance a variant can be away to merge with this one 26 | int maxDist; 27 | 28 | // The minimum sequence similarity another variant needs to merge with this one if both are insertions 29 | double minSeqId; 30 | 31 | // The start and end of an interval for checking overlap 32 | double[] interval; 33 | 34 | int hash; 35 | static int hash(String infoString) 36 | { 37 | long res = 0; 38 | int mod = (int)(1e9+7); 39 | char[] cs = infoString.toCharArray(); 40 | for(char c : cs) 41 | { 42 | res = ((res * 17) + c)%mod; 43 | } 44 | return (int)res; 45 | } 46 | 47 | /* 48 | * Returns the distance from the variant's (start, end) pair to a given (x, y) point 49 | */ 50 | double distFromPoint(double x, double y) 51 | { 52 | double dStart = start - x; 53 | double dEnd = end - y; 54 | int norm = Settings.KD_TREE_NORM; 55 | if(norm == 2) 56 | { 57 | return Math.sqrt(dStart * dStart + dEnd * dEnd); 58 | } 59 | else 60 | { 61 | double powSum = Math.abs(Math.pow(dStart, norm)) + Math.abs(Math.pow(dEnd, norm)); 62 | return Math.pow(powSum, 1.0 / norm); 63 | } 64 | } 65 | 66 | Variant(int sample, String id, double start, double end, String graphID, String seq, int maxDist, double minSeqId) 67 | { 68 | this.sample = sample; 69 | this.id = id; 70 | this.start = start; 71 | this.end = end; 72 | this.graphID = graphID; 73 | if(minSeqId > 0) this.seq = seq; 74 | this.maxDist = maxDist; 75 | this.minSeqId = minSeqId; 76 | hash = 0; 77 |
interval = null; 78 | } 79 | 80 | Variant(int sample, String id, double start, double end, String graphID, String seq) 81 | { 82 | this.sample = sample; 83 | this.id = id; 84 | this.start = start; 85 | this.end = end; 86 | this.graphID = graphID; 87 | if(minSeqId > 0) this.seq = seq; 88 | this.maxDist = Settings.MAX_DIST; 89 | this.minSeqId = Settings.MIN_SEQUENCE_SIMILARITY; 90 | interval = null; 91 | } 92 | 93 | /* 94 | * The distance between two variants based on differences in their start/end coordinates 95 | * The metric used is a generalization of Euclidean distance 96 | */ 97 | double distance(Variant v) 98 | { 99 | return distFromPoint(v.start, v.end); 100 | } 101 | 102 | /* 103 | * The similarity score of two variants, which is based on sequence similarity for pairs of insertions and 1 otherwise 104 | */ 105 | double stringSimilarity(Variant v) 106 | { 107 | // If either sequence is null, the variant is either non-insertion, or has no sequence, which we don't want to penalize for 108 | if(seq == null || v.seq == null) 109 | { 110 | return 1.0; 111 | } 112 | 113 | if(Settings.USE_EDIT_DISTANCE) 114 | { 115 | return StringUtils.editDistanceSimilarity(seq, v.seq); 116 | } 117 | else 118 | { 119 | return StringUtils.jaccardSimilarity(seq, v.seq); 120 | } 121 | } 122 | 123 | /* 124 | * Whether or not the sequence similarity of two variants is high enough for them to be merged 125 | */ 126 | boolean passesStringSimilarity(Variant v) 127 | { 128 | if(seq == null || v.seq == null) 129 | { 130 | return true; 131 | } 132 | 133 | String s = seq, t = v.seq; 134 | 135 | double similarityNeeded = Math.min(minSeqId, v.minSeqId); 136 | 137 | // If there is no sequence identity requirement, don't compute the score and just return true 138 | if(similarityNeeded <= 0) 139 | { 140 | return true; 141 | } 142 | 143 | int minLength = Math.min(s.length(), t.length()); 144 | int maxLength = s.length() + t.length() - minLength; 145 | 146 | if(minLength < maxLength * similarityNeeded - 1e-9) 147 | { 148 | return false; 149 | } 150 | 151 | return stringSimilarity(v) >= similarityNeeded - 1e-9; 152 | } 153 | 154 | /* 155 | * Human-readable format for printing some of the variant information 156 | */ 157 | public String toString() 158 | { 159 | return "id: " + id + ", sample: " + sample + ", start: " + start + ", end: " + end; 160 | } 161 | 162 | public int compareTo(Variant o) 163 | { 164 | if(hash != o.hash) return Long.compare(hash, o.hash); 165 | if(start != o.start) return Double.compare(start, o.start); 166 | return id.compareTo(o.id); 167 | } 168 | 169 | public boolean passesOverlap(Variant v) 170 | { 171 | if(interval == null || v.interval == null) 172 | { 173 | return true; 174 | } 175 | double maxStart = Math.max(interval[0], v.interval[0]); 176 | double minEnd = Math.min(interval[1], v.interval[1]); 177 | if(minEnd <= maxStart + 1E-9) 178 | { 179 | return false; 180 | } 181 | double maxIntervalSize = Math.max(interval[1] - interval[0], v.interval[1] - v.interval[0]); 182 | return minEnd - maxStart + 1e-9 >= maxIntervalSize * Settings.OVERLAP_REQUIRED; 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /src/VariantInput.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Methods for reading VCF entries from VCF files and dividing 3 | * the entries into separate groups by graph ID 4 | */ 5 | 6 | import java.io.File; 7 | import java.io.FileInputStream; 8 | import java.util.ArrayList; 9 | import 
java.util.HashMap; 10 | import java.util.HashSet; 11 | import java.util.Scanner; 12 | import java.util.TreeMap; 13 | 14 | public class VariantInput { 15 | 16 | // How many samples were merged to produce each input file 17 | static HashMap previouslyMergedSamples = new HashMap(); 18 | 19 | /* 20 | * Count the number of VCF files in a list 21 | */ 22 | public static int countFiles(String fileList) throws Exception 23 | { 24 | return PipelineManager.getFilesFromList(fileList).size(); 25 | } 26 | 27 | /* 28 | * Get a list of all variants from a group of files, binning them by graphID 29 | */ 30 | @SuppressWarnings("unchecked") 31 | public static TreeMap> readAllFiles(String fileList) throws Exception 32 | { 33 | ArrayList fileNames = PipelineManager.getFilesFromList(fileList); 34 | 35 | TreeMap>[] variantsPerFile = new TreeMap[fileNames.size()]; 36 | for(int i = 0; i> res = new TreeMap>(); 41 | for(int i = 0; i()); 48 | } 49 | for(Variant v : variantsPerFile[i].get(s)) 50 | { 51 | res.get(s).add(v); 52 | } 53 | } 54 | } 55 | return res; 56 | } 57 | 58 | /* 59 | * Get a list of variants binned by graphID for a single VCF file 60 | */ 61 | private static TreeMap> getSingleList(String filename, int sample) throws Exception 62 | { 63 | if(filename.endsWith(".gz")) 64 | { 65 | System.err.println("Warning: " + filename + " ends with .gz, but (b)gzipped VCFs are not accepted"); 66 | } 67 | Scanner input = new Scanner(new FileInputStream(new File(filename))); 68 | ArrayList allVariants = new ArrayList(); 69 | HashSet ids = new HashSet(); 70 | if(!previouslyMergedSamples.containsKey(sample)) 71 | { 72 | previouslyMergedSamples.put(sample, 1); 73 | } 74 | while(input.hasNext()) 75 | { 76 | String line = input.nextLine(); 77 | if(line.length() == 0 || line.startsWith("#")) 78 | { 79 | continue; 80 | } 81 | if(line.length() >=2 && line.charAt(0) == 31 && (line.charAt(1) == 65533 || line.charAt(1) == 139)) 82 | { 83 | throw new Exception(filename + " is a gzipped file, but only unzipped VCFs are accepted"); 84 | } 85 | VcfEntry entry = VcfEntry.fromLine(line); 86 | if(!previouslyMergedSamples.containsKey(sample)) 87 | { 88 | if(entry.getInfo("SUPP_VEC_EXT").length() > 0) 89 | { 90 | previouslyMergedSamples.put(sample, entry.getInfo("SUPP_VEC_EXT").length()); 91 | } 92 | else if(entry.getInfo("SUPP_VEC").length() > 0) 93 | { 94 | previouslyMergedSamples.put(sample, entry.getInfo("SUPP_VEC").length()); 95 | } 96 | else 97 | { 98 | previouslyMergedSamples.put(sample, 1); 99 | } 100 | } 101 | if(ids.contains(entry.getId())) 102 | { 103 | String oldId = entry.getId(); 104 | int index = 1; 105 | while(true) 106 | { 107 | String newId = oldId + "_duplicate" + index; 108 | if(!ids.contains(newId)) 109 | { 110 | entry.setId(newId); 111 | break; 112 | } 113 | else 114 | { 115 | index++; 116 | } 117 | } 118 | System.err.println("Warning: Duplicate variant ID " + oldId + " in " + filename + "; Replacing with " + entry.getId()); 119 | } 120 | ids.add(entry.getId()); 121 | allVariants.add(fromVcfEntry(entry, sample)); 122 | 123 | } 124 | 125 | System.out.println(filename + " has " + allVariants.size() + " variants"); 126 | input.close(); 127 | 128 | return divideIntoGraphs(allVariants); 129 | } 130 | 131 | /* 132 | * Take a list of variants and bin them by graphID 133 | */ 134 | private static TreeMap> divideIntoGraphs(ArrayList data) 135 | { 136 | TreeMap> groups = new TreeMap>(); 137 | for(Variant v : data) 138 | { 139 | String graphID = v.graphID; 140 | if(!groups.containsKey(graphID)) 141 | { 142 | 
groups.put(graphID, new ArrayList<Variant>()); 143 | } 144 | groups.get(graphID).add(v); 145 | } 146 | return groups; 147 | } 148 | 149 | /* 150 | * From a line of a VCF file, extract the information needed for merging 151 | * and return it as a Variant object 152 | */ 153 | public static Variant fromVcfEntry(VcfEntry entry, int sample) throws Exception 154 | { 155 | double start = entry.getFirstCoord(); 156 | double end = entry.getSecondCoord(); 157 | 158 | entry.setId(sample + "_" + entry.getId()); 159 | 160 | String id = entry.getGraphID(); 161 | 162 | String seq = null; 163 | if(entry.getType().equals("INS")) 164 | { 165 | String entrySeq = entry.getSeq(); 166 | if(entrySeq.length() > 0) 167 | { 168 | seq = entrySeq; 169 | } 170 | } 171 | 172 | // Default distance threshold model is constant, so set to that first 173 | int maxDist = Settings.MAX_DIST; 174 | double minSeqId = Settings.MIN_SEQUENCE_SIMILARITY; 175 | 176 | // Then, check if there is a per-variant distance threshold 177 | String maxDistInfo = entry.getInfo("JASMINE_DIST"); 178 | if(maxDistInfo.length() > 0) 179 | { 180 | maxDist = Integer.parseInt(maxDistInfo); 181 | } 182 | 183 | // Next check if a per-sample distance threshold was set 184 | else if(Settings.PER_SAMPLE_DISTS != null && Settings.PER_SAMPLE_DISTS.length > sample) 185 | { 186 | maxDist = Settings.PER_SAMPLE_DISTS[sample]; 187 | } 188 | 189 | // Next, check if there is a length-based threshold 190 | else if(Settings.USE_LINEAR_THRESHOLD && Settings.MAX_DIST_LINEAR > 0) 191 | { 192 | maxDist = (int)(Settings.MAX_DIST_LINEAR * Math.abs(entry.getLength()) + 0.5); 193 | if(Settings.MAX_DIST_SET) 194 | { 195 | maxDist = Math.min(maxDist, Settings.MAX_DIST); 196 | } 197 | if(Settings.MIN_DIST != -1) 198 | { 199 | maxDist = Math.max(maxDist, Settings.MIN_DIST); 200 | } 201 | } 202 | 203 | // Check for per-variant sequence ID thresholds 204 | String minIdInfo = entry.getInfo("JASMINE_ID"); 205 | if(minIdInfo.length() > 0) 206 | { 207 | minSeqId = Double.parseDouble(minIdInfo); 208 | } 209 | 210 | Variant res = new Variant(sample, entry.getId(), start, end, id, seq, maxDist, minSeqId); 211 | res.hash = Variant.hash(entry.tabTokens[7]); 212 | if(Settings.OVERLAP_REQUIRED > 0 && (entry.getType().equals("DEL") || entry.getType().equals("INV") || entry.getType().equals("DUP"))) 213 | { 214 | res.interval = new double[] {entry.getPos(), entry.getEnd()}; 215 | } 216 | return res; 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /src/VariantMergeTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Class for testing variant merging 3 | */ 4 | 5 | import java.util.ArrayList; 6 | 7 | public class VariantMergeTest { 8 | public static void main(String[] args) 9 | { 10 | Variant[] data = new Variant[] { 11 | new Variant(0, "var1", 10, 5, "chr1", null), 12 | new Variant(0, "var2", 1, 5, "chr1", null), 13 | new Variant(0, "var3", 18, 5, "chr1", null), 14 | new Variant(1, "var4", 12, 7, "chr1", null), 15 | new Variant(1, "var5", 10, 5, "chr1", null), 16 | new Variant(1, "var6", 30, 30, "chr1", null), 17 | new Variant(1, "var7", 0, 0, "chr1", null), 18 | new Variant(2, "var8", 12, 12, "chr1", null), 19 | new Variant(2, "var9", 15, 15, "chr1", null), 20 | new Variant(2, "var10", 20, 20, "chr1", null), 21 | new Variant(2, "var11", 28, 28, "chr1", null), 22 | new Variant(3, "var12", 25, 25, "chr1", null), 23 | new Variant(4, "var13", 22, 22, "chr1", null) 24 | }; 25 | 26 | Settings.MAX_DIST =
5; 27 | VariantMerger vm = new VariantMerger(data); 28 | vm.runMerging(); 29 | ArrayList[] res = vm.getGroups(); 30 | 31 | System.out.println(); 32 | for(ArrayList list : res) 33 | { 34 | if(list.size() > 1) 35 | { 36 | System.out.println("Variant:"); 37 | for(Variant v : list) 38 | { 39 | System.out.println(v); 40 | } 41 | System.out.println(); 42 | } 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/VariantMerger.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Main interface for merging variants. This assumes all variants are on the 3 | * same chromosome, and have the same type/strand if that separation is desired. 4 | */ 5 | 6 | import java.util.ArrayList; 7 | import java.util.PriorityQueue; 8 | 9 | public class VariantMerger 10 | { 11 | // An array of all of the variants to be considered 12 | Variant[] data; 13 | 14 | // The number of total variants 15 | int n; 16 | 17 | // A forest in which connected components will represent merged groups 18 | Forest forest; 19 | 20 | // A KD-tree data structure for fast k-nearest-neighbors queries 21 | KDTree knn; 22 | 23 | // Indices of variants in each group, used for more advanced distance checks like clique and centroid 24 | ArrayList[] merged; 25 | 26 | @SuppressWarnings("unchecked") 27 | public VariantMerger(Variant[] data) 28 | { 29 | n = data.length; 30 | 31 | forest = new Forest(data); 32 | knn = new KDTree(data, false); 33 | 34 | for(int i = 0; i(); 43 | merged[i].add(i); 44 | } 45 | } 46 | } 47 | 48 | /* 49 | * Helper function to convert an ArrayList to an array to make the constructor more flexible 50 | */ 51 | static Variant[] listToArray(ArrayList data) 52 | { 53 | int length = data.size(); 54 | Variant[] asArray = new Variant[length]; 55 | for(int i = 0; i data) 66 | { 67 | this(listToArray(data)); 68 | } 69 | 70 | /* 71 | * Runs the core algorithm for building the implicit merging graph and 72 | * performing merging 73 | */ 74 | void runMerging() 75 | { 76 | if(n == 1) 77 | { 78 | return; 79 | } 80 | 81 | // For each variant v, how many of its nearest neighbors have had their edges 82 | // from v considered already. 83 | int[] countEdgesProcessed = new int[n]; 84 | 85 | // nearestNeighbors will be used as a cache to store the next few nearest neighbors 86 | // The purpose of this is to prevent performing a new KNN-query every time an edge 87 | // is considered, but instead a logarithmic number of times. 
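// Each variant's cache starts with its 4 nearest neighbors, and whenever the cache
// is used up it is rebuilt with twice as many neighbors, so a variant that ends up
// needing d candidate edges only issues O(log d) KD-tree queries.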
88 | Variant[][] nearestNeighbors = new Variant[n][]; 89 | 90 | // A heap of edges to be processed in non-decreasing order of distance 91 | PriorityQueue toProcess = new PriorityQueue(); 92 | 93 | // Get the first 4 nearest neighbors for every variant, and add their first edges to 94 | // the heap 95 | for(int i = 0; i maxDistAllowed + 1e-9) 135 | { 136 | valid = false; 137 | } 138 | } 139 | } 140 | } 141 | // Make sure everything being merged can be merged with their overall centroid 142 | else if(Settings.CENTROID_MERGE) 143 | { 144 | double avgStart = 0.0, avgEnd = 0.0; 145 | for(int i = 0; i= nearestNeighbors[e.from].length) 204 | { 205 | nearestNeighbors[e.from] = knn.kNearestNeighbor(data[e.from], 2 * nearestNeighbors[e.from].length); 206 | } 207 | 208 | // If we tried to get more and didn't find anymore, then we are done with this variant 209 | if(countEdgesProcessed[e.from] >= nearestNeighbors[e.from].length) 210 | { 211 | break; 212 | } 213 | Variant candidateTo = nearestNeighbors[e.from][countEdgesProcessed[e.from]]; 214 | 215 | // This edge was invalid because of distance from the query, so stop looking at any edges 216 | // since they'll only get farther away 217 | int maxDistAllowed = Math.max(data[e.from].maxDist, candidateTo.maxDist); 218 | if(Settings.REQUIRE_MUTUAL_DISTANCE) 219 | { 220 | maxDistAllowed = Math.min(data[e.from].maxDist, candidateTo.maxDist); 221 | } 222 | 223 | if(data[e.from].distance(candidateTo) > data[e.from].maxDist + 1e-9) 224 | { 225 | break; 226 | } 227 | 228 | else if(data[e.from].distance(candidateTo) > maxDistAllowed + 1e-9) 229 | { 230 | countEdgesProcessed[e.from]++; 231 | continue; 232 | } 233 | 234 | // If edge was invalid because of coming from the same sample, ignore it and try the next one 235 | else if(!Settings.ALLOW_INTRASAMPLE && data[e.from].sample == candidateTo.sample) 236 | { 237 | toProcess.add(new Edge(e.from, candidateTo.index, data[e.from].distance(candidateTo))); 238 | countEdgesProcessed[e.from]++; 239 | break; 240 | } 241 | 242 | // If sequences weren't similar enough for two insertions, ignore and try again 243 | else if(!data[e.from].passesStringSimilarity(candidateTo)) 244 | { 245 | countEdgesProcessed[e.from]++; 246 | continue; 247 | } 248 | 249 | else if(!data[e.from].passesOverlap(candidateTo)) 250 | { 251 | countEdgesProcessed[e.from]++; 252 | continue; 253 | } 254 | 255 | // The next edge is something we want to consider since it is close enough and goes to a 256 | // different sample 257 | else 258 | { 259 | toProcess.add(new Edge(e.from, candidateTo.index, data[e.from].distance(candidateTo))); 260 | countEdgesProcessed[e.from]++; 261 | break; 262 | } 263 | } 264 | } 265 | } 266 | 267 | /* 268 | * Get an array of all of the groups of variants 269 | */ 270 | @SuppressWarnings("unchecked") 271 | ArrayList[] getGroups() 272 | { 273 | ArrayList[] res = new ArrayList[n]; 274 | for(int i = 0; i(); 277 | } 278 | for(int i = 0; i 297 | { 298 | int from, to; 299 | double dist; 300 | Edge(int from, int to, double dist) 301 | { 302 | this.from = from; 303 | this.to = to; 304 | this.dist = dist; 305 | } 306 | @Override 307 | public int compareTo(Edge o) { 308 | if(Math.abs(dist - o.dist) > 1e-9) 309 | { 310 | return Double.compare(dist, o.dist); 311 | } 312 | if(data[from].hash != data[o.from].hash) return data[from].hash - (data[o.from].hash); 313 | if(data[to].hash != data[o.to].hash) return data[to].hash - (data[o.to].hash); 314 | if(from != o.from) return data[from].id.compareTo(data[o.from].id); 315 | return 
data[to].id.compareTo(data[o.to].id); 316 | } 317 | } 318 | } 319 | -------------------------------------------------------------------------------- /src/VcfHeader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * The header of a VCF file, including a list of INFO fields 3 | * The main purpose of this is to manage INFO field description lines and avoid duplicates 4 | */ 5 | 6 | import java.io.PrintWriter; 7 | import java.util.ArrayList; 8 | import java.util.HashSet; 9 | 10 | public class VcfHeader { 11 | 12 | static String infoKey = "##INFO= lines; 17 | 18 | // The names of all INFO fields present in the VCF file 19 | HashSet infoFields; 20 | HashSet formatFields; 21 | 22 | // The index of the last INFO field line, or the line before where the next INFO field should go 23 | int lastInfoFieldIndex; 24 | 25 | // The index of the last FORMAT field line, or the line before where the next FORMAT field should go 26 | int lastFormatFieldIndex; 27 | 28 | // Constructor - just initializes the data structures 29 | VcfHeader() 30 | { 31 | lines = new ArrayList(); 32 | infoFields = new HashSet(); 33 | formatFields = new HashSet(); 34 | lastInfoFieldIndex = -1; 35 | lastFormatFieldIndex = -1; 36 | } 37 | 38 | /* 39 | * Print all lines of the header 40 | */ 41 | void print(PrintWriter out) 42 | { 43 | for(int i = 0; i", id, number, type, desc); 96 | infoFields.add(id); 97 | lines.add(lastInfoFieldIndex + 1, line); 98 | lastInfoFieldIndex++; 99 | lastFormatFieldIndex++; 100 | } 101 | 102 | /* 103 | * Remove all format fields from the header 104 | */ 105 | void resetFormatFields() 106 | { 107 | formatFields = new HashSet(); 108 | ArrayList newLines = new ArrayList(); 109 | int oldIndex = lastFormatFieldIndex; 110 | lastFormatFieldIndex = -1; 111 | for(int i = 0; i", id, number, type, desc); 141 | formatFields.add(id); 142 | lines.add(lastFormatFieldIndex + 1, line); 143 | lastFormatFieldIndex++; 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/VcfHeaderTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Test to make sure VCF header is properly handling adding INFO fields and adding/resetting FORMAT fields 3 | * Output should be (in order): Test, SVTYPE, INFO1-3, FORMAT1-3, and #CHR 4 | */ 5 | import java.io.PrintWriter; 6 | 7 | public class VcfHeaderTest { 8 | public static void main(String[] args) 9 | { 10 | VcfHeader header = new VcfHeader(); 11 | header.addLine("#Test"); 12 | header.addLine("##INFO="); 13 | header.addLine("##FORMAT="); 14 | header.addLine("##CHR etc."); 15 | header.addInfoField("INFO1", "1", "String", "desc1"); 16 | header.addInfoField("INFO2", "1", "String", "desc2"); 17 | header.addFormatField("FORMAT3", "1", "String", "descf3"); 18 | header.resetFormatFields(); 19 | header.addFormatField("FORMAT1", "1", "String", "descf1"); 20 | header.addInfoField("INFO3", "1", "String", "desc3"); 21 | header.addFormatField("FORMAT2", "1", "String", "descf2"); 22 | header.addFormatField("FORMAT3", "1", "String", "descf3"); 23 | header.addFormatField("FORMAT1", "1", "String", "descf1"); 24 | PrintWriter out = new PrintWriter(System.out); 25 | header.print(out); 26 | out.close(); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/VisualizationPrep.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Given the results of both Jasmine and 
SURVIVOR, extract out shared and different merges, 3 | * producing a list of points and line segments which can be plotted to visualize the results. 4 | * 5 | * For now, this only works on datasets with two samples (i.e., two VCFs input to the merging software). 6 | */ 7 | import java.io.File; 8 | import java.io.FileInputStream; 9 | import java.io.PrintWriter; 10 | import java.util.ArrayList; 11 | import java.util.HashMap; 12 | import java.util.Scanner; 13 | import java.util.TreeSet; 14 | public class VisualizationPrep { 15 | 16 | // Empty string if whole genome or chromosome name for plotting that chromosome 17 | static String chrToPlot = "1"; 18 | 19 | // True iff we did one of the two with the samples in reverse order 20 | static boolean secondRev = false; 21 | 22 | // Whether or not each file was produced by SURVIVOR and needs to be parsed differently 23 | static boolean firstSurvivor = true; 24 | static boolean secondSurvivor = false; 25 | 26 | // Whether or not to print merges unique to one output file 27 | static boolean printUnique = false; 28 | 29 | static int sampleCount = 0; 30 | 31 | @SuppressWarnings("unchecked") 32 | public static void main(String[] args) throws Exception 33 | { 34 | // File containing a list of VCF files 35 | String fileList = "/home/mkirsche/eichler/filelist.txt"; 36 | 37 | // The resulting merged VCF files from both Jasmine and SURVIVOR 38 | String firstOutput = "/home/mkirsche/eichler/survmerged.vcf"; 39 | String secondOutput = "/home/mkirsche/eichler/merged.vcf"; 40 | 41 | // Get the list of VCF files 42 | ArrayList vcfsAsList = PipelineManager.getFilesFromList(fileList); 43 | String[] vcfs = new String[vcfsAsList.size()]; 44 | for(int i = 0; i[] positions = new ArrayList[vcfs.length]; 64 | ArrayList[] colors = new ArrayList[vcfs.length]; 65 | int[] colorCounts = new int[4]; 66 | HashMap[] idToEntry = new HashMap[vcfs.length]; 67 | 68 | // Hard-code the colors of the common variant types, so we know which color is which in downstream plotting 69 | // There may be other colors in the case of other types. 
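// Any additional type encountered later is assigned the next unused color index on the fly.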
70 | HashMap typeToInt = new HashMap(); 71 | typeToInt.put("INS", 0); 72 | typeToInt.put("DEL", 1); 73 | typeToInt.put("DUP", 2); 74 | typeToInt.put("INV", 3); 75 | 76 | // Iterate over input VCF files to record entries 77 | for(int i = 0; i(); 80 | colors[i] = new ArrayList(); 81 | idToEntry[i] = new HashMap(); 82 | Scanner input = new Scanner(new FileInputStream(new File(vcfs[i]))); 83 | 84 | // Read entries one at a time and store information about them 85 | while(input.hasNext()) 86 | { 87 | String line = input.nextLine(); 88 | if(line.length() == 0 || line.startsWith("#")) 89 | { 90 | continue; 91 | } 92 | VcfEntry entry = new VcfEntry(line); 93 | if(chrToPlot.length() > 0 && !entry.getChromosome().equals(chrToPlot)) continue; 94 | 95 | // Below is an example of how to restrict the plot to certain positions 96 | if(entry.getPos() > 10000000) continue; 97 | 98 | int pos = (int)entry.getPos(); 99 | positions[i].add(pos); 100 | idToEntry[i].put(entry.getId(), entry); 101 | if(!typeToInt.containsKey(entry.getType())) 102 | { 103 | typeToInt.put(entry.getType(), typeToInt.size()); 104 | } 105 | colors[i].add(typeToInt.get(entry.getType())); 106 | } 107 | input.close(); 108 | } 109 | 110 | // We now have the variant points, so output those for plotting 111 | for(int i = 0; i firstEdges = getJoinedPairs(firstOutput, firstSurvivor, false); 122 | System.out.println("Merges in first output: " + firstEdges.size()); 123 | TreeSet secondEdges = getJoinedPairs(secondOutput, secondSurvivor, secondRev); 124 | System.out.println("Merges in second output: " + secondEdges.size()); 125 | 126 | // Store the union of the merge-sets so we get every line segment 127 | TreeSet union = new TreeSet(); 128 | for(Merge s : secondEdges) union.add(s); 129 | for(Merge s : firstEdges) union.add(s); 130 | 131 | // For each line segment, color it based on which output it came from (possibly both) 132 | for(Merge edge : union) 133 | { 134 | String[] ids = new String[] {edge.id1, edge.id2}; 135 | int[] samples = new int[] {edge.sample1, edge.sample2}; 136 | boolean okay = true; 137 | int[] curPositions = new int[2]; 138 | for(int i = 0; i<2; i++) 139 | { 140 | if(idToEntry[samples[i]].containsKey(ids[i])) 141 | { 142 | curPositions[i] = (int)idToEntry[samples[i]].get(ids[i]).getPos(); 143 | } 144 | else 145 | { 146 | okay = false; 147 | break; 148 | } 149 | } 150 | if(!okay) 151 | { 152 | continue; 153 | } 154 | int color = 0; 155 | if(secondEdges.contains(edge)) color |= 2; 156 | if(firstEdges.contains(edge)) color |= 1; 157 | 158 | String firstSoftware = firstSurvivor ? "survivor" : "Jasmine"; 159 | String secondSoftware = secondSurvivor ? "survivor" : "Jasmine"; 160 | if(secondRev) secondSoftware += "rev"; 161 | 162 | colorCounts[color]++; 163 | 164 | // If the pair was only merged by one software, print out information about it 165 | if(color == 1 || color == 2) 166 | { 167 | VcfEntry first = idToEntry[samples[0]].get(ids[0]); 168 | VcfEntry second = idToEntry[samples[1]].get(ids[1]); 169 | 170 | if(printUnique) 171 | { 172 | System.out.println("Merge unique to " + (color == 1 ? 
firstSoftware : secondSoftware)); 173 | System.out.println(" " + ids[0] + " " + first.getType() + " " + first.getStrand() + " at " + first.getPos() + " (length " + first.getLength() + ")"); 174 | System.out.println(" " + ids[1] + " " + second.getType() + " " + second.getStrand() + " at " + second.getPos() + " (length " + second.getLength() + ")"); 175 | System.out.println(" " + edge.line); 176 | System.out.println(" Samples: " + edge.sample1 + " " + edge.sample2); 177 | Variant a = VariantInput.fromVcfEntry(first, 0), b = VariantInput.fromVcfEntry(second, 0); 178 | System.out.println(" Distance according to Jasmine: " + a.distance(b)); 179 | } 180 | } 181 | 182 | // Print the line segment 183 | out.println(curPositions[0]+" "+ys[edge.sample1]+" "+curPositions[1]+" "+ys[edge.sample2]+" "+color); 184 | } 185 | System.out.println("First output unique merges: " + colorCounts[1]); 186 | System.out.println("Second output unique merges: " + colorCounts[2]); 187 | System.out.println("Shared merges: " + colorCounts[3]); 188 | out.close(); 189 | 190 | 191 | } 192 | 193 | /* 194 | * For a given merged VCF file, get the list of all pairs of variants which were joined 195 | * For now, assumes only 2 samples, and the survivor flag is true if SURVIVOR was used 196 | * and false if Jasmine was used instead. 197 | */ 198 | static TreeSet getJoinedPairs(String fn, boolean survivor, boolean rev) throws Exception 199 | { 200 | Scanner input = new Scanner(new FileInputStream(new File(fn))); 201 | TreeSet res = new TreeSet(); 202 | while(input.hasNext()) 203 | { 204 | String line = input.nextLine(); 205 | if(line.length() == 0 || line.startsWith("#")) continue; 206 | VcfEntry entry = new VcfEntry(line); 207 | if(chrToPlot.length() > 0 && !entry.getChromosome().equals(chrToPlot)) continue; 208 | String supportVector = entry.getInfo("SUPP_VEC"); 209 | ArrayList samples = new ArrayList(); 210 | for(int i = 0; i ids = new ArrayList(); 221 | for(int i = 9; i 260 | { 261 | String id1, id2; 262 | int sample1, sample2; 263 | String line; 264 | Merge(String ii1, String ii2, int ss1, int ss2, String ll) 265 | { 266 | line = ll; 267 | id1 = ii1; 268 | id2 = ii2; 269 | sample1 = ss1; 270 | sample2 = ss2; 271 | } 272 | @Override 273 | public int compareTo(Merge o) { 274 | if(sample1 != o.sample1) 275 | { 276 | return sample1 - o.sample1; 277 | } 278 | if(sample2 != o.sample2) 279 | { 280 | return sample2 - o.sample2; 281 | } 282 | if(!id1.equals(o.id1)) 283 | { 284 | return id1.compareTo(o.id1); 285 | } 286 | return id2.compareTo(o.id2); 287 | } 288 | } 289 | } 290 | -------------------------------------------------------------------------------- /test_data/a.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##contig= 3 | ##contig= 4 | ##INFO= 5 | ##INFO= 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##FORMAT= 13 | 1 1000000 1 CACGTACGTACGTACGTACGTACGTACTGACGTACGT C . PASS PRECISE;CHR2=1;END=1000036;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-36;STRANDS=+-;RE=12;IS_SPECIFIC=1 GT 1/1 14 | 1 1999960 2 C CACGTACGTACGTACGTACGTACGTACTGACGTACGT . PASS PRECISE;CHR2=1;END=1999960;SVTYPE=INS;SUPTYPE=AL;SVLEN=36;STRANDS=+-;RE=10;IS_SPECIFIC=1 GT 1/1 15 | 1 3000000 3 N ]2:1000000]N . PASS PRECISE;CHR2=2;END=3000000;SVTYPE=BND;SUPTYPE=AL;SVLEN=1;STRANDS=-+;RE=15;IS_SPECIFIC=1 GT 1/1 16 | 1 4000200 4 CACGTACGTACGTACGTACGTACGTACTGACGT C . 
PASS PRECISE;CHR2=1;END=4000232;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;RE=12;IS_SPECIFIC=1 GT 1/1 17 | 18 | -------------------------------------------------------------------------------- /test_data/b.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##contig= 3 | ##contig= 4 | ##INFO= 5 | ##INFO= 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##FORMAT= 13 | 1 1000000 1 CACGTACGTACGTACGTACGTACGTACTGACGT C . PASS PRECISE;CHR2=1;END=1000032;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;RE=12;IS_SPECIFIC=1 GT 1/1 14 | 1 2000045 2 C CACGTACGTACGTACGTACGTACGTACTGACGTACGTACGT . PASS PRECISE;CHR2=1;END=2000045;SVTYPE=INS;SUPTYPE=AL;SVLEN=40;STRANDS=+-;RE=10;IS_SPECIFIC=1 GT 1/1 15 | 1 3000000 3 N ]2:1000000]N . PASS PRECISE;CHR2=2;END=3000000;SVTYPE=BND;SUPTYPE=AL;SVLEN=1;STRANDS=-+;RE=15;IS_SPECIFIC=1 GT 1/1 16 | 1 4000000 4 CACGTACGTACGTACGTACGTACGTACTGACGT C . PASS PRECISE;CHR2=1;END=4000032;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;RE=12;IS_SPECIFIC=1 GT 1/1 17 | 18 | -------------------------------------------------------------------------------- /test_data/c.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##contig= 3 | ##contig= 4 | ##INFO= 5 | ##INFO= 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##INFO= 15 | ##INFO= 16 | ##INFO= 17 | ##INFO= 18 | ##INFO= 19 | ##INFO= 20 | ##INFO= 21 | ##INFO= 22 | ##INFO= 23 | ##INFO= 24 | ##FORMAT= 25 | 1 4000200 0_4 CACGTACGTACGTACGTACGTACGTACTGACGT C . PASS PRECISE;CHR2=1;END=4000232;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;RE=12;IS_SPECIFIC=1;STARTVARIANCE=0.000000;ENDVARIANCE=0.000000;AVG_LEN=-32.000000;AVG_START=4000200.000000;AVG_END=4000232.000000;SUPP_VEC_EXT=10;IDLIST_EXT=4;SUPP_EXT=1;SUPP_VEC=10;SUPP=1;SVMETHOD=JASMINE;IDLIST=4 GT 1/1 26 | 1 1000000 0_1 CACGTACGTACGTACGTACGTACGTACTGACGTACGT C . PASS PRECISE;CHR2=1;END=1000036;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-36;STRANDS=+-;RE=12;IS_SPECIFIC=1;STARTVARIANCE=0.000000;ENDVARIANCE=4.000000;AVG_LEN=-34.000000;AVG_START=1000000.000000;AVG_END=1000034.000000;SUPP_VEC_EXT=11;IDLIST_EXT=1,1;SUPP_EXT=2;SUPP_VEC=11;SUPP=2;SVMETHOD=JASMINE;IDLIST=1,1 GT 1/1 27 | 1 1999960 0_2 C CACGTACGTACGTACGTACGTACGTACTGACGTACGT . PASS PRECISE;CHR2=1;END=1999960;SVTYPE=INS;SUPTYPE=AL;SVLEN=36;STRANDS=+-;RE=10;IS_SPECIFIC=1;STARTVARIANCE=1806.250000;ENDVARIANCE=1806.250000;AVG_LEN=38.000000;AVG_START=2000002.500000;AVG_END=2000002.500000;SUPP_VEC_EXT=11;IDLIST_EXT=2,2;SUPP_EXT=2;SUPP_VEC=11;SUPP=2;SVMETHOD=JASMINE;IDLIST=2,2 GT 1/1 28 | 1 3000000 0_3 N ]2:1000000]N . PASS PRECISE;CHR2=2;END=3000000;SVTYPE=BND;SUPTYPE=AL;SVLEN=1;STRANDS=-+;RE=15;IS_SPECIFIC=1;STARTVARIANCE=0.000000;ENDVARIANCE=0.000000;AVG_LEN=1.000000;AVG_START=3000000.000000;AVG_END=3000000.000000;SUPP_VEC_EXT=11;IDLIST_EXT=3,3;SUPP_EXT=2;SUPP_VEC=11;SUPP=2;SVMETHOD=JASMINE;IDLIST=3,3 GT 1/1 29 | 1 4000000 1_4 CACGTACGTACGTACGTACGTACGTACTGACGT C . PASS PRECISE;CHR2=1;END=4000032;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;RE=12;IS_SPECIFIC=1;STARTVARIANCE=0.000000;ENDVARIANCE=0.000000;AVG_LEN=-32.000000;AVG_START=4000000.000000;AVG_END=4000032.000000;SUPP_VEC_EXT=01;IDLIST_EXT=4;SUPP_EXT=1;SUPP_VEC=01;SUPP=1;SVMETHOD=JASMINE;IDLIST=4 GT 1/1 30 | --------------------------------------------------------------------------------
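Note on the test data: c.vcf is the expected result of merging a.vcf and b.vcf with Jasmine. Variants 1, 2, and 3 pair up across the two samples (SUPP_VEC=11), while the two nearby deletions at positions 4000200 and 4000000 stay separate (SUPP_VEC=10 and 01) because their breakpoints are 200bp apart, beyond the merging distance used. A minimal sketch of reproducing this, assuming the jasmine wrapper script from the repository root is on the PATH and using an illustrative file list name:

  ls test_data/a.vcf test_data/b.vcf > filelist.txt
  jasmine file_list=filelist.txt out_file=merged.vcf

smalltest.sh in the repository root runs an equivalent end-to-end check.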