├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── build.sh ├── build_jar.sh ├── build_no_iris.sh ├── igv_jasmine ├── jasmine ├── jasmine.jar ├── jasmine_igv.jar ├── jasmine_iris.jar ├── jasmine_split.jar ├── pipeline ├── README.md ├── align_single.snakefile ├── call_svs_sniffles_single.snakefile ├── cut_regions_bam.snakefile ├── data.yaml ├── fix_sam.py ├── jasmine_pre.snakefile ├── main_chr_filter.py ├── marcc_config.yaml ├── pipeline.snakefile ├── pipelineoverview.svg ├── rockfish_config.yaml ├── sv_sizes.py ├── sv_supports.py ├── tools.yaml └── utils.py ├── plot_merges.py ├── run.sh ├── smalltest.sh ├── split_jasmine ├── src ├── AddGenotypes.java ├── BndVcfEntry.java ├── ChrNameNormalization.java ├── DuplicationsToInsertions.java ├── Forest.java ├── GenomeQuery.java ├── IgvScreenshotMaker.java ├── InsertionsToDuplications.java ├── KDTree.java ├── Main.java ├── MarkSpecificCalls.java ├── NormalizeTypes.java ├── Overlap.java ├── ParallelMerger.java ├── PipelineManager.java ├── PreSplit.java ├── Settings.java ├── StringUtils.java ├── TestKDTree.java ├── Variant.java ├── VariantInput.java ├── VariantMergeTest.java ├── VariantMerger.java ├── VariantOutput.java ├── VcfEntry.java ├── VcfHeader.java ├── VcfHeaderTest.java └── VisualizationPrep.java └── test_data ├── a.vcf ├── b.vcf └── c.vcf /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | .classpath 3 | .project 4 | .settings/ 5 | *.class 6 | *.vcf 7 | *.vcf.graph 8 | *.bed 9 | *.txt 10 | # IDEA setup 11 | .idea/ 12 | *.iml 13 | src/FixStrands.java 14 | data/ 15 | output/ 16 | out.log 17 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "Iris"] 2 | path = Iris 3 | url = https://github.com/mkirsche/Iris.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Melanie Kirsche 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/jasminesv/README.html) 2 | [![European Galaxy server](https://img.shields.io/badge/usegalaxy-.eu-brightgreen?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAASCAYAAABB7B6eAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAACXBIWXMAAAsTAAALEwEAmpwYAAACC2lUWHRYTUw6Y29tLmFkb2JlLnhtcAAAAAAAPHg6eG1wbWV0YSB4bWxuczp4PSJhZG9iZTpuczptZXRhLyIgeDp4bXB0az0iWE1QIENvcmUgNS40LjAiPgogICA8cmRmOlJERiB4bWxuczpyZGY9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyMiPgogICAgICA8cmRmOkRlc2NyaXB0aW9uIHJkZjphYm91dD0iIgogICAgICAgICAgICB4bWxuczp0aWZmPSJodHRwOi8vbnMuYWRvYmUuY29tL3RpZmYvMS4wLyI+CiAgICAgICAgIDx0aWZmOlJlc29sdXRpb25Vbml0PjI8L3RpZmY6UmVzb2x1dGlvblVuaXQ+CiAgICAgICAgIDx0aWZmOkNvbXByZXNzaW9uPjE8L3RpZmY6Q29tcHJlc3Npb24+CiAgICAgICAgIDx0aWZmOk9yaWVudGF0aW9uPjE8L3RpZmY6T3JpZW50YXRpb24+CiAgICAgICAgIDx0aWZmOlBob3RvbWV0cmljSW50ZXJwcmV0YXRpb24+MjwvdGlmZjpQaG90b21ldHJpY0ludGVycHJldGF0aW9uPgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KD0UqkwAAAn9JREFUOBGlVEuLE0EQruqZiftwDz4QYT1IYM8eFkHFw/4HYX+GB3/B4l/YP+CP8OBNTwpCwFMQXAQPKtnsg5nJZpKdni6/6kzHvAYDFtRUT71f3UwAEbkLch9ogQxcBwRKMfAnM1/CBwgrbxkgPAYqlBOy1jfovlaPsEiWPROZmqmZKKzOYCJb/AbdYLso9/9B6GppBRqCrjSYYaquZq20EUKAzVpjo1FzWRDVrNay6C/HDxT92wXrAVCH3ASqq5VqEtv1WZ13Mdwf8LFyyKECNbgHHAObWhScf4Wnj9CbQpPzWYU3UFoX3qkhlG8AY2BTQt5/EA7qaEPQsgGLWied0A8VKrHAsCC1eJ6EFoUd1v6GoPOaRAtDPViUr/wPzkIFV9AaAZGtYB568VyJfijV+ZBzlVZJ3W7XHB2RESGe4opXIGzRTdjcAupOK09RA6kzr1NTrTj7V1ugM4VgPGWEw+e39CxO6JUw5XhhKihmaDacU2GiR0Ohcc4cZ+Kq3AjlEnEeRSazLs6/9b/kh4eTC+hngE3QQD7Yyclxsrf3cpxsPXn+cFdenF9aqlBXMXaDiEyfyfawBz2RqC/O9WF1ysacOpytlUSoqNrtfbS642+4D4CS9V3xb4u8P/ACI4O810efRu6KsC0QnjHJGaq4IOGUjWTo/YDZDB3xSIxcGyNlWcTucb4T3in/3IaueNrZyX0lGOrWndstOr+w21UlVFokILjJLFhPukbVY8OmwNQ3nZgNJNmKDccusSb4UIe+gtkI+9/bSLJDjqn763f5CQ5TLApmICkqwR0QnUPKZFIUnoozWcQuRbC0Km02knj0tPYx63furGs3x/iPnz83zJDVNtdP3QAAAABJRU5ErkJggg==)](https://usegalaxy.eu/root?tool_id=jasminesv) 3 | 4 | 5 | 6 | 7 | # Jasmine 8 | 9 | JASMINE: Jointly Accurate Sv Merging with Intersample Network Edges 10 | 11 | Version 1.1.5 12 | 13 | This tool is used to merge structural variants (SVs) across samples. Each sample has a number of SV calls, consisting of position information (chromosome, start, end, length), type and strand information, and a number of other values. Jasmine represents the set of all SVs across samples as a network, and uses a modified minimum spanning forest algorithm to determine the best way of merging the variants such that each merged variant represents a set of analogous variants occurring in different samples. 14 | 15 | 16 | ## Conda Installation 17 | 18 | The recommended installation method is through [bioconda](https://bioconda.github.io/). 19 | 20 | Conda installation command (typically takes under a minute to install): 21 | 22 | ``` 23 | conda config --add channels bioconda 24 | conda config --add channels conda-forge 25 | conda install jasminesv 26 | ``` 27 | 28 | 29 | ## Instructions for building from source 30 | 31 | When running Jasmine, one of the preprocessing options is to run Iris, a tool which refines the sequences and breakpoints of insertions in datasets with high-error reads.
Iris depends on samtools, minimap2, and racon by default, which can be installed separately and either added to your path or pointed to with the `iris_args` parameter. Once these dependencies are installed (or if running Jasmine without Iris preprocessing), Jasmine can be built with the following command: 32 | 33 | ``` 34 | path_to_jasmine_repo/build_jar.sh 35 | ``` 36 | 37 | 38 | ## Instructions for running 39 | 40 | After building the jar file, Jasmine can be run with the executable file `jasmine`, which will be in the main folder of this repository if building from source, or in the condabin folder if installed through conda. Running it with no parameters will print a usage menu describing the required and optional arguments. 41 | 42 | 43 | ## Demo Dataset 44 | To run Jasmine on HiFi data from the HG002 trio, run the following commands (typically takes about a minute to download and under five minutes to run on a modern desktop): 45 | ``` 46 | wget http://data.schatz-lab.org/jasmine/HG002Trio/UnmergedVCFs/HG002vGRCh38_wm_50md_PBCCS_sniffles.s2l20.refined.nSVtypes.ism.vcf.gz 47 | wget http://data.schatz-lab.org/jasmine/HG002Trio/UnmergedVCFs/HG003vGRCh38_wm_50md_PBCCS_sniffles.s2l20.refined.nSVtypes.ism.vcf.gz 48 | wget http://data.schatz-lab.org/jasmine/HG002Trio/UnmergedVCFs/HG004vGRCh38_wm_50md_PBCCS_sniffles.s2l20.refined.nSVtypes.ism.vcf.gz 49 | wget http://data.schatz-lab.org/jasmine/HG002Trio/HG002Trio_HiFi.merged.vcf.gz 50 | gunzip * 51 | ls *vGRCh38_wm_50md_PBCCS_sniffles.s2l20.refined.nSVtypes.ism.vcf > filelist.txt 52 | jasmine file_list=filelist.txt out_file=merged.vcf 53 | jasmine --dup_to_ins --postprocess_only out_file=merged.vcf 54 | ``` 55 | 56 | The output file merged.vcf should then exactly match the contents of HG002Trio_HiFi.merged.vcf. 57 | 58 | 59 | ## Optimized SV Inference Pipeline 60 | 61 | Jasmine is offered as standalone software and will accurately merge SV calls from any SV caller, including short-read callers. However, for best results when calling SVs from genomic long reads (PacBio CLR, PacBio HiFi, or Oxford Nanopore), we recommend using the following optimized pipeline to obtain population-scale SV calls from FASTQ files. This pipeline is provided as a [Snakemake pipeline](https://github.com/mkirsche/Jasmine/tree/master/pipeline). 62 | 63 | 64 | ![Jasmine SV Inference Pipeline](https://github.com/mkirsche/Jasmine/blob/master/pipeline/pipelineoverview.svg) 65 | 66 | 67 | 68 | ## IGV visualization module 69 | 70 | Jasmine also includes a module for automating the creation of [IGV](http://software.broadinstitute.org/software/igv/) screenshots of variants of interest. It can be run through the `igv_jasmine` executable file. Running it with no parameters will print a usage menu describing the required and optional arguments; at minimum it requires the following: 71 | - BAM files from which variants were called in each sample 72 | - The reference genome 73 | - The merged VCF file, or a BED file with regions of interest 74 | 75 | Running this module creates a folder which will store IGV screenshots for each variant (optionally filtered based on the command line parameters), and populates that folder with a .bat file, a script which can be run through IGV by selecting Tools -> Run Batch Script and navigating to the file. After running this script, the folder containing the .bat file will also include images of the regions surrounding each variant of interest.
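For reference, an IGV batch script is just a plain-text list of line-based IGV commands (`new`, `genome`, `load`, `snapshotDirectory`, `goto`, `snapshot`). The Python sketch below, using hypothetical file names and a deliberately simplified subset of the options the module supports, illustrates how such a .bat file can be generated for a few regions of interest:

```
# Minimal sketch of writing an IGV batch script (hypothetical inputs;
# simplified relative to what IgvScreenshotMaker actually emits).
regions = [("chr1", 1000000, 1000500), ("chr2", 5000000, 5002000)]
bams = ["HG002.bam", "HG003.bam"]  # one BAM per sample
padding = 100  # bases of context shown around each variant

with open("igv_screenshots.bat", "w") as bat:
    print("new", file=bat)                           # reset the IGV session
    print("genome GRCh38.fa", file=bat)              # load the reference genome
    for bam in bams:
        print(f"load {bam}", file=bat)               # one alignment track per sample
    print("snapshotDirectory igv_screenshots", file=bat)
    for chrom, start, end in regions:
        print(f"goto {chrom}:{start - padding}-{end + padding}", file=bat)
        print(f"snapshot {chrom}_{start}_{end}.png", file=bat)
```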
76 | 77 | 78 | ## User Manual 79 | 80 | The user manual with detailed information about input/output files and command line arguments can be found here: https://github.com/mkirsche/Jasmine/wiki/Jasmine-User-Manual 81 | 82 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | if [ "$(uname -s)" = 'Linux' ]; then 2 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 3 | else 4 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 5 | fi 6 | 7 | WORKINGDIR=`pwd` 8 | 9 | cd $BINDIR 10 | git submodule update --init --recursive 11 | cd $WORKINGDIR 12 | $BINDIR/Iris/build.sh 13 | $BINDIR/Iris/rebuild_default_external.sh 14 | 15 | javac -cp $BINDIR/Iris/src $BINDIR/src/*.java 16 | -------------------------------------------------------------------------------- /build_jar.sh: -------------------------------------------------------------------------------- 1 | if [ "$(uname -s)" = 'Linux' ]; then 2 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 3 | else 4 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 5 | fi 6 | 7 | git submodule update $BINDIR/Iris 8 | 9 | irisjar='' 10 | if [ $# -eq 1 ] 11 | then 12 | echo "Iris jar: " $1 13 | irisjar=$1 14 | else 15 | $BINDIR/Iris/build_jar.sh 16 | irisjar=$BINDIR/Iris/iris.jar 17 | fi 18 | 19 | cp $irisjar $BINDIR/jasmine_iris.jar 20 | 21 | cd $BINDIR/src 22 | javac -cp $BINDIR/jasmine_iris.jar *.java 23 | jar -c -e Main -f jasmine.jar *.class 24 | jar -c -e PreSplit -f jasmine_split.jar *.class 25 | jar -c -e IgvScreenshotMaker -f jasmine_igv.jar *.class 26 | mv jasmine.jar $BINDIR 27 | mv jasmine_igv.jar $BINDIR 28 | mv jasmine_split.jar $BINDIR 29 | cd $BINDIR 30 | -------------------------------------------------------------------------------- /build_no_iris.sh: -------------------------------------------------------------------------------- 1 | if [ "$(uname -s)" = 'Linux' ]; then 2 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 3 | else 4 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 5 | fi 6 | 7 | if [ ! 
-d $BINDIR/Iris/src ] 8 | then 9 | git submodule update --init --remote Iris 10 | fi 11 | 12 | javac -cp $BINDIR/Iris/src $BINDIR/src/*.java 13 | 14 | -------------------------------------------------------------------------------- /igv_jasmine: -------------------------------------------------------------------------------- 1 | # Script for running Jasmine's IgvScreenshotMaker 2 | if [ "$(uname -s)" = 'Linux' ]; then 3 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 4 | else 5 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 6 | fi 7 | 8 | java -jar $BINDIR/jasmine_igv.jar "${@:1}" 9 | 10 | -------------------------------------------------------------------------------- /jasmine: -------------------------------------------------------------------------------- 1 | # Script for running Jasmine 2 | if [ "$(uname -s)" = 'Linux' ]; then 3 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 4 | else 5 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 6 | fi 7 | 8 | java -cp $BINDIR/jasmine_iris.jar:$BINDIR/jasmine.jar Main iris_args=samtools_path=samtools,racon_path=racon,minimap_path=minimap2 "${@:1}" 9 | 10 | -------------------------------------------------------------------------------- /jasmine.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkirsche/Jasmine/a4f6988eeb191c1f8cf2e8d73869b6a5ffc14665/jasmine.jar -------------------------------------------------------------------------------- /jasmine_igv.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkirsche/Jasmine/a4f6988eeb191c1f8cf2e8d73869b6a5ffc14665/jasmine_igv.jar -------------------------------------------------------------------------------- /jasmine_iris.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkirsche/Jasmine/a4f6988eeb191c1f8cf2e8d73869b6a5ffc14665/jasmine_iris.jar -------------------------------------------------------------------------------- /jasmine_split.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mkirsche/Jasmine/a4f6988eeb191c1f8cf2e8d73869b6a5ffc14665/jasmine_split.jar -------------------------------------------------------------------------------- /pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Automated pipeline for alignment and SV calling in long-read datasets 2 | 3 | ## Installation 4 | Ensure that `snakemake` is installed. 5 | Clone the repository via `git clone https://github.com/mkirsche/Jasmine.git` into a general location (e.g., `/path/to/pipelines`) on a computing cluster. 6 | 7 | ## Experiment run 8 | Choose and `cd` into a dedicated experiment folder (e.g., `experiment`). 9 | Create symlinks to all `snakefile` and `py` files from the `Jasmine/pipeline` master folder: 10 | ```bash 11 | ln -s /path/to/pipelines/Jasmine/pipeline/*snakefile . 12 | ln -s /path/to/pipelines/Jasmine/pipeline/*py . 13 | ``` 14 | Copy configuration `yaml` files: 15 | ```bash 16 | cp /path/to/pipelines/Jasmine/pipeline/*yaml . 17 | ``` 18 | Set up the targeted dataset inside the `data.yaml` file.
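Before launching the pipeline, it can be useful to sanity-check the configuration. The following is a minimal sketch (assuming PyYAML is installed; the checks in the pipeline's `utils.py` are the authoritative ones and also accept additional PacBio spellings such as `PACBIO` or `PBCCS`):

```python
# Quick sanity check of data.yaml before running snakemake.
# Assumes PyYAML; loosely mirrors the validation in pipeline/utils.py.
import yaml

with open("data.yaml") as f:
    config = yaml.safe_load(f)

assert isinstance(config.get("samples"), list) and config["samples"], "at least one sample is required"
assert "ref" in config, "a reference fasta must be specified under the 'ref' key"
for sample in config["samples"]:
    assert sample.get("tech", "").upper() in {"ONT", "PB"}, f"unsupported tech for sample {sample.get('sample')}"
    for path in sample.get("reads_paths", []):
        print(f"{sample['sample']}: {path}")
```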
19 | 20 | Run snakemake (dry run) to preview the automated pipeline: 21 | ```bash 22 | snakemake -s pipeline.snakefile -npr 23 | ``` 24 | If the dry run produces satisfactory results, run `snakemake` with production settings, including a possible cluster setup, multithreading, etc. 25 | 26 | ### Example SLURM run 27 | ```bash 28 | snakemake -s pipeline.snakefile --latency-wait 200 -pr -j 20 --rerun-incomplete --cluster "sbatch --account={cluster.account} --partition={cluster.partition} --job-name={cluster.name} --nodes={cluster.nodes} --cpus-per-task={cluster.nCPUs} --time={cluster.time} --out={cluster.out} --err={cluster.err} --mem={cluster.mem_mb}M" 29 | ``` 30 | which ensures that 31 | * no more than 20 jobs (`-j`) are submitted at a time 32 | * any incomplete results from possible previous failed runs are regenerated (`--rerun-incomplete`) 33 | * a SLURM submission setup is used, which in turn requests a single node of the `parallel` partition per job with: 34 | * a 3-day time limit 35 | * 24G of RAM per node 36 | 37 | ## Pipeline Overview 38 | 39 | 1. Align reads in each sample with [Winnowmap](https://github.com/marbl/Winnowmap) using the recommended parameters for the type of sequencing data being used 40 | 2. Call SVs in each sample with [sniffles](https://github.com/fritzsedlazeck/Sniffles) using sensitive parameters and reporting all supporting reads: `sniffles -m <bam> -v <vcf> --threads <threads> --min_support 2 --max_distance 50 --min_length 20 --num_reads_report -1` 41 | 3. Convert duplications to insertions temporarily for breakpoint refinement and better cross-sample comparison: `jasmine --dup_to_ins --preprocess_only vcf_filelist=<comma-separated vcf list> --comma_filelist` 42 | 4. Refine SVs in each sample with [Iris](https://github.com/mkirsche/Iris/) 43 | 5. Normalize SV types in each sample: `jasmine --preprocess_only --pre_normalize --comma_filelist file_list=<comma-separated vcf list>` 44 | 6. Mark the high-confidence (high-specificity) callset in each sample: `jasmine file_list=<comma-separated vcf list> --comma_filelist --preprocess_only --mark_specific spec_reads=<min support count> spec_len=30` 45 | 7. Remove duplicate calls in each sample: `jasmine file_list=<comma-separated vcf list> max_dist=200 --allow_intrasample out_file=<output vcf> --comma_filelist --nonlinear_dist` 46 | 8. Generate a list of all finalized per-sample VCF files (txt file, one per line) 47 | 9. Merge SVs across samples: `jasmine file_list=<vcf list file> out_file=<merged vcf>` 48 | 10. Convert insertions back to duplications: `jasmine --dup_to_ins --postprocess_only out_file=<merged vcf>` 49 | 11.
Remove low-confidence or imprecise calls: `cat <merged vcf> | grep -v 'IMPRECISE;' | grep -v 'IS_SPECIFIC=0'` 50 | -------------------------------------------------------------------------------- /pipeline/call_svs_sniffles_single.snakefile: -------------------------------------------------------------------------------- 1 | import os 2 | import utils 3 | 4 | if os.path.exists("data.yaml"): 5 | configfile: "data.yaml" 6 | if os.path.exists("tools.yaml"): 7 | configfile: "tools.yaml" 8 | 9 | output_dir = config.get(utils.OUTPUT_DIR, "") 10 | alignment_output_dir = os.path.join(output_dir, utils.ALIGNMENTS) 11 | svs_output_dir = os.path.join(output_dir, utils.SVS) 12 | raw_svs_output_dir = os.path.join(svs_output_dir, utils.RAW) 13 | 14 | utils.ensure_samples_correctness(config) 15 | sample_to_reads_paths = utils.get_samples_to_reads_paths(config) 16 | utils.ensure_ref_correctness(config) 17 | 18 | sniffles_sens_suffix = utils.get_sniffles_sens_suffix(config) 19 | samples_regex = utils.get_samples_regex(sample_to_reads_paths) 20 | 21 | sniffles_config = config.get(utils.TOOLS, {}).get(utils.SNIFFLES, {}) 22 | jasmine_config=config.get(utils.TOOLS, {}).get(utils.JASMINE, {}) 23 | iris_config=config.get(utils.TOOLS, {}).get(utils.IRIS, {}) 24 | tech_regex = utils.get_tech_regex(config) 25 | java_config=config.get(utils.TOOLS, {}).get(utils.JAVA, {}) 26 | sv_sizes_config=config.get(utils.TOOLS, {}).get(utils.SV_SIZES, {}) 27 | 28 | rule raw_sv_tally: 29 | input: os.path.join(raw_svs_output_dir, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + "{file_suffix}") 30 | output: os.path.join(raw_svs_output_dir, utils.STATS, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + "{file_suffix}.stats.sizes.txt") 31 | log: os.path.join(raw_svs_output_dir, utils.LOG, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + "{file_suffix}.stats.sizes.txt.log") 32 | resources: 33 | mem_mb=utils.DEFAULT_CLUSTER_MEM_MB 34 | params: 35 | python=config.get(utils.TOOLS, {}).get(utils.PYTHON, {}).get(utils.PATH, "python3"), 36 | script_path=sv_sizes_config.get(utils.PATH, "sv_sizes.py"), 37 | bins=sv_sizes_config.get(utils.BINS, "1,30,50,100,150,200,350,300,500,750,1000,2000,5000,10000,50000,100000,500000"), 38 | types=sv_sizes_config.get(utils.TYPES, "INS,DEL,DUP,INV,TRA"), 39 | abs_length="" if sv_sizes_config.get(utils.ABS_LENGTH, True) else "--no-abs-length", 40 | info_len_field=sv_sizes_config.get(utils.INFO_LENGTH_FIELD, "SVLEN") 41 | shell: 42 | "{params.python} {params.script_path} {input} -o {output} --bins {params.bins} --types {params.types} {params.abs_length} --info-len-field {params.info_len_field} &> {log}" 43 | 44 | 45 | rule get_raw_specific: 46 | output: os.path.join(raw_svs_output_dir, "{sample," + samples_regex + "}_{tech," + tech_regex + "}_sniffles." + sniffles_sens_suffix + ".specific.vcf") 47 | input: vcf=os.path.join(raw_svs_output_dir, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + "_markedSpec.vcf"), 48 | vcf_list=os.path.join(raw_svs_output_dir, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + ".vcf_list_markedSpec.txt") 49 | resources: 50 | mem_mb=utils.DEFAULT_CLUSTER_MEM_MB 51 | shell: 52 | "awk '($0 ~/^#/ || $0 ~/IS_SPECIFIC=1/)' {input.vcf} > {output}" 53 | 54 | rule mark_specific_in_raw: 55 | output: vcf=temp(os.path.join(raw_svs_output_dir, "{sample," + samples_regex + "}_{tech," + tech_regex + "}_sniffles." + sniffles_sens_suffix + "_markedSpec.vcf")), 56 | vcf_file_list=temp(os.path.join(raw_svs_output_dir, "{sample," + samples_regex + "}_{tech," + tech_regex + "}_sniffles."
+ sniffles_sens_suffix + ".vcf_list_markedSpec.txt")) 57 | input: vcf=os.path.join(raw_svs_output_dir, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + ".vcf"), 58 | coverage=os.path.join(alignment_output_dir, utils.STATS, "{sample}_{tech}.coverage.txt"), 59 | vcf_file_list=os.path.join(raw_svs_output_dir, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + ".vcf_list.txt") 60 | threads: lambda wc: min(cluster_config.get("sensitive_ins_to_dup_conversion", {}).get(utils.NCPUS, utils.DEFAULT_THREAD_CNT), jasmine_config.get(utils.THREADS, utils.DEFAULT_THREAD_CNT)) 61 | log: os.path.join(raw_svs_output_dir, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + "_markedSpec.vcf.log") 62 | resources: 63 | mem_mb=lambda wildcards, threads: jasmine_config.get(utils.MEM_MB_CORE, 20000) + jasmine_config.get(utils.MEM_MB_PER_THREAD, 1000) * threads 64 | params: 65 | output_dir=raw_svs_output_dir, 66 | min_support_fixed=jasmine_config.get(utils.SPECIFIC_MARKED, {}).get(utils.SPEC_READS_FIXED, 10), 67 | min_support_fraction=jasmine_config.get(utils.SPECIFIC_MARKED, {}).get(utils.SPEC_READS_FRACTION, 0.25), 68 | min_length=jasmine_config.get(utils.SPECIFIC_MARKED, {}).get(utils.SPEC_LEN, 30), 69 | java_src=":".join(x for x in [jasmine_config.get(utils.SRC_PATH, ""), iris_config.get(utils.SRC_PATH, "")] if len(x) > 0), 70 | java=java_config.get(utils.PATH, "java"), 71 | run: 72 | min_support=utils.get_min_support(input.coverage, params.min_support_fixed, params.min_support_fraction) 73 | shell("{params.java} -cp {params.java_src} Main file_list={input.vcf_file_list} --preprocess_only --mark_specific out_dir={params.output_dir} spec_reads=" + str(min_support) + " spec_len={params.min_length} out_file=test.vcf &> {log}") 74 | 75 | rule raw_vcf_files_list: 76 | input: os.path.join(raw_svs_output_dir, "{sample}_{tech}_sniffles." + sniffles_sens_suffix + ".vcf") 77 | output: temp(os.path.join(raw_svs_output_dir, "{sample," + samples_regex + "}_{tech," + tech_regex + "}_sniffles." + sniffles_sens_suffix + ".vcf_list.txt")) 78 | resources: 79 | mem_mb=utils.DEFAULT_CLUSTER_MEM_MB 80 | run: 81 | dirname = os.path.dirname(output[0]) 82 | os.makedirs(dirname,exist_ok=True) 83 | with open(output[0], "wt") as dest: 84 | print(input[0], file=dest) 85 | 86 | def get_sniffles_parameter(parameter, sample=None, tech=None, default=None): 87 | if sample is None or tech is None: 88 | return default 89 | result = default 90 | result = sniffles_config.get(parameter, result) 91 | result = sniffles_config.get(tech, {}).get(parameter, result) 92 | for sample_data in config["samples"]: 93 | if sample_data["sample"] == sample and sample_data["tech"] == tech: 94 | result = sample_data.get(utils.SNIFFLES, {}).get(parameter, result) 95 | break 96 | return result 97 | 98 | 99 | rule sensitive_svs_sniffles: 100 | input: os.path.join(alignment_output_dir, "{sample}_{tech}.sort.bam") 101 | output: os.path.join(raw_svs_output_dir, "{sample," + samples_regex + "}_{tech," + tech_regex + "}_sniffles." + sniffles_sens_suffix + ".vcf") 102 | threads: lambda wildcards: min(cluster_config.get("sensitive_svs_sniffles", {}).get(utils.NCPUS, utils.DEFAULT_THREAD_CNT), sniffles_config.get(utils.THREADS, utils.DEFAULT_THREAD_CNT)) 103 | log: os.path.join(raw_svs_output_dir, utils.LOG, "{sample}_{tech}_sniffles." 
+ sniffles_sens_suffix + ".vcf.log") 104 | resources: 105 | mem_mb = lambda wildcards, threads: sniffles_config.get(utils.MEM_MB_CORE, 25000) + sniffles_config.get(utils.MEM_MB_PER_THREAD, 1000) * threads 106 | params: 107 | sniffles = sniffles_config.get(utils.PATH, "sniffles"), 108 | min_length = lambda wc: get_sniffles_parameter(utils.MIN_LENGTH, sample=wc.sample, tech=wc.tech, default=20), 109 | min_support = lambda wc: get_sniffles_parameter(utils.MIN_SUPPORT, sample=wc.sample, tech=wc.tech, default=2), 110 | max_num_splits = lambda wc: get_sniffles_parameter(utils.MAX_NUM_SPLIT_READS, sample=wc.sample, tech=wc.tech, default=10), 111 | max_distance = lambda wc: get_sniffles_parameter(utils.MAX_DISTANCE, sample=wc.sample, tech=wc.tech, default=50), 112 | num_reads_report = lambda wc: get_sniffles_parameter(utils.NUM_READS_REPORT, sample=wc.sample, tech=wc.tech, default=-1), 113 | min_seq_size = lambda wc: get_sniffles_parameter(utils.MIN_SEQ_SIZE, sample=wc.sample, tech=wc.tech, default=1000), 114 | shell: 115 | "{params.sniffles} -m {input} -v {output} --threads {threads} --min_support {params.min_support} --max_distance {params.max_distance} --max_num_splits {params.max_num_splits} --min_length {params.min_length} --num_reads_report {params.num_reads_report} --min_seq_size {params.min_seq_size} &> {log}" 116 | 117 | localrules: raw_vcf_files_list, get_raw_specific 118 | 119 | include: "align_single.snakefile" -------------------------------------------------------------------------------- /pipeline/cut_regions_bam.snakefile: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | files = [] 4 | out_dir = config["out_dir"] 5 | suffix = config.get("name", "regions") 6 | input_by_base = {} 7 | for path in config["bams"]: 8 | basename = os.path.basename(path) 9 | base = os.path.splitext(basename)[0] 10 | input_by_base[base] = path 11 | files.append(os.path.join(out_dir, f"{base}.{suffix}.sort.bam")) 12 | files.append(os.path.join(out_dir, f"{base}.{suffix}.sort.bam.bai")) 13 | 14 | regions = [] 15 | with open(config["regions"], "rt") as source: 16 | for line in source: 17 | data = line.strip().split("\t") 18 | regions.append(f"{data[0]}:{data[1]}-{data[2]}") 19 | regions = " ".join(regions) 20 | 21 | 22 | rule all: 23 | input: files 24 | 25 | 26 | rule index_bam: 27 | output: os.path.join(out_dir, "{base}.{suffix," + suffix + "}.sort.bam.bai") 28 | input: os.path.join(out_dir, "{base}.{suffix}.sort.bam") 29 | shell: 30 | "samtools index {input}" 31 | 32 | rule sort_cut_bam: 33 | output: os.path.join(out_dir, "{base}.{suffix," + suffix+ "}.sort.bam") 34 | input: os.path.join(out_dir, "{base}.{suffix}.bam") 35 | shell: 36 | "samtools sort -@ 4 -O bam -o {output} {input}" 37 | 38 | rule create_cut_bam: 39 | output: temp(os.path.join(out_dir, "{base}.{suffix," + suffix + "}.bam")) 40 | input: bam=lambda wc: input_by_base[wc.base] 41 | params: 42 | regions=regions, 43 | shell: 44 | "samtools view -O bam -o {output} {input} {params.regions}" -------------------------------------------------------------------------------- /pipeline/data.yaml: -------------------------------------------------------------------------------- 1 | samples: 2 | - sample: sample_1 3 | # list of paths for fastq reads. Can be with fastq, fq, fastq.gz, or fq.gz extensions. At least one entry is required. 4 | reads_paths: 5 | - "path1/reads.fq" 6 | - "path2/reads.fq" 7 | - "path3/reads.fq" 8 | # ONT or PB. Required. 
9 | tech: "ONT" 10 | 11 | - sample: sample_2 12 | reads_paths: 13 | - "path1/reads.fq" 14 | existing_alignments: 15 | - "" 16 | tech: "ONT" 17 | 18 | # full path to the reference. Required. 19 | ref: "GRCh38.fa" 20 | 21 | # not required. `alignments`, `svs`, etc subdirs will be created in it. Default is "." 22 | output_dir: "" -------------------------------------------------------------------------------- /pipeline/fix_sam.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | import pysam 5 | 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("sam", type=argparse.FileType("rt"), default=sys.stdin) 10 | parser.add_argument("-o", "--output", type=argparse.FileType("rt"), default=sys.stdout) 11 | args = parser.parse_args() 12 | source = pysam.AlignmentFile(args.sam, "r") 13 | dest = pysam.AlignmentFile(args.output, "w", template=source) 14 | counter = 0 15 | while True: 16 | try: 17 | record = next(source) 18 | dest.write(record) 19 | except OSError as oe: 20 | print(oe, file=sys.stderr) 21 | counter += 1 22 | except StopIteration: 23 | break 24 | print(f"{counter} overall skipped records", file=sys.stderr) 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /pipeline/main_chr_filter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | 5 | def execute(command: str, dry: bool = True): 6 | if dry: 7 | print(command) 8 | else: 9 | os.system(command) 10 | 11 | 12 | def main(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("VCF") 15 | parser.add_argument("--chr_bed", default="/work-zfs/mschatz1/pipelines/resources/ref/human/GRCh38.main_chr.bed") 16 | parser.add_argument("--bad-chr-FLs", default="LKG") 17 | parser.add_argument("-n", "--dry-run", dest="dry", action="store_true") 18 | args = parser.parse_args() 19 | basename = os.path.basename(args.VCF) 20 | new_vcf_name = basename.split(".")[0] + ".all_chr." 
+ ".".join(basename.split(".")[1:]) 21 | execute(f'mv {args.VCF} {new_vcf_name}', dry=args.dry) 22 | execute(f'grep "#" {new_vcf_name} > {args.VCF}', dry=args.dry) 23 | execute(f'bedtools intersect -a {new_vcf_name} -b {args.chr_bed} -u | awk \'{{if ($8 !~/SVTYPE=BND/ || $5 !~ /[\\[\\]][{args.bad_chr_FLs}]/) print $0}}\' >> {args.VCF}', 24 | dry=args.dry) 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /pipeline/marcc_config.yaml: -------------------------------------------------------------------------------- 1 | "__default__": 2 | account: "FILL" 3 | time: "24:0:0" 4 | nCPUs: 2 5 | nodes: 1 6 | partition: "shared,parallel,lrgmem" 7 | mem_mb: "2000" 8 | name: "JOBNAME.{rule}.{wildcards}" 9 | out: "logs/cluster/{rule}.{wildcards}.out" 10 | err: "logs/cluster/{rule}.{wildcards}.err" 11 | 12 | single_alignment: 13 | time: "72:0:0" 14 | partition: "parallel,lrgmem,shared" 15 | nCPUs: 24 16 | mem_mb: "{resources.mem_mb}" 17 | name: "{rule}.{wildcards}" 18 | 19 | single_sam_to_sort_bam: 20 | time: "24:0:0" 21 | partition: "parallel,shared,lrgmem" 22 | nCPUs: 24 23 | mem_mb: "{resources.mem_mb}" 24 | name: "samtools.{rule}.{wildcards}" 25 | 26 | merge_sorted: 27 | time: "24:0:0" 28 | partition: "shared,parallel,lrgmem" 29 | nCPUs: 1 30 | mem_mb: "{resources.mem_mb}" 31 | name: "samtools.{rule}.{wildcards}" 32 | 33 | merged_average_coverage_samtools: 34 | time: "24:0:0" 35 | partition: "shared,parallel,lrgmem" 36 | nCPUs: 1 37 | name: "samtools.{rule}.{wildcards}" 38 | 39 | merged_coverage_mosdepth: 40 | time: "24:0:0" 41 | partition: "shared,parallel,lrgmem" 42 | nCPUs: 24 43 | name: "samtools.{rule}.{wildcards}" 44 | 45 | sensitive_svs_sniffles: 46 | time: "24:0:0" 47 | partition: "shared,parallel,lrgmem" 48 | nCPUs: 5 49 | mem_mb: "{resources.mem_mb}" 50 | name: "sv_inference.{rule}.{wildcards}" 51 | 52 | sensitive_ins_to_dup_conversion: 53 | time: "24:0:0" 54 | partition: "shared,parallel,lrgmem" 55 | nCPUs: 2 56 | mem_mb: "{resources.mem_mb}" 57 | name: "jasmine_pre.{rule}.{wildcards}" 58 | 59 | refined_sensitive_new_sv_types: 60 | time: "72:0:0" 61 | partition: "parallel,lrgmem,shared" 62 | nCPUs: 24 63 | mem_md: "{resources.mem_mb}" 64 | name: "jasmine_pre.{rule}.{wildcards}" 65 | 66 | intra_sample_merging: 67 | time: "24:0:0" 68 | partition: "parallel,lrgmem,shared" 69 | nCPUs: 24 70 | mem_md: "{resources.mem_mb}" 71 | name: "jasmine_pre.{rule}.{wildcards}" 72 | -------------------------------------------------------------------------------- /pipeline/pipeline.snakefile: -------------------------------------------------------------------------------- 1 | import os 2 | import utils 3 | 4 | if os.path.exists("data.yaml"): 5 | configfile: "data.yaml" 6 | if os.path.exists("tools.yaml"): 7 | configfile: "tools.yaml" 8 | 9 | output_dir = config.get(utils.OUTPUT_DIR, "") 10 | alignment_output_dir = os.path.join(output_dir, utils.ALIGNMENTS) 11 | svs_output_dir = os.path.join(output_dir, utils.SVS) 12 | raw_svs_output_dir = os.path.join(svs_output_dir, utils.RAW) 13 | refined_svs_output_dir = os.path.join(svs_output_dir, utils.REFINED) 14 | ins_to_dup_output_dir = os.path.join(refined_svs_output_dir, utils.INS_TO_DUP) 15 | iris_refined_output_dir = os.path.join(refined_svs_output_dir, utils.IRIS_REFINED) 16 | specific_marked_output_dir = os.path.join(refined_svs_output_dir, utils.SPECIFIC_MARKED) 17 | 18 | utils.ensure_samples_correctness(config) 19 | sample_to_reads_paths = 
utils.get_samples_to_reads_paths(config) 20 | utils.ensure_ref_correctness(config) 21 | utils.ensure_enabled_sv_tools(config) 22 | 23 | 24 | # during development this thing guarantees that only the latest supported part of pipeline produces results 25 | overall_expected_files = [] 26 | # print(sample_to_reads_paths) 27 | for (sample, tech) in sample_to_reads_paths.keys(): 28 | if config.get(utils.ENABLE_ALIGNMENT_STATS, True): 29 | overall_expected_files.append(os.path.join(alignment_output_dir, utils.STATS, f"{sample}_{tech}.coverage.txt")) 30 | overall_expected_files.append(os.path.join(alignment_output_dir, utils.STATS, f"{sample}_{tech}.samtools.stats.txt")) 31 | overall_expected_files.append(os.path.join(alignment_output_dir, utils.STATS, f"{sample}_{tech}.alignment.yield.txt")) 32 | overall_expected_files.append(os.path.join(alignment_output_dir, utils.STATS, f"{sample}_{tech}.mosdepth.global.dist.txt")) 33 | if config.get(utils.ENABLE_SV_INFERENCE, True): 34 | for sv_tool in config[utils.SV_TOOLS_ENABLED]: 35 | if sv_tool == "sniffles": 36 | suffix = utils.get_sniffles_sens_suffix(config) + "." 37 | else: 38 | suffix = "" 39 | overall_expected_files.append(os.path.join(raw_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}vcf")) 40 | overall_expected_files.append(os.path.join(raw_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}specific.vcf")) 41 | overall_expected_files.append(os.path.join(raw_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}vcf.stats.sizes.txt")) 42 | overall_expected_files.append(os.path.join(raw_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}specific.vcf.stats.sizes.txt")) 43 | if config.get(utils.ENABLE_SV_REFINEMENT, True): 44 | overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.specific.vcf")) 45 | overall_expected_files.append(os.path.join(refined_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.specific.vcf.stats.sizes.txt")) 46 | overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.vcf")) 47 | overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.norm.vcf")) 48 | overall_expected_files.append(os.path.join(refined_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.vcf.stats.sizes.txt")) 49 | overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.specific.vcf")) 50 | overall_expected_files.append(os.path.join(refined_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}refined.specific.vcf.stats.sizes.txt")) 51 | overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.vcf")) 52 | overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.norm.vcf")) 53 | overall_expected_files.append(os.path.join(refined_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}refined.vcf.stats.sizes.txt")) 54 | if config.get(utils.ENABLE_IS_MERGING, True): 55 | overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.ism.vcf")) 56 | overall_expected_files.append(os.path.join(refined_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.ism.vcf.stats.sizes.txt")) 57 | 
overall_expected_files.append(os.path.join(refined_svs_output_dir, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.ism.specific.vcf")) 58 | overall_expected_files.append(os.path.join(refined_svs_output_dir, utils.STATS, f"{sample}_{tech}_{sv_tool}.{suffix}refined.nSVtypes.ism.specific.vcf.stats.sizes.txt")) 59 | 60 | rule main: 61 | input: overall_expected_files 62 | 63 | include: "call_svs_sniffles_single.snakefile" 64 | include: "align_single.snakefile" 65 | include: "jasmine_pre.snakefile" -------------------------------------------------------------------------------- /pipeline/rockfish_config.yaml: -------------------------------------------------------------------------------- 1 | "__default__": 2 | account: "FILL" 3 | time: "24:0:0" 4 | nCPUs: 2 5 | nodes: 1 6 | partition: "defq,lrgmem" 7 | mem_mb: "2000" 8 | name: "JOBNAME.{rule}.{wildcards}" 9 | out: "logs/cluster/{rule}.{wildcards}.out" 10 | err: "logs/cluster/{rule}.{wildcards}.err" 11 | 12 | single_alignment: 13 | time: "48:0:0" 14 | partition: "defq,lrgmem" 15 | nCPUs: 24 16 | mem_mb: "{resources.mem_mb}" 17 | name: "alignment.{rule}.{wildcards}" 18 | 19 | single_sam_to_sort_bam: 20 | time: "24:0:0" 21 | partition: "defq,lrgmem" 22 | nCPUs: 24 23 | mem_mb: "{resources.mem_mb}" 24 | name: "samtools_sort.{rule}.{wildcards}" 25 | 26 | merge_sorted: 27 | time: "24:0:0" 28 | partition: "defq,lrgmem" 29 | nCPUs: 1 30 | mem_mb: "{resources.mem_mb}" 31 | name: "samtools_merge.{rule}.{wildcards}" 32 | 33 | merged_average_coverage_samtools: 34 | time: "24:0:0" 35 | partition: "defq,lrgmem" 36 | nCPUs: 1 37 | name: "samtools_cov.{rule}.{wildcards}" 38 | 39 | merged_coverage_mosdepth: 40 | time: "24:0:0" 41 | partition: "defq,lrgmem" 42 | nCPUs: 24 43 | name: "mosdepth.{rule}.{wildcards}" 44 | 45 | sensitive_svs_sniffles: 46 | time: "24:0:0" 47 | partition: "defq,lrgmem" 48 | nCPUs: 5 49 | mem_mb: "{resources.mem_mb}" 50 | name: "sniffles.{rule}.{wildcards}" 51 | 52 | sensitive_ins_to_dup_conversion: 53 | time: "24:0:0" 54 | partition: "defq,lrgmem" 55 | nCPUs: 2 56 | mem_mb: "{resources.mem_mb}" 57 | name: "jasmine_pre_ins_dup.{rule}.{wildcards}" 58 | 59 | refined_sensitive_new_sv_types: 60 | time: "72:0:0" 61 | partition: "defq,lrgmem" 62 | nCPUs: 24 63 | mem_mb: "{resources.mem_mb}" 64 | name: "jasmine_iris_refine.{rule}.{wildcards}" 65 | 66 | intra_sample_merging: 67 | time: "24:0:0" 68 | partition: "defq,lrgmem" 69 | nCPUs: 24 70 | mem_mb: "{resources.mem_mb}" 71 | name: "jasmine_intra.{rule}.{wildcards}" 72 | -------------------------------------------------------------------------------- /pipeline/sv_sizes.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import bisect 3 | import logging 4 | import sys 5 | import cyvcf2 6 | 7 | import utils 8 | 9 | 10 | def bin_string_repr(bin_value: int) -> str: 11 | suffixes = ["", "K", "M", "G"] 12 | suffix_index = 0 13 | while abs(bin_value) >= 1000: 14 | suffix_index += 1 15 | bin_value /= 1000 16 | if bin_value == int(bin_value): 17 | bin_value = int(bin_value) 18 | return f"{bin_value:,}{suffixes[suffix_index]}" 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("VCF", type=str) 24 | parser.add_argument("--bins", default="1,30,50,100,150,200,350,300,500,750,1000,2000,5000,10000,50000,100000,500000") 25 | parser.add_argument("-o", "--output", default=sys.stdout, type=argparse.FileType("wt")) 26 | parser.add_argument("--no-out-header", dest="out_header", action="store_false") 27 |
parser.add_argument("--no-out-total-t", dest="out_total_type", action="store_false") 28 | parser.add_argument("--no-out-total-s", dest="out_total_size", action="store_false") 29 | parser.add_argument("--no-out-indiv-bins", dest="out_indiv_bins", action="store_false") 30 | parser.add_argument("--types", type=str, default="INS,DEL,DUP,INV,TRA") 31 | parser.add_argument("--no-abs-length", dest="abs_length", action="store_false") 32 | parser.add_argument("--info-len-field", type=str, default="SVLEN") 33 | parser.add_argument("--info-type-field", type=str, default="SVTYPE") 34 | args = parser.parse_args() 35 | logger = logging.getLogger("SV-stats") 36 | logger.setLevel(logging.DEBUG) 37 | ch = logging.StreamHandler() 38 | ch.setLevel(logging.DEBUG) 39 | logger.addHandler(ch) 40 | supplied_bins = sorted(set(map(int, args.bins.split(",")))) 41 | bins = [-3000000000] + supplied_bins + [3000000000] 42 | logger.debug(f"bins: [{','.join(map(str, bins))}]") 43 | types = [utils.SVType.from_str(string=s) for s in args.types.split(",")] 44 | if not set(types).issubset({x for x in utils.SVType}): 45 | logger.critical(f"Supplied type list {','.join(map(str, types))} is not a subset of a standardized 5 types.") 46 | exit(1) 47 | bin_counts = {bin_value: {sv_type: 0 for sv_type in types} for bin_value in bins} 48 | reader = cyvcf2.VCF(args.VCF) 49 | for cnt, record in enumerate(reader): 50 | sv_type = utils.get_sv_type(vcf_record=record, info_type_field=args.info_type_field, info_len_field=args.info_len_field, logger=logger) 51 | sv_length = utils.get_sv_length(record, sv_type=sv_type, abs_value=args.abs_length, info_len_field=args.info_len_field, info_type_field=args.info_type_field) 52 | bin_index = bisect.bisect_right(bins, sv_length) 53 | if bin_index < 1: 54 | logger.error(f"Something is wrong with length bin determination for record {str(record)} with type {str(sv_type)}") 55 | if sv_type not in bin_counts[bins[bin_index - 1]]: 56 | continue 57 | bin_counts[bins[bin_index - 1]][sv_type] += 1 58 | type_totals = {sv_type: sum(bin_counts[bin_v][sv_type] for bin_v in bins) for sv_type in types} 59 | header = ["bin"] + types 60 | if args.out_total_size: 61 | header += ["total"] 62 | if args.out_header: 63 | print(",".join(map(str, header)), file=args.output) 64 | if args.out_indiv_bins: 65 | for lv, rv in zip(bins[:-1], bins[1:]): 66 | bin_str_value = f"[{bin_string_repr(lv)} - {bin_string_repr(rv)})" 67 | sv_type_values = [bin_counts[lv][sv_type] for sv_type in types] 68 | bin_total = sum(sv_type_values) 69 | result = f"{bin_str_value}," + ",".join(map(str, sv_type_values)) 70 | if args.out_total_size: 71 | result += f",{bin_total}" 72 | print(result, file=args.output) 73 | if args.out_total_type: 74 | print("total," + ",".join(map(str, (type_totals[sv_type] for sv_type in types))), file=args.output) 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /pipeline/sv_supports.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import bisect 3 | import logging 4 | import sys 5 | 6 | import cyvcf2 7 | 8 | import utils 9 | 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("VCF", type=str) 14 | parser.add_argument("--supports", default="0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20") 15 | parser.add_argument("-o", "--output", default=sys.stdout, type=argparse.FileType("wt")) 16 | parser.add_argument("--no-out-header", 
dest="out_header", action="store_false") 17 | parser.add_argument("--no-out-total-b", dest="out_total_bins", action="store_false") 18 | parser.add_argument("--no-out-type", dest="out_indiv_types", action="store_false") 19 | parser.add_argument("--types", type=str, default="INS,DEL,DUP,INV,TRA") 20 | parser.add_argument("--info-support-field", type=str, default="RE") 21 | parser.add_argument("--info-reads-field", type=str, default="RNAMES") 22 | parser.add_argument("--info-type-field", type=str, default="SVTYPE") 23 | parser.add_argument("--info-len-field", type=str, default="SVLEN") 24 | args = parser.parse_args() 25 | logger = logging.getLogger("SV-stats") 26 | logger.setLevel(logging.DEBUG) 27 | ch = logging.StreamHandler() 28 | ch.setLevel(logging.DEBUG) 29 | logger.addHandler(ch) 30 | bins = sorted(set(map(int, args.supports.split(",")))) 31 | logger.debug(f"bins: [{','.join(map(str, bins))}]") 32 | types = [utils.SVType.from_str(string=s) for s in args.types.split(",")] 33 | if not set(types).issubset({x for x in utils.SVType}): 34 | logger.critical(f"Supplied type list {','.join(map(str, types))} is not a subset of a standardized 5 types.") 35 | exit(1) 36 | if not all(map(lambda x: x >= 0, bins)): 37 | logger.warning(f"Some bins were of negative values. Only non-negative values are permitted. Removing all negative values.") 38 | bins = [x for x in bins if x >= 0] 39 | if 0 not in bins: 40 | logger.warning(f"0 value is not in bins. Adding 0.") 41 | bins = [0] + bins 42 | bin_counts = {bin_value: {sv_type: 0 for sv_type in types} for bin_value in bins} 43 | reader = cyvcf2.VCF(args.VCF) 44 | for cnt, record in enumerate(reader): 45 | sv_type = utils.get_sv_type(vcf_record=record, info_type_field=args.info_type_field, info_len_field=args.info_len_field, logger=logger) 46 | sv_support = utils.get_sv_support_cnt(vcf_record=record, info_re_field=args.info_support_field, info_reads_field=args.info_reads_field) 47 | bin_index = bisect.bisect_right(bins, sv_support) 48 | assert bin_index > 0 49 | if sv_type not in bin_counts[bins[bin_index - 1]]: 50 | continue 51 | bin_counts[bins[bin_index - 1]][sv_type] += 1 52 | header = ["bin"] 53 | if args.out_indiv_types: 54 | header += types 55 | if args.out_total_bins: 56 | header += ["total"] 57 | if args.out_header: 58 | print(",".join(map(str, header)), file=args.output) 59 | type_totals = {sv_type: sum(bin_counts[bin_v][sv_type] for bin_v in bins) for sv_type in types} 60 | for bin_value in bins: 61 | sv_type_values = [] 62 | for sv_type in types: 63 | sv_type_values.append(type_totals[sv_type]) 64 | type_totals[sv_type] -= bin_counts[bin_value][sv_type] 65 | bin_total = sum(sv_type_values) 66 | result = f"{bin_value}" 67 | if args.out_indiv_types: 68 | result += "," + ",".join(map(str, sv_type_values)) 69 | if args.out_total_bins: 70 | result += f",{bin_total}" 71 | print(result, file=args.output) 72 | assert all(map(lambda x: x == 0, type_totals.values())) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /pipeline/tools.yaml: -------------------------------------------------------------------------------- 1 | sv_tools_enabled: 2 | - "sniffles" 3 | 4 | aligner: "ngmlr" 5 | 6 | tools: 7 | tmp_dir: "" 8 | samtools: 9 | path: "samtools" 10 | threads: 100 11 | mem_mb_per_thread: 1000 12 | mem_mb_core: 2000 13 | ngmlr: 14 | reads_cnt_per_run: 200000 15 | path: "ngmlr" 16 | threads: 100 17 | time: "72:0:0" 18 | mem_mb_per_thread: 1000 19 | mem_mb_core: 6000 20 | 
sniffles: 21 | path: "sniffles" 22 | threads: 5 23 | min_support: 2 24 | min_length: 20 25 | max_num_splits: 10 26 | max_distance: 50 27 | num_reads_report: -1 28 | min_seq_size: 1000 29 | time: "24:0:0" 30 | mem_mb_per_thread: 1000 31 | mem_mb_core: 25000 32 | java: 33 | path: "java" 34 | jasmine: 35 | src_path: "" 36 | threads: 2 37 | mem_mb_core: 20000 38 | mem_mb_per_thread: 4000 39 | ins_to_dup: 40 | max_dup_length: 10000 41 | script_name: "InsertionsToDuplications" 42 | specific_marked: 43 | spec_reads_fixed: 10 44 | spec_reads_fraction: 0.25 45 | spec_len: 30 46 | is_merging: 47 | normalize_types: True 48 | use_types: True 49 | use_strands: True 50 | use_edit_distance: False 51 | use_end: False 52 | max_distance: 100 53 | min_distance: -1 54 | threads: 100 55 | strategy: "default" 56 | kd_tree_norm: 2 57 | min_seq_id: 0 58 | max_distance_linear: 0 59 | k_jaccard: 9 60 | iris: 61 | src_path: "" 62 | threads: 100 63 | mem_mb_core: 20000 64 | mem_mb_per_thread: 1000 65 | min_ins_length: 30 66 | max_out_length: 100000 67 | max_ins_dist: 100 68 | max_length_change: 0.25 69 | minimap2: 70 | reads_cnt_per_run: 800000 71 | mem_mb_per_thread: 1000 72 | mem_mb_core: 6000 73 | path: "minimap2" 74 | threads: 100 75 | time: "72:0:0" 76 | racon: 77 | path: "racon" 78 | sv_sizes: 79 | path: "sv_sizes.py" 80 | bins: "1,30,50,100,150,200,350,300,500,750,1000,2000,5000,10000,50000,100000,500000" 81 | types: "INS,DEL,DUP,INV,TRA" 82 | abs_length: True 83 | info_length_field: "SVLEN" 84 | sam_fix: 85 | path: "fix_sam.py" 86 | seqtk: 87 | path: "seqtk" 88 | mosdepth: 89 | path: "mosdepth" 90 | mem_mb_per_thread: 1000 91 | mem_mb_core: 2000 92 | threads: 100 93 | per_base: False 94 | fast_mode: True 95 | window_size: 500 96 | meryl: 97 | path: "meryl" 98 | distinct: 0.9998 99 | k: 15 100 | winnowmap: 101 | reads_cnt_per_run: 800000 102 | mem_mb_per_thread: 1000 103 | mem_mb_core: 6000 104 | path: "winnowmap" 105 | threads: 100 106 | time: "72:0:0" 107 | -------------------------------------------------------------------------------- /pipeline/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from collections import defaultdict 4 | from enum import Enum 5 | import logging 6 | from typing import Optional 7 | 8 | DEFAULT_THREAD_CNT = 99 9 | DEFAULT_CLUSTER_MEM_MB = 4000 10 | 11 | OUTPUT_DIR = "output_dir" 12 | ALIGNMENTS = "alignments" 13 | READS_PATHS = "reads_paths" 14 | SAMPLES = "samples" 15 | LOG = "log" 16 | TOOLS = "tools" 17 | THREADS = "threads" 18 | SAMTOOLS = "samtools" 19 | ALIGNER = "aligner" 20 | MOSDEPTH = "mosdepth" 21 | WINDOW_SIZE = "window_size" 22 | PER_BASE = "per_base" 23 | FAST_MODE = "fast_mode" 24 | PYTHON = "python" 25 | 26 | WINNOWMAP = "winnowmap" 27 | MERYL = "meryl" 28 | DISTINCT = "distinct" 29 | K = "k" 30 | 31 | AWK = "awk" 32 | SEQTK = "seqtk" 33 | NGMLR = "ngmlr" 34 | PATH = "path" 35 | TMP_DIR = "tmp_dir" 36 | TECH = "tech" 37 | REFERENCE = "ref" 38 | READS_CNT_PER_RUN = "reads_cnt_per_run" 39 | 40 | 41 | RAW = "raw" 42 | SVS = "svs" 43 | REFINED = "refined" 44 | INS_TO_DUP = "ins_to_dup" 45 | IRIS_REFINED = "iris_refined" 46 | NORM_SV = "norm_sv" 47 | SPECIFIC_MARKED = "specific_marked" 48 | SPEC_READS_FIXED = "spec_reads_fixed" 49 | SPEC_READS_FRACTION = "spec_reads_fraction" 50 | SPEC_LEN = "spec_len" 51 | MAX_DUP_LENGTH = "max_dup_length" 52 | 53 | SNIFFLES = "sniffles" 54 | MIN_SUPPORT = "min_support" 55 | MIN_LENGTH = "min_length" 56 | MAX_NUM_SPLIT_READS = "max_num_splits" 57 | MAX_DISTANCE
= "max_distance" 58 | NUM_READS_REPORT = "num_reads_report" 59 | MIN_SEQ_SIZE = "min_seq_size" 60 | 61 | SV_TOOLS_ENABLED = "sv_tools_enabled" 62 | 63 | JAVA = "java" 64 | JASMINE = "jasmine" 65 | IRIS = "iris" 66 | SRC_PATH = "src_path" 67 | MINIMAP2 = "minimap2" 68 | RACON = "racon" 69 | SCRIPT_NAME = "script_name" 70 | MIN_INS_LENGTH = "min_ins_length" 71 | MAX_OUT_LENGTH = "max_out_length" 72 | MAX_INS_DIST = "max_ins_dist" 73 | MAX_LENGTH_CHANGE = "max_length_change" 74 | IS_MERGING = "is_merging" 75 | NORMALIZE_TYPES = "normalize_types" 76 | USE_STRANDS = "use_strands" 77 | USE_TYPES = "use_types" 78 | USE_EDIT_DISTANCE = "use_edit_distance" 79 | USE_END = "use_end" 80 | STRATEGY = "strategy" 81 | KD_TREE_NORM = "kd_tree_norm" 82 | MAX_DISTANCE_LINEAR = "max_distance_linear" 83 | MIN_DISTANCE = "min_distance" 84 | MIN_SEQ_ID = "min_seq_id" 85 | K_JACCARD = "k_jaccard" 86 | 87 | SV_SIZES = "sv_sizes" 88 | BINS = "bins" 89 | TYPES = "types" 90 | ABS_LENGTH = "abs_length" 91 | INFO_LENGTH_FIELD = "info_length_field" 92 | 93 | MEM_MB_PER_THREAD = "mem_mb_per_thread" 94 | MEM_MB_CORE = "mem_mb_core" 95 | 96 | NCPUS = "nCPUs" 97 | 98 | STATS = "stats" 99 | 100 | 101 | ENABLE_SV_INFERENCE = "enable_sv_inference" 102 | ENABLE_SV_REFINEMENT = "enable_sv_refinement" 103 | ENABLE_IS_MERGING = "enable_is_merging" 104 | ENABLE_ALIGNMENT_STATS = "enable_alignment_stats" 105 | 106 | 107 | EXISTING_ALIGNMENTS = "existing_alignments" 108 | BGZIP = "bgzip" 109 | 110 | ########## 111 | # 112 | # snakemake data preparation utils 113 | # 114 | ########## 115 | 116 | 117 | def ensure_samples_correctness(config): 118 | if SAMPLES not in config or not isinstance(config[SAMPLES], list) or len(config[SAMPLES]) < 1: 119 | raise ValueError("Configuration data file is missing information about samples or the setup is not dictionary-like") 120 | 121 | 122 | def get_samples_to_reads_paths(config): 123 | samples_to_reads_paths = defaultdict(list) 124 | for sample_data in config["samples"]: 125 | sample_name = sample_data["sample"] 126 | if TECH not in sample_data or sample_data[TECH].lower() not in ["ont", "pb", "pacbio", "pbccs", "pacbioccs"]: 127 | raise ValueError( 128 | f"incorrect or missing tech {sample_data[TECH]} specified for sample {sample_name} in data.yaml. Only ONT or PB are supported, and tech specification is required") 129 | tech = sample_data[TECH].upper() 130 | has_alignment = os.path.exists(os.path.join(config.get(OUTPUT_DIR, ""), ALIGNMENTS, f"{sample_name}_{tech}.sort.bam")) 131 | if not has_alignment: 132 | if READS_PATHS not in sample_data or not isinstance(sample_data[READS_PATHS], list) or len(sample_data[READS_PATHS]) < 1: 133 | raise ValueError( 134 | f"Error when parsing reads paths for sample {sample_name} sample. Make sure the entries are formatted as a list of strings under the {READS_PATHS} key") 135 | if (sample_name, tech) in samples_to_reads_paths: 136 | warning_message = f"sample {sample_name} with read tech {tech} is specified in input data multiple times." 137 | if not config.get("allow_dup_st_entries", False): 138 | raise ValueError(f"Error! {warning_message}") 139 | else: 140 | print(f"WARNING! {warning_message} Proceeding because `allow_dup_st_entries` is set to True", file=sys.stderr) 141 | for read_path in sample_data[READS_PATHS]: 142 | if not read_path.endswith(("fastq", "fq", "fastq.gz", "fq.gz", "fasta", "fasta.gz", "fa", "fa.gz")): 143 | raise ValueError(f"Unsupported input format for read path {read_path}. 
Only 'fastq', 'fq', 'fastq.gz', 'fq.gz', 'fasta', 'fasta.gz', 'fa', and 'fa.gz' are supported") 144 | samples_to_reads_paths[(sample_name, tech)].append(read_path) 145 | if len(samples_to_reads_paths[(sample_name, tech)]) != len(set(samples_to_reads_paths[(sample_name, tech)])): 146 | warning_message = f"sample {sample_name} with read tech {tech} has some read file paths specified multiple times." 147 | if not config.get("allow_dup_reads_entries", False): 148 | raise ValueError(f"Error! {warning_message}") 149 | else: 150 | print(f"WARNING! {warning_message} Proceeding because `allow_dup_reads_entries` is set to True", file=sys.stderr) 151 | else: 152 | samples_to_reads_paths[(sample_name, tech)].append("") 153 | return samples_to_reads_paths 154 | 155 | 156 | def ensure_aligner(config): 157 | if config['aligner'] not in {"ngmlr", "minimap2", "winnowmap"}: 158 | raise ValueError(f'unsupported aligner option {config["aligner"]}, only ngmlr, minimap2, and winnowmap are supported') 159 | 160 | 161 | def get_extra_alignments_paths(config): 162 | samples_to_reads_paths = defaultdict(list) 163 | for sample_data in config["samples"]: 164 | sample_name = sample_data["sample"] 165 | if TECH not in sample_data or sample_data[TECH].lower() not in ["ont", "pb", "pacbio", "pbccs", "pacbioccs"]: 166 | raise ValueError( 167 | f"incorrect or missing tech {sample_data.get(TECH)} specified for sample {sample_name} in data.yaml. Only ONT or PB are supported, and tech specification is required") 168 | tech = sample_data[TECH].upper() 169 | if EXISTING_ALIGNMENTS not in sample_data or not isinstance(sample_data[EXISTING_ALIGNMENTS], list) or len(sample_data[EXISTING_ALIGNMENTS]) < 1: 170 | samples_to_reads_paths[(sample_name, tech)] = [] 171 | continue 172 | for alignment_path in sample_data[EXISTING_ALIGNMENTS]: 173 | if not alignment_path.endswith(("bam")): 174 | raise ValueError( 175 | f"Unsupported extra alignment format for alignment {alignment_path}. Only 'bam' is supported") 176 | samples_to_reads_paths[(sample_name, tech)].append(alignment_path) 177 | return samples_to_reads_paths 178 | 179 | 180 | def get_samples_regex(samples_to_reads_paths): 181 | return f"({'|'.join(x[0] for x in samples_to_reads_paths.keys())})" 182 | 183 | 184 | def get_reads_paths_regex(samples_to_reads_paths): 185 | bases = set() 186 | for (sample_name, tech), reads_paths in samples_to_reads_paths.items(): 187 | for read_path in reads_paths: 188 | bases.add(os.path.basename(read_path)) 189 | return f"({'|'.join(bases)})" 190 | 191 | 192 | def get_tech_regex(config): 193 | techs = set() 194 | for sample_data in config[SAMPLES]: 195 | techs.add(sample_data[TECH]) 196 | return f"({'|'.join(techs)})" 197 | 198 | 199 | def ensure_ref_correctness(config): 200 | if REFERENCE not in config: 201 | raise ValueError(f"No reference fasta file specified under 'ref' key in data.yaml. Reference is required.") 202 | 203 | 204 | def get_sniffles_sens_suffix(config): 205 | min_support = config.get(TOOLS, {}).get(SNIFFLES, {}).get(MIN_SUPPORT, 2) 206 | min_length = config.get(TOOLS, {}).get(SNIFFLES, {}).get(MIN_LENGTH, 20) 207 | return f"s{min_support}l{min_length}" 208 | 209 | 210 | SUPPORTED_SV_TOOLS = {"sniffles"} 211 | 212 | 213 | def ensure_enabled_sv_tools(config): 214 | for tool in config[SV_TOOLS_ENABLED]: 215 | if tool.lower() not in SUPPORTED_SV_TOOLS: 216 | raise ValueError(f"Attempt to enable unsupported SV inference tool {tool}.
Only {','.join(SUPPORTED_SV_TOOLS)} are supported") 217 | 218 | 219 | def get_min_support(coverage_file, min_support_fixed_cnt, min_support_fraction): 220 | coverage = 100 221 | with open(coverage_file, "rt") as source: 222 | for line in source: 223 | coverage = int(float(line.strip().split("=")[1].strip())) 224 | print(f"extracted coverage of {coverage} from file {coverage_file}") 225 | break 226 | result = min(int(min_support_fixed_cnt), int(coverage * min_support_fraction)) 227 | print(f"target min support cnt {result} with min support fixed cnt = {min_support_fixed_cnt} and min_support_fraction = {min_support_fraction}") 228 | return result 229 | 230 | ########## 231 | # 232 | # SV type and length utils 233 | # 234 | ########## 235 | 236 | 237 | class SVType(Enum): 238 | INS = "INS" 239 | DEL = "DEL" 240 | DUP = "DUP" 241 | INV = "INV" 242 | TRA = "TRA" 243 | 244 | def __str__(self) -> str: 245 | return str(self.value) 246 | 247 | def __repr__(self): 248 | return str(self) 249 | 250 | @classmethod 251 | def from_str(cls, string: str) -> "SVType": 252 | for entry in cls: 253 | if string.lower() == entry.value.lower(): 254 | return entry 255 | raise ValueError(f"Could not determine SVType from its supplied str version {string}") 256 | 257 | 258 | def get_chr_from_alt_bnd_record(bnd_string, default: str = "XXX") -> str: 259 | splitter = "[" if "[" in bnd_string else "]" 260 | chr_entry = [x for x in bnd_string.split(splitter) if ":" in x] 261 | if len(chr_entry) < 1: 262 | return default 263 | return chr_entry[0].split(":")[0] 264 | 265 | 266 | def get_sv_type(vcf_record, info_type_field: str = "SVTYPE", info_len_field: str = "SVLEN", logger: Optional[logging.Logger] = None) -> SVType: 267 | logger = logger if logger else logging.getLogger("Dummy") 268 | strands = vcf_record.INFO.get("STRANDS", "??") 269 | chr1 = str(vcf_record.CHROM) 270 | chr2 = str(vcf_record.INFO.get("CHR2", get_chr_from_alt_bnd_record(bnd_string=vcf_record.ALT[0], default=chr1))) 271 | if chr1 != chr2: 272 | return SVType.TRA 273 | if strands in ["--", "++"]: 274 | return SVType.INV 275 | if strands == "-+": 276 | return SVType.DUP 277 | info_svtype = vcf_record.INFO.get(info_type_field, None) 278 | if info_svtype is not None: 279 | if "INS" in info_svtype: 280 | return SVType.INS 281 | if "DEL" in info_svtype: 282 | return SVType.DEL 283 | coord_length = get_sv_length_from_coordinates(vcf_record) 284 | if coord_length in [0, 1]: 285 | return SVType.INS 286 | info_length = vcf_record.INFO.get(info_len_field, None) 287 | if info_length is not None and int(float(info_length)) < 0: 288 | return SVType.DEL 289 | logger.warning(f"Can't determine the SV type for VCF record {str(vcf_record)}. 
Defaulting to DEL") 290 | return SVType.DEL 291 | 292 | 293 | def get_sv_length_from_coordinates(vcf_record) -> Optional[int]: 294 | try: 295 | return abs(int(vcf_record.POS) - vcf_record.INFO["END"]) 296 | except KeyError: 297 | print(f"No END field in VCF record {str(vcf_record)}") 298 | return None 299 | 300 | def get_sv_length_from_ref_alt(vcf_record) -> int: 301 | return abs(len(vcf_record.ALT[0]) - len(vcf_record.REF)) 302 | 303 | 304 | def get_sv_length(vcf_record, abs_value: bool = True, sv_type: Optional[SVType] = None, info_len_field: str = "SVLEN", info_type_field: str = "SVTYPE") -> int: 305 | """ 306 | 0 value is reserved for TRA SVs 307 | """ 308 | sv_type = sv_type if sv_type else get_sv_type(vcf_record=vcf_record, info_type_field=info_type_field) 309 | result = 0 310 | if sv_type == SVType.TRA: 311 | result = 0 312 | elif sv_type in [SVType.DUP, SVType.INV]: 313 | result = int(float(vcf_record.INFO.get(info_len_field, get_sv_length_from_coordinates(vcf_record)))) 314 | elif sv_type == SVType.INS: 315 | result = int(float(vcf_record.INFO.get(info_len_field, get_sv_length_from_ref_alt(vcf_record)))) 316 | elif sv_type == SVType.DEL: 317 | result = int(float(vcf_record.INFO.get(info_len_field, get_sv_length_from_coordinates(vcf_record)))) 318 | if result > 0: 319 | result *= -1 320 | if abs_value: 321 | result = abs(result) 322 | return result 323 | 324 | 325 | def get_sv_support_cnt(vcf_record, info_re_field: str = "RE", info_reads_field: str = "RNAMES") -> int: 326 | re_value = int(vcf_record.INFO.get(info_re_field, 0)) 327 | if re_value != 0: 328 | return re_value 329 | reads = vcf_record.INFO.get(info_reads_field, "").split(",") 330 | if len(reads) > 1 or len(reads[0]) > 0: 331 | return len(reads) 332 | return 0 333 | -------------------------------------------------------------------------------- /plot_merges.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A simple visualization for comparing Jasmine results to those of SURVIVOR on human chr1 3 | It assumes that the exact lines and points to plot, along with their colors, have already 4 | been determined with the companion program src/VisualizationPrep.java. 
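Example invocation (the input file name here is hypothetical):
    python plot_merges.py chr1_vis.txt
where, matching the parsing loop below, each line of the file is "x y [color]" for a point or "x1 y1 x2 y2 [color]" for a merged pair.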
5 | 6 | This program takes in a single command line argument - the name of the file with points/lines to plot 7 | ''' 8 | 9 | # Lots of matplotlib imports - we need Qt5Agg for scrollbar 10 | import matplotlib 11 | matplotlib.use('Qt5Agg') 12 | 13 | # Matplotlib's libraries 14 | import matplotlib.patches as mpatches 15 | import matplotlib.pyplot as plt 16 | from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas 17 | from matplotlib.backends.backend_qt5agg import NavigationToolbar2QT as NavigationToolbar 18 | from matplotlib import collections as mc 19 | from matplotlib.lines import Line2D 20 | 21 | # Other imports 22 | import numpy as np 23 | import pylab as pl 24 | import sys 25 | from PyQt5 import QtWidgets, QtCore 26 | 27 | # No yellow because it's hard to see so use brown instead 28 | colors = ['red', 'orange', 'brown', 'green', 'blue', 'purple', 'pink', 'gray', 'black'] 29 | 30 | xs = [] # x values are genomic positions of variants 31 | ys = [] # y values are the sample ID 32 | cs = [] # Color of each point based on its variant type 33 | xline = [] # x-coordinate pairs for merged variants 34 | yline = [] # y-coordinate pairs for merged variants 35 | linecs = [] # Color of each line segment based on which software merged that pair 36 | 37 | 38 | ''' 39 | Each line will contain either a point (x, y) or a line segment (x1, y1, x2, y2). 40 | Both types of lines have the option of adding a color value afterwards. 41 | ''' 42 | with open(sys.argv[1], "r") as f: 43 | for line in f.readlines(): 44 | tokens = line.split() 45 | if len(tokens) == 2: # Point with no color 46 | xs.append(int(tokens[0])) 47 | ys.append(int(tokens[1])) 48 | cs.append(colors[ys[len(ys)-1]]) 49 | elif len(tokens) == 4: # Line segment with no color 50 | xline.append([int(tokens[0]), int(tokens[2])]) 51 | yline.append([int(tokens[1]), int(tokens[3])]) 52 | linecs.append('black') 53 | elif len(tokens) == 3: # Point with color 54 | xs.append(int(tokens[0])) 55 | ys.append(int(tokens[1])) 56 | cs.append(colors[int(tokens[2])]) 57 | elif len(tokens) == 5: # Line segment with color 58 | xline.append([int(tokens[0]), int(tokens[2])]) 59 | yline.append([int(tokens[1]), int(tokens[3])]) 60 | linecs.append(colors[2*int(tokens[4])]) # Double color value so not too similar 61 | 62 | #plt.scatter(xs, ys, c = cs) 63 | #for i in range(0, len(xline)): 64 | # plt.plot(xline[i], yline[i], c = linecs[i]) 65 | 66 | # A window to show a plot with scrolling along the x-axis enabled 67 | class ScrollableWindow(QtWidgets.QMainWindow): 68 | 69 | # Here step is what proportion of x-axis to show at once 70 | def __init__(self, fig, ax, step=0.01): 71 | plt.close("all") 72 | if not QtWidgets.QApplication.instance(): 73 | self.app = QtWidgets.QApplication(sys.argv) 74 | else: 75 | self.app = QtWidgets.QApplication.instance() 76 | 77 | QtWidgets.QMainWindow.__init__(self) 78 | self.widget = QtWidgets.QWidget() 79 | self.setCentralWidget(self.widget) 80 | self.widget.setLayout(QtWidgets.QVBoxLayout()) 81 | self.widget.layout().setContentsMargins(0,0,0,0) 82 | self.widget.layout().setSpacing(0) 83 | 84 | self.fig = fig 85 | self.ax = ax 86 | self.canvas = FigureCanvas(self.fig) 87 | self.canvas.draw() 88 | self.scroll = QtWidgets.QScrollBar(QtCore.Qt.Horizontal) 89 | self.step = step 90 | self.setupSlider() 91 | self.nav = NavigationToolbar(self.canvas, self.widget) 92 | self.widget.layout().addWidget(self.nav) 93 | self.widget.layout().addWidget(self.canvas) 94 | self.widget.layout().addWidget(self.scroll) 95 | 96 | 
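# Render the assembled figure and hand control to the Qt event loop so the window stays open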
self.canvas.draw() 97 | self.show() 98 | self.app.exec_() 99 | 100 | def setupSlider(self): 101 | self.lims = np.array(self.ax.get_xlim()) 102 | self.scroll.setPageStep(int(self.step*100)) 103 | self.scroll.actionTriggered.connect(self.update) 104 | self.update() 105 | 106 | # Update the window limits based on the scrollbar position 107 | def update(self, evt=None): 108 | r = self.scroll.value()/((1+self.step)*100) 109 | l1 = self.lims[0]+r*np.diff(self.lims) 110 | l2 = l1 + np.diff(self.lims)*self.step 111 | self.ax.set_xlim(l1,l2) 112 | self.fig.canvas.draw_idle() 113 | 114 | fig, ax = plt.subplots() 115 | 116 | # Set the x-axis to go from 0 to the last variant position 117 | plt.xlim(0, max(xs)) 118 | 119 | # Plot the variant points 120 | ax.scatter(xs, ys, c = cs) 121 | 122 | # Add axis labels and title 123 | ax.set_ylabel('Sample ID') 124 | ax.set_xlabel('Position (chr1)') 125 | ax.set_yticks(np.arange(0, max(ys)+1)) 126 | ax.set_title('chr1') 127 | 128 | # Plot the line segments 129 | for i in range(0, len(xline)): 130 | if linecs[i] == colors[4]: 131 | ax.plot(xline[i], yline[i], c = linecs[i], linestyle='dotted') 132 | else: 133 | ax.plot(xline[i], yline[i], c = linecs[i]) 134 | custom_lines = [Line2D([0], [0], color=colors[2], lw=4), 135 | Line2D([0], [0], color=colors[4], lw=4), 136 | Line2D([0], [0], color=colors[6], lw=4)] 137 | 138 | # Add legend for merging software colors 139 | legend1 = plt.legend(custom_lines, ['Jasmine', 'SURVIVOR', 'BOTH'], bbox_to_anchor=(.3, 1.05), ncol = 3) 140 | ax.add_artist(legend1) 141 | 142 | # Add legend for variant type colors 143 | patches = [mpatches.Patch(color=colors[0], label='INS'), mpatches.Patch(color=colors[1], label='DEL'), 144 | mpatches.Patch(color=colors[2], label='DUP'), mpatches.Patch(color=colors[3], label='INV')] 145 | legend2 = plt.legend(handles=patches, bbox_to_anchor=(.9, 1.05), ncol=len(patches)) 146 | ax.add_artist(legend2) 147 | 148 | # Generate the plot with a scrolling window 149 | a = ScrollableWindow(fig,ax) 150 | 151 | 152 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | # Script for running Jasmine 2 | if [ "$(uname -s)" = 'Linux' ]; then 3 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 4 | else 5 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 6 | fi 7 | 8 | java -cp $BINDIR/src:$BINDIR/Iris/src Main "${@:1}" 9 | -------------------------------------------------------------------------------- /smalltest.sh: -------------------------------------------------------------------------------- 1 | if [ "$(uname -s)" = 'Linux' ]; then 2 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 3 | else 4 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 5 | fi 6 | 7 | $BINDIR/build_no_iris.sh 8 | $BINDIR/run.sh file_list=$BINDIR/test_data/a.vcf,$BINDIR/test_data/b.vcf out_file=$BINDIR/test_data/merged.vcf --comma_filelist 9 | 10 | myout=$BINDIR/test_data/merged.vcf 11 | correctout=$BINDIR/test_data/c.vcf 12 | 13 | diff -w $myout $correctout >/dev/null;REPLY=$? 
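# REPLY records diff's exit status: 0 means the merged output matches the expected c.vcf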
14 | echo '' 15 | if [ ${REPLY} -eq 0 ] 16 | then 17 | echo '### TEST SUCCEEDED ###' 18 | else 19 | echo '### TEST FAILED ###' 20 | diff -w $myout $correctout 21 | fi 22 | -------------------------------------------------------------------------------- /split_jasmine: -------------------------------------------------------------------------------- 1 | # Script for running Jasmine's Splitting utility script 2 | if [ "$(uname -s)" = 'Linux' ]; then 3 | BINDIR=$(dirname "$(readlink -f "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 4 | else 5 | BINDIR=$(dirname "$(readlink "$0" || echo "$(echo "$0" | sed -e 's,\\,/,g')")") 6 | fi 7 | 8 | java -jar $BINDIR/jasmine_split.jar "${@:1}" 9 | -------------------------------------------------------------------------------- /src/AddGenotypes.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Adds genotype information to a merged VCF file based on the genotypes of the original variants 3 | */ 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.PrintWriter; 7 | import java.util.ArrayList; 8 | import java.util.Arrays; 9 | import java.util.HashMap; 10 | import java.util.HashSet; 11 | import java.util.Scanner; 12 | 13 | public class AddGenotypes { 14 | 15 | public static void main(String[] args) throws Exception 16 | { 17 | if(args.length == 3) 18 | { 19 | String inputFile = args[0]; 20 | String fileList = args[1]; 21 | String outFile = args[2]; 22 | addGenotypes(inputFile, fileList, outFile); 23 | } 24 | else 25 | { 26 | System.out.println("Usage: java AddGenotypes <input_vcf> <vcf_filelist> <out_file>"); 27 | return; 28 | } 29 | } 30 | 31 | /* 32 | * To add other FORMAT fields, add their details here and add the logic to initialize them in reformatVariantFormat 33 | */ 34 | static String[] newFieldNames = {"GT", "IS", "OT", "DV", "DR"}; 35 | static String[] newFieldNums = {"1", "1", "1", "1", "1"}; 36 | static String[] newFieldTypes = {"String", "String", "String", "String", "String"}; 37 | static String[] newFieldDescs = new String[] { 38 | "The genotype of the variant", 39 | "Whether or not the variant call was marked as specific due to high read support and length", 40 | "The original type of the variant", 41 | "The number of reads supporting the variant sequence", 42 | "The number of reads supporting the reference sequence" 43 | }; 44 | 45 | /* 46 | * Adds FORMAT fields, including per-sample genotypes, to the variants in a merged VCF file 47 | */ 48 | static void addGenotypes(String inputFile, String fileList, String outputFile) throws Exception 49 | { 50 | // FORMAT fields of all per-file variant calls 51 | ArrayList<FileFormatField> inputFormats = new ArrayList<FileFormatField>(); 52 | 53 | // The names of the samples present across all input files 54 | ArrayList<String> allSampleNamesList = new ArrayList<String>(); 55 | 56 | ArrayList<String> vcfFiles = PipelineManager.getFilesFromList(fileList); 57 | for(String vcfFile : vcfFiles) 58 | { 59 | FileFormatField fileFormats = new FileFormatField(vcfFile, true); 60 | for(String sampleName : fileFormats.sampleNames) 61 | { 62 | allSampleNamesList.add(inputFormats.size() + "_" + sampleName); 63 | } 64 | inputFormats.add(fileFormats); 65 | } 66 | 67 | // Get the number of samples per file to know how much to skip in samples where a variant is absent 68 | int[] sampleCounts = new int[inputFormats.size()]; 69 | for(int i = 0; i= 9) 112 | { 113 | StringBuilder newLastLine = new StringBuilder(""); 114 | for(int i = 0; i<9; i++) 115 | { 116 | newLastLine.append(lastHeaderLineTokens[i]); 117 | if(i < 8) 118 | { 119 | 
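/* Tab-separate the 9 fixed VCF header columns; the per-sample names are appended with their own tabs below */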
newLastLine.append("\t"); 120 | } 121 | } 122 | for(String sampleName : allSampleNames) 123 | { 124 | newLastLine.append("\t" + sampleName); 125 | } 126 | header.lines.set(header.lines.size() - 1, newLastLine.toString()); 127 | } 128 | 129 | header.print(out); 130 | } 131 | 132 | // This is the per-variant merging and printing logic 133 | VcfEntry entry = new VcfEntry(line); 134 | String suppVec = entry.getInfo("SUPP_VEC"); 135 | if(suppVec.length() == 0) 136 | { 137 | // If there is no support vector field, just leave the entry as-is 138 | out.println(entry); 139 | } 140 | else 141 | { 142 | // The list of format fields of all variants merged into this one 143 | ArrayList<VariantFormatField> toMerge = new ArrayList<VariantFormatField>(); 144 | String[] ids = entry.getInfo("IDLIST").split(","); 145 | for(int i = 0; i list, int[] sampleCounts, String suppVec) 178 | { 179 | int numSamples = 0; 180 | for(int count : sampleCounts) 181 | { 182 | numSamples += count; 183 | } 184 | // Initialize empty format field data structure big enough for all of the samples 185 | VariantFormatField res = new VariantFormatField(numSamples, newFieldNames); 186 | 187 | // Update one field at a time 188 | for(int i = 0; i 0) 207 | { 208 | res.sampleFieldValues[sampleIndex][res.getFieldIndex(fieldName)] = val; 209 | } 210 | else 211 | { 212 | res.sampleFieldValues[sampleIndex][res.getFieldIndex(fieldName)] = "."; 213 | } 214 | } 215 | else 216 | { 217 | // Fill fields with "NA" but use "./." or "0|0" for genotype 218 | String val = "NA"; 219 | if(fieldName.equals("GT")) 220 | { 221 | if(Settings.DEFAULT_ZERO_GENOTYPE) 222 | { 223 | val = "0|0"; 224 | } 225 | else 226 | { 227 | val = "./."; 228 | } 229 | } 230 | res.sampleFieldValues[sampleIndex][res.getFieldIndex(fieldName)] = val; 231 | } 232 | sampleIndex++; 233 | } 234 | if(include) listIndex++; 235 | } 236 | } 237 | 238 | return res; 239 | 240 | } 241 | 242 | /* 243 | * Reformats a variant's format fields to match what we want 244 | */ 245 | static VariantFormatField reformatVariantFormat(VariantFormatField oldVariant, VcfEntry entry) throws Exception 246 | { 247 | int numSamples = oldVariant.numSamples(); 248 | VariantFormatField res = new VariantFormatField(numSamples, newFieldNames); 249 | 250 | for(int i = 0; i 0) 259 | { 260 | res.sampleFieldValues[j][i] = oldGt; 261 | } 262 | else 263 | { 264 | if(Settings.DEFAULT_ZERO_GENOTYPE) 265 | { 266 | res.sampleFieldValues[j][i] = "0|0"; 267 | } 268 | else res.sampleFieldValues[j][i] = "./."; 269 | } 270 | } 271 | else if(field.equals("IS")) 272 | { 273 | if(entry.hasInfoField("IS_SPECIFIC")) 274 | { 275 | res.sampleFieldValues[j][i] = entry.getInfo("IS_SPECIFIC"); 276 | } 277 | else 278 | { 279 | res.sampleFieldValues[j][i] = "."; 280 | } 281 | } 282 | else if(field.equals("OT")) 283 | { 284 | if(entry.hasInfoField("OLDTYPE")) 285 | { 286 | res.sampleFieldValues[j][i] = entry.getInfo("OLDTYPE"); 287 | } 288 | else 289 | { 290 | String type = entry.getType(); 291 | if(type.length() > 0) 292 | { 293 | res.sampleFieldValues[j][i] = type; 294 | } 295 | else 296 | { 297 | res.sampleFieldValues[j][i] = "."; 298 | } 299 | } 300 | } 301 | else if(field.equals("DV")) 302 | { 303 | String oldDv = oldVariant.getValue(j, "DV"); 304 | if(oldDv.length() > 0) 305 | { 306 | res.sampleFieldValues[j][i] = oldDv; 307 | } 308 | else 309 | { 310 | res.sampleFieldValues[j][i] = entry.getReadSupport() + ""; 311 | } 312 | } 313 | else if(field.equals("DR")) 314 | { 315 | String oldDr = oldVariant.getValue(j, "DR"); 316 | if(oldDr.length() > 0) 317 | { 318 | 
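/* Keep the original caller's reference-supporting read count when one was provided */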
res.sampleFieldValues[j][i] = oldDr; 319 | } 320 | else 321 | { 322 | res.sampleFieldValues[j][i] = "."; 323 | } 324 | } 325 | } 326 | } 327 | 328 | return res; 329 | } 330 | 331 | /* 332 | * The values of FORMAT fields for an entire VCF file, including the sample names in the header 333 | */ 334 | static class FileFormatField 335 | { 336 | // FORMAT field names and value for each individual variant 337 | ArrayList variantFormats; 338 | 339 | // Names of samples which are present in the file 340 | String[] sampleNames; 341 | 342 | // Map from variant ID to index in variantFormats for fast lookup of particular variants 343 | HashMap idToVariantIndex; 344 | 345 | // The header of the VCF file 346 | VcfHeader header; 347 | 348 | FileFormatField(String fileName, boolean reformat) throws Exception 349 | { 350 | HashSet ids = new HashSet(); 351 | variantFormats = new ArrayList(); 352 | idToVariantIndex = new HashMap(); 353 | header = new VcfHeader(); 354 | Scanner input = new Scanner(new FileInputStream(new File(fileName))); 355 | boolean extractedSampleNames = false; 356 | while(input.hasNext()) 357 | { 358 | String line = input.nextLine(); 359 | if(line.length() == 0) 360 | { 361 | continue; 362 | } 363 | if(line.startsWith("#")) 364 | { 365 | header.addLine(line); 366 | } 367 | else 368 | { 369 | // If this is the first variant, we finished the header, so get sample names from the last header line 370 | if(!extractedSampleNames) 371 | { 372 | extractedSampleNames = true; 373 | 374 | // Get the list of sample names from the last header line 375 | String lastLine = header.lines.get(header.lines.size() - 1); 376 | String[] tabTokens = lastLine.split("\t"); 377 | 378 | // Check if there are actually sample names in the header 379 | if(tabTokens.length > 9) 380 | { 381 | sampleNames = new String[tabTokens.length - 9]; 382 | for(int i = 0; i 9) 434 | { 435 | sampleNames = new String[tabTokens.length - 9]; 436 | for(int i = 0; i 8) 482 | { 483 | sampleFieldValues = new String[entry.tabTokens.length - 9][]; 484 | String formatString = entry.tabTokens[8]; 485 | fieldNames = formatString.split(":"); 486 | for(int i = 0; i 0) 59 | { 60 | String tmp = first; 61 | first = second; 62 | second = tmp; 63 | } 64 | String id = first + "_" + second; 65 | if(Settings.USE_TYPE) 66 | { 67 | id += "_" + getType(); 68 | } 69 | if(Settings.USE_STRAND) 70 | { 71 | id += "_" + getStrand(); 72 | } 73 | return id; 74 | } 75 | 76 | /* 77 | * The second chromosome can be found in either the CHR2 INFO field or the ALT field 78 | */ 79 | public String getChr2() throws Exception 80 | { 81 | if(hasInfoField("CHR2")) 82 | { 83 | return Settings.CHR_NAME_MAP.normalize(getInfo("CHR2")); 84 | } 85 | if(altTokens.length == 1) 86 | { 87 | return getChromosome(); 88 | } 89 | String chrPosToken = altTokens[1]; 90 | return Settings.CHR_NAME_MAP.normalize(chrPosToken.substring(0, chrPosToken.lastIndexOf(':'))); 91 | } 92 | 93 | /* 94 | * The strands may need to be inferred from the ALT square bracket format 95 | */ 96 | public String getStrand() throws Exception 97 | { 98 | String res = getInfo("STRANDS"); 99 | if(res.length() == 0) 100 | { 101 | return strandsFromAltFormat(); 102 | } 103 | return res; 104 | } 105 | 106 | /* 107 | * Determine the strands from the ALT square bracket format 108 | */ 109 | public String strandsFromAltFormat() 110 | { 111 | String alt = getAlt(); 112 | if(alt.startsWith("[")) 113 | { 114 | return "+-"; 115 | } 116 | else if(alt.startsWith("]")) 117 | { 118 | return "--"; 119 | } 120 | else 
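/* Otherwise the bracket follows the local sequence (the t[p[ or t]p] breakend forms) */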
if(alt.contains("[")) 121 | { 122 | return "++"; 123 | } 124 | else if(alt.contains("]")) 125 | { 126 | return "-+"; 127 | } 128 | return ""; 129 | } 130 | 131 | /* 132 | * Gets the first coordinate of the variant 133 | */ 134 | public double getFirstCoord() throws Exception 135 | { 136 | String chr = getChromosome(), chr2 = getChr2(); 137 | if(chr.compareTo(chr2) > 0 || (chr.equals(chr2) && getPos() > getEnd())) 138 | { 139 | if(hasInfoField("AVG_END")) 140 | { 141 | return Double.parseDouble(getInfo("AVG_END")); 142 | } 143 | return getEnd(); 144 | } 145 | if(hasInfoField("AVG_START")) 146 | { 147 | return Double.parseDouble(getInfo("AVG_START")); 148 | } 149 | return getPos(); 150 | } 151 | 152 | /* 153 | * Since length is undefined, get the second coord instead 154 | */ 155 | public double getSecondCoord() throws Exception 156 | { 157 | String chr = getChromosome(), chr2 = getChr2(); 158 | if(chr.compareTo(chr2) > 0 || (chr.equals(chr2) && getPos() > getEnd())) 159 | { 160 | if(hasInfoField("AVG_START")) 161 | { 162 | return Double.parseDouble(getInfo("AVG_START")); 163 | } 164 | return getPos(); 165 | } 166 | if(hasInfoField("AVG_END")) 167 | { 168 | return Double.parseDouble(getInfo("AVG_END")); 169 | } 170 | return getEnd(); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/ChrNameNormalization.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Map for normalizing chromosome names 3 | */ 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.util.HashMap; 7 | import java.util.Scanner; 8 | 9 | public class ChrNameNormalization 10 | { 11 | HashMap normMap = new HashMap(); 12 | ChrNameNormalization() throws Exception 13 | { 14 | normMap = new HashMap(); 15 | if(Settings.DEFAULT_CHR_NORM) 16 | { 17 | // Remove "chr" from chromosome names 18 | for(int i = 1; i<=22; i++) 19 | { 20 | normMap.put("chr" + i, i + ""); 21 | } 22 | normMap.put("chrX", "X"); 23 | normMap.put("chrY", "Y"); 24 | normMap.put("chrM", "MT"); 25 | } 26 | else if(Settings.CHR_NORM_FILE.length() > 0) 27 | { 28 | // Read in chromosome name map 29 | Scanner input = new Scanner(new FileInputStream(new File(Settings.CHR_NORM_FILE))); 30 | while(input.hasNext()) 31 | { 32 | String line = input.nextLine(); 33 | if(line.length() == 0) 34 | { 35 | continue; 36 | } 37 | String[] tokens = line.split(" "); 38 | for(int i = 1; i entries = new ArrayList(); 46 | 47 | int countDup = 0; 48 | 49 | while(input.hasNext()) 50 | { 51 | String line = input.nextLine(); 52 | if(line.startsWith("#")) 53 | { 54 | header.addLine(line); 55 | } 56 | else if(line.length() > 0) 57 | { 58 | VcfEntry ve = new VcfEntry(line); 59 | if(ve.getType().equals("DUP") && ve.getLength() < Settings.MAX_DUP_LEN) 60 | { 61 | countDup++; 62 | 63 | long start = ve.getPos(), end = Long.parseLong(ve.getInfo("END")); 64 | int length = ve.getLength(); 65 | if(end <= start) 66 | { 67 | System.err.printf("Duplication with ID %s has end (%d) <= start (%d), so was not converted\n", 68 | ve.getId(), start, end); 69 | entries.add(ve); 70 | continue; 71 | } 72 | long nstart = start + length - 1, nend = nstart; 73 | 74 | if(ve.getAlt().equals("")) 75 | { 76 | String seq = gq.genomeSubstring(ve.getChromosome(), start, end-1); 77 | 78 | if(length < 100000) 79 | { 80 | ve.setRef(seq.charAt(seq.length()-1)+""); 81 | ve.setAlt(seq.charAt(seq.length()-1)+seq); 82 | } 83 | else 84 | { 85 | ve.setRef("."); 86 | ve.setAlt(""); 87 | } 88 | ve.setInfo("END", 
nend+""); 89 | ve.setInfo("STRANDS", "+-"); 90 | ve.setPos(nstart); 91 | ve.setInfo("OLDTYPE", "DUP"); 92 | } 93 | else 94 | { 95 | ve.setInfo("OLDTYPE", "DUP"); 96 | } 97 | ve.setType("INS"); 98 | 99 | } 100 | else 101 | { 102 | ve.setInfo("OLDTYPE", ve.getType()); 103 | } 104 | entries.add(ve); 105 | } 106 | } 107 | 108 | System.out.println("Number of duplications converted to insertions: " + countDup + " out of " + entries.size() + " total variants"); 109 | 110 | header.addInfoField("OLDTYPE", "1", "String", ""); 111 | header.addInfoField("STRANDS", "1", "String", ""); 112 | header.print(out); 113 | 114 | for(VcfEntry ve : entries) 115 | { 116 | out.println(ve); 117 | } 118 | 119 | input.close(); 120 | out.close(); 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/Forest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * A representation of a forest using a union-find data structure 3 | * It allows nodes to be merged, checking if their components share 4 | * any variants from the same sample. For now, only up to 64 samples 5 | * are supported, but sampleMask can be replaced with actual bitsets 6 | * at a small cost to runtime. 7 | */ 8 | 9 | import java.util.Arrays; 10 | 11 | public class Forest 12 | { 13 | int[] map; // map[i] is negative if root, more negative means bigger set; if nonnegative, then it indicates the parent 14 | long[][] sampleMask; // For each root node, a bitmask of which samples are present in its component 15 | static int samplesPerMask = 63; 16 | 17 | public Forest(Variant[] data) 18 | { 19 | int n = data.length; 20 | int maxSample = 0; 21 | for(int i = 0; i 1 means the command failed, usually because samtools is not installed or on path 36 | if(seqExit > 1) 37 | { 38 | throw new Exception("samtools produced bad exit code (" + seqExit + ") - check path: " + Settings.SAMTOOLS_PATH); 39 | } 40 | } 41 | 42 | /* 43 | * Queries a genomic substring - runs samtools faidx chr:startPos-endPos 44 | */ 45 | String genomeSubstring(String chr, long startPos, long endPos) throws Exception 46 | { 47 | if(startPos > endPos) 48 | { 49 | return ""; 50 | } 51 | String faidxCommand = String.format("%s faidx %s %s:%d-%d", Settings.SAMTOOLS_PATH, filename, chr, startPos, endPos); 52 | Process child = Runtime.getRuntime().exec(faidxCommand); 53 | InputStream seqStream = child.getInputStream(); 54 | Scanner seqInput = new Scanner(seqStream); 55 | 56 | // Make sure it produced an actual output 57 | if(!seqInput.hasNext()) 58 | { 59 | seqInput.close(); 60 | throw new Exception("samtools faidx did not produce an output: " + faidxCommand); 61 | } 62 | // Read in and ignore sequence name 63 | seqInput.next(); 64 | 65 | // Make sure there's a sequence 66 | if(!seqInput.hasNext()) 67 | { 68 | seqInput.close(); 69 | throw new Exception("samtools faidx produced a sequence name but not an actual sequence: " + faidxCommand); 70 | } 71 | 72 | // Concatenate all lines of the output sequence 73 | StringBuilder res = new StringBuilder(""); 74 | while(seqInput.hasNext()) 75 | { 76 | res.append(seqInput.next()); 77 | } 78 | seqInput.close(); 79 | 80 | return res.toString(); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/IgvScreenshotMaker.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Script for visualizing all variants in a merged VCF file 3 | */ 4 | import java.io.File; 5 | import 
java.io.FileInputStream; 6 | import java.io.PrintWriter; 7 | import java.nio.file.Path; 8 | import java.nio.file.Paths; 9 | import java.util.ArrayList; 10 | import java.util.HashMap; 11 | import java.util.HashSet; 12 | import java.util.Scanner; 13 | 14 | public class IgvScreenshotMaker { 15 | 16 | static String vcfFn = ""; 17 | static String bedFn = ""; 18 | static String bamFilelist = ""; 19 | static String vcfFilelist = ""; 20 | static String genomeFn = ""; 21 | 22 | static int PADDING = 100; 23 | 24 | static String outPrefix = ""; 25 | 26 | static boolean SQUISH = false; 27 | static boolean SVG = false; 28 | static boolean PRECISE = false; 29 | 30 | static HashMap infoFilters; 31 | static HashSet grepFilters; 32 | 33 | static void parseArgs(String[] args) 34 | { 35 | infoFilters = new HashMap(); 36 | grepFilters = new HashSet(); 37 | 38 | for(String arg : args) 39 | { 40 | int equalsIdx = arg.indexOf('='); 41 | if(equalsIdx == -1) 42 | { 43 | if(arg.toLowerCase().endsWith("squish")) 44 | { 45 | SQUISH = true; 46 | } 47 | else if(arg.toLowerCase().endsWith("svg")) 48 | { 49 | SVG = true; 50 | } 51 | else if(arg.toLowerCase().endsWith("normalize_chr_names")) 52 | { 53 | Settings.DEFAULT_CHR_NORM = true; 54 | } 55 | else if(arg.toLowerCase().endsWith("precise")) 56 | { 57 | PRECISE = true; 58 | } 59 | else if(arg.toLowerCase().endsWith("specific")) 60 | { 61 | infoFilters.put("IS_SPECIFIC", "1"); 62 | } 63 | } 64 | else 65 | { 66 | String key = arg.substring(0, equalsIdx); 67 | String val = arg.substring(1 + equalsIdx); 68 | if(key.equalsIgnoreCase("vcf_file")) 69 | { 70 | vcfFn = val; 71 | } 72 | else if(key.equalsIgnoreCase("bed_file")) 73 | { 74 | bedFn = val; 75 | } 76 | else if(key.equalsIgnoreCase("genome_file")) 77 | { 78 | genomeFn = val; 79 | } 80 | else if(key.equalsIgnoreCase("bam_filelist")) 81 | { 82 | bamFilelist = val; 83 | } 84 | else if(key.equalsIgnoreCase("vcf_filelist")) 85 | { 86 | vcfFilelist = val; 87 | } 88 | else if(key.equalsIgnoreCase("out_prefix")) 89 | { 90 | outPrefix = val; 91 | } 92 | else if(key.equalsIgnoreCase("info_filter")) 93 | { 94 | String[] tokens = val.split(","); 95 | infoFilters.put(tokens[0], tokens[1]); 96 | } 97 | else if(key.equalsIgnoreCase("grep_filter")) 98 | { 99 | grepFilters.add(val); 100 | } 101 | else if(key.equalsIgnoreCase("padding")) 102 | { 103 | PADDING = Integer.parseInt(val); 104 | } 105 | } 106 | } 107 | 108 | if((vcfFn.length() == 0 && bedFn.length() == 0) || genomeFn.length() == 0 || bamFilelist.length() == 0 || outPrefix.length() == 0) 109 | { 110 | usage(); 111 | System.exit(0); 112 | } 113 | } 114 | 115 | /* 116 | * Print the usage menu 117 | */ 118 | static void usage() 119 | { 120 | System.out.println(); 121 | System.out.println("Jasmine IGV Screenshot Maker"); 122 | System.out.println("Usage: igv_jasmine [args]"); 123 | System.out.println(" Example: igv_jasmine vcf_file=merged.vcf genome_file=genome.fa" 124 | + " bam_filelist=bams.txt out_prefix=igv"); 125 | System.out.println(); 126 | System.out.println("Required args:"); 127 | System.out.println(" vcf_file (String) - the VCF file with merged SVs"); 128 | System.out.println(" genome_file (String) - the FASTA file with the reference genome"); 129 | System.out.println(" bam_filelist (String) - a comma-separated list of BAM files"); 130 | System.out.println(" out_prefix (String) - the prefix of the output directory and filenames"); 131 | System.out.println(); 132 | System.out.println("Optional args:"); 133 | System.out.println(" info_filter=KEY,VALUE - filter by an INFO 
field value (multiple allowed) e.g., info_filter=SUPP_VEC,101"); 134 | System.out.println(" grep_filter=QUERY - filter to only lines containing a given QUERY"); 135 | System.out.println(" vcf_filelist (String) - the txt file with a list of input VCFs in the same order as BAM files"); 136 | System.out.println(" bed_file (String) - a bed file with a list of ranges (use instead of vcf_file)"); 137 | System.out.println(" --precise - require variant to contain \"PRECISE\" as an INFO field"); 138 | System.out.println(" --specific - shorthand for info_filter=IS_SPECIFIC,1"); 139 | System.out.println(" --squish - squishes tracks to fit more reads"); 140 | System.out.println(" --svg - save as an SVG instead of a PNG"); 141 | System.out.println(" --normalize_chr_names - normalize the VCF chromosome names to strip \"chr\""); 142 | System.out.println(); 143 | } 144 | 145 | public static void main(String[] args) throws Exception 146 | { 147 | Settings.CHR_NAME_MAP = new ChrNameNormalization(); 148 | 149 | parseArgs(args); 150 | 151 | Path currentRelativePath = Paths.get(""); 152 | String outDir = currentRelativePath.toAbsolutePath().toString() + "/" + outPrefix; 153 | File outDirFile = new File(outDir); 154 | if(outDirFile.isDirectory()) 155 | { 156 | final File[] files = outDirFile.listFiles(); 157 | for (File f: files) f.delete(); 158 | outDirFile.delete(); 159 | } 160 | outDirFile.mkdir(); 161 | String ofn = outDir + "/" + outPrefix + ".bat"; 162 | 163 | PrintWriter out = new PrintWriter(new File(ofn)); 164 | 165 | out.println("new"); 166 | out.println("genome " + (genomeFn.startsWith("/") ? 167 | genomeFn : (currentRelativePath.toAbsolutePath().toString() + "/" + genomeFn))); 168 | ArrayList bamFiles = PipelineManager.getFilesFromList(bamFilelist); 169 | ArrayList vcfFiles = new ArrayList(); 170 | if(vcfFilelist.length() > 0) 171 | { 172 | vcfFiles = PipelineManager.getFilesFromList(vcfFilelist); 173 | } 174 | ArrayList bedFiles = new ArrayList(); 175 | for(int i = 0; i 0) 181 | { 182 | String fn = currentRelativePath.toAbsolutePath().toString() + "/" + StringUtils.fileBaseName(bamFn); 183 | fn = fn.substring(0, fn.length() - 4) + ".bed"; 184 | out.println("load " + fn); 185 | bedFiles.add(fn); 186 | PrintWriter curOut = new PrintWriter(new File(fn)); 187 | Scanner curInput = new Scanner(new FileInputStream(new File(vcfFiles.get(i)))); 188 | while(curInput.hasNext()) 189 | { 190 | String line = curInput.nextLine(); 191 | if(line.length() == 0 || line.startsWith("#")) 192 | { 193 | continue; 194 | } 195 | VcfEntry entry = VcfEntry.fromLine(line); 196 | String chr = entry.getChromosome(); 197 | int start = (int)entry.getPos(); 198 | int end = (int)entry.getEnd(); 199 | String id = entry.getId(); 200 | String type = entry.getNormalizedType(); 201 | if(type.equalsIgnoreCase("TRA")) 202 | { 203 | String chr2 = entry.getChr2(); 204 | curOut.printf("%s\t%d\t%d\t%s_%s\n", chr, start, start+1, id, type); 205 | curOut.printf("%s\t%d\t%d\t%s_%s\n", chr2, end, end+1, id, type); 206 | } 207 | else 208 | { 209 | if(end - start <= 100000) 210 | { 211 | curOut.printf("%s\t%d\t%d\t%s_%s\n", chr, start, end+1, id, type); 212 | } 213 | } 214 | } 215 | curInput.close(); 216 | curOut.close(); 217 | } 218 | 219 | } 220 | out.println("snapshotDirectory " + outDir); 221 | 222 | if(vcfFn.length() > 0) 223 | { 224 | Scanner input = new Scanner(new FileInputStream(new File(vcfFn))); 225 | while(input.hasNext()) 226 | { 227 | String line = input.nextLine(); 228 | if(line.length() == 0 || line.startsWith("#")) 229 | { 230 | 
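/* Skip blank lines and VCF header lines */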
continue; 231 | } 232 | VcfEntry entry = new VcfEntry(line); 233 | 234 | // Check that the entry passes grep and INFO filters 235 | boolean passesFilters = true; 236 | 237 | for(String s : grepFilters) 238 | { 239 | if(!line.contains(s)) 240 | { 241 | passesFilters = false; 242 | } 243 | } 244 | for(String s : infoFilters.keySet()) 245 | { 246 | if(!entry.hasInfoField(s) || !entry.getInfo(s).equals(infoFilters.get(s))) 247 | { 248 | passesFilters = false; 249 | } 250 | } 251 | 252 | if(PRECISE && !entry.tabTokens[7].startsWith("PRECISE;") && !entry.tabTokens[7].contains(";PRECISE;")) 253 | { 254 | passesFilters = false; 255 | } 256 | 257 | if(!passesFilters) 258 | { 259 | continue; 260 | } 261 | 262 | long start = entry.getPos() - PADDING; 263 | long end = entry.getEnd() + PADDING; 264 | 265 | // Avoid giving non-positive coords 266 | start = Math.max(start, 1); 267 | end = Math.max(end, 1); 268 | 269 | // Make sure entire insertion is covered 270 | if(entry.getNormalizedType().equals("INS")) 271 | { 272 | end = entry.getPos() + entry.getLength() + PADDING; 273 | } 274 | 275 | if(end > start + 100000) 276 | { 277 | continue; 278 | } 279 | 280 | String chr = entry.getChromosome(); 281 | 282 | out.println("goto " + chr + ":" + start + "-" + end); 283 | out.println("sort position"); 284 | if(SQUISH) 285 | { 286 | for(String bamFile : bamFiles) 287 | { 288 | out.println("squish " + bamFile); 289 | } 290 | } 291 | else 292 | { 293 | for(String bamFile : bamFiles) 294 | { 295 | out.println("collapse " + bamFile); 296 | } 297 | } 298 | for(String bedFile : bedFiles) 299 | { 300 | out.println("expand " + bedFile); 301 | } 302 | out.println("snapshot " + entry.getId() + ".png"); 303 | } 304 | 305 | input.close(); 306 | } 307 | 308 | else if(bedFn.length() > 0) 309 | { 310 | Scanner input = new Scanner(new FileInputStream(new File(bedFn))); 311 | while(input.hasNext()) 312 | { 313 | String line = input.nextLine(); 314 | if(line.length() == 0) 315 | { 316 | continue; 317 | } 318 | 319 | String[] tokens = line.split("\t"); 320 | 321 | long start = Long.parseLong(tokens[1]); 322 | long end = Long.parseLong(tokens[2]); 323 | 324 | String chr = tokens[0]; 325 | 326 | out.println("goto " + chr + ":" + start + "-" + end); 327 | out.println("sort position"); 328 | if(SQUISH) 329 | { 330 | for(String bamFile : bamFiles) 331 | { 332 | out.println("squish " + bamFile); 333 | } 334 | } 335 | else 336 | { 337 | for(String bamFile : bamFiles) 338 | { 339 | out.println("collapse " + bamFile); 340 | } 341 | } 342 | out.println("snapshot " + tokens[3] + ".png"); 343 | } 344 | 345 | input.close(); 346 | } 347 | 348 | out.println("exit"); 349 | 350 | out.close(); 351 | } 352 | } 353 | -------------------------------------------------------------------------------- /src/InsertionsToDuplications.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Converts insertions which were originally duplications back to their original SV calls 3 | * Usage: java InsertionsToDuplications input_vcf output_vcf 4 | */ 5 | 6 | import java.io.File; 7 | import java.io.FileInputStream; 8 | import java.io.PrintWriter; 9 | import java.util.Scanner; 10 | 11 | public class InsertionsToDuplications { 12 | static String inputFile = ""; 13 | static String outputFile = ""; 14 | public static void main(String[] args) throws Exception 15 | { 16 | if(args.length != 2) 17 | { 18 | System.out.println("Usage: java InsertionsToDuplications input_vcf output_vcf"); 19 | return; 20 | } 21 | else 22 | { 23 | 
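/* Positional arguments: the input VCF followed by the output VCF */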
inputFile = args[0]; 24 | outputFile = args[1]; 25 | convertFile(inputFile, outputFile); 26 | } 27 | } 28 | 29 | /* 30 | * Convert any insertions which have OLDTYPE marked as DUP back to duplications 31 | */ 32 | static void convertFile(String inputFile, String outputFile) throws Exception 33 | { 34 | Scanner input = new Scanner(new FileInputStream(new File(inputFile))); 35 | 36 | PrintWriter out = new PrintWriter(new File(outputFile)); 37 | 38 | VcfHeader header = new VcfHeader(); 39 | 40 | int countDup = 0; 41 | 42 | boolean headerPrinted = false; 43 | int totalEntries = 0; 44 | 45 | while(input.hasNext()) 46 | { 47 | String line = input.nextLine(); 48 | if(line.startsWith("#")) 49 | { 50 | header.addLine(line); 51 | continue; 52 | } 53 | 54 | if(!headerPrinted) 55 | { 56 | header.addInfoField("REFINEDALT", "1", "String", "For duplications which were changed to insertions and refined, the refined ALT sequence"); 57 | header.addInfoField("STRANDS", "1", "String", ""); 58 | header.print(out); 59 | headerPrinted = true; 60 | } 61 | 62 | VcfEntry ve = new VcfEntry(line); 63 | 64 | totalEntries++; 65 | 66 | if(line.contains("OLDTYPE=DUP") && ve.getType().equals("INS")) 67 | { 68 | countDup++; 69 | 70 | long start = ve.getPos(); 71 | int length = ve.getLength(); 72 | long nstart = start - length + 1, nend = nstart + length; 73 | String refinedAlt = ve.getAlt(); 74 | ve.setPos(nstart); 75 | ve.setInfo("END", nend+""); 76 | ve.setType("DUP"); 77 | ve.setInfo("REFINEDALT", refinedAlt); 78 | ve.setInfo("STRANDS", "-+"); 79 | ve.setRef("."); 80 | ve.setAlt(""); 81 | out.println(ve); 82 | } 83 | else 84 | { 85 | ve.setInfo("REFINEDALT", "."); 86 | out.println(ve); 87 | } 88 | } 89 | 90 | System.out.println("Number of insertions converted back to duplications: " + countDup + " out of " + totalEntries + " total variants"); 91 | 92 | input.close(); 93 | out.close(); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/KDTree.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Data structure for fast k-nearest neighbor queries in variant sets 3 | * For a given query, the k closest points to it in the dataset will be reported, 4 | * breaking ties by variant ID to ensure deterministic behavior. 5 | * 6 | * We assume variants are 2-D points; nearness is based on Euclidean distance or its generalizations. 7 | * 8 | * Uses algorithm described here: 9 | * https://courses.cs.washington.edu/courses/cse599c1/13wi/slides/lsh-hashkernels-annotated.pdf 10 | */ 11 | 12 | import java.util.ArrayDeque; 13 | import java.util.ArrayList; 14 | import java.util.LinkedList; 15 | import java.util.PriorityQueue; 16 | import java.util.Stack; 17 | 18 | public class KDTree 19 | { 20 | Node root; 21 | Node search; 22 | PriorityQueue best; 23 | int cnt; 24 | int querySize; 25 | int K; 26 | 27 | int n; 28 | 29 | /* 30 | * Initializes a KD-tree from a list of variants 31 | */ 32 | public KDTree(Variant[] p) 33 | { 34 | n = p.length; 35 | K = 2; 36 | LinkedList list = new LinkedList(); 37 | for (Variant q : p) list.add(new Node(q)); 38 | root = build(list, 0);//buildNonrecursive(list, 0).get(0); 39 | } 40 | 41 | public KDTree(Variant[] p, boolean recursive) 42 | { 43 | n = p.length; 44 | K = 2; 45 | LinkedList list = new LinkedList(); 46 | for (Variant q : p) list.add(new Node(q)); 47 | root = recursive ? 
build(list, 0) : buildNonrecursive(list).get(0); 48 | } 49 | 50 | private Node build(LinkedList p, int depth) 51 | { 52 | if (p.size() == 0) return null; 53 | Node pivot = p.remove(); 54 | 55 | // Sort the points into left and right subtrees based on current split dimension 56 | LinkedList left = new LinkedList(); 57 | LinkedList right = new LinkedList(); 58 | while (!p.isEmpty()) 59 | { 60 | if (p.peek().planes[depth % K] < pivot.planes[depth % K]) 61 | left.add(p.remove()); 62 | else 63 | right.add(p.remove()); 64 | } 65 | pivot.children[0] = build(left, depth + 1); 66 | pivot.children[1] = build(right, depth + 1); 67 | 68 | return pivot; 69 | } 70 | 71 | /* 72 | * Build the data structure from a list of points without recursion 73 | * This avoids stack overflow issues caused by larger datasets 74 | */ 75 | private ArrayList buildNonrecursive(LinkedList p) 76 | { 77 | ArrayList nodeList = new ArrayList(); 78 | if(p.size() == 0) 79 | { 80 | return null; 81 | } 82 | 83 | // The stack of node lists to process (in place of recursive calls) 84 | ArrayDeque> toProcess = new ArrayDeque>(); 85 | ArrayDeque parents = new ArrayDeque(); 86 | ArrayDeque depths = new ArrayDeque(); 87 | ArrayDeque parentsides = new ArrayDeque(); 88 | 89 | // Initialize root to null to be filled 90 | //nodeList.add(res); 91 | toProcess.addFirst(p); 92 | parents.addFirst(-1); // This is not actually the parent of the root, but it will be ignored anyways 93 | depths.addFirst(0); 94 | parentsides.addFirst(-1); 95 | 96 | while(!toProcess.isEmpty()) 97 | { 98 | // Get information for processing this node from stacks 99 | LinkedList pcur = toProcess.pollFirst(); 100 | int parentcur = parents.pollFirst(); 101 | int depthcur = depths.pollFirst(); 102 | int parentsidecur = parentsides.pollFirst(); 103 | 104 | // Get pivot as the first point in the list 105 | Node pivot = pcur.remove(); 106 | 107 | // Separate this point into points left of the pivot vs. right of the pivot 108 | LinkedList left = new LinkedList(); 109 | LinkedList right = new LinkedList(); 110 | while (!pcur.isEmpty()) 111 | { 112 | Node check = pcur.pollFirst(); 113 | if (check.planes[depthcur % K] < pivot.planes[depthcur % K]) 114 | left.add(check); 115 | else 116 | right.add(check); 117 | } 118 | 119 | //pcur.clear(); 120 | 121 | // Update this node's parent's child-pointer to this node. 
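/* (the root was pushed with parent side -1, so it has no parent pointer to set) */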
122 | if(parentsidecur != -1) 123 | { 124 | nodeList.get(parentcur).children[parentsidecur] = pivot; 125 | } 126 | 127 | pivot.children[0] = null; 128 | pivot.children[1] = null; 129 | nodeList.add(pivot); 130 | 131 | // Add right child to processing stack 132 | if(right.size() > 0) 133 | { 134 | toProcess.addFirst(right); 135 | parents.addFirst(nodeList.size() - 1); 136 | parentsides.addFirst(1); 137 | depths.addFirst(depthcur + 1); 138 | } 139 | 140 | // Add left child to processing stack 141 | if(left.size() > 0) 142 | { 143 | toProcess.addFirst(left); 144 | parents.addFirst(nodeList.size() - 1); 145 | parentsides.addFirst(0); 146 | depths.addFirst(depthcur + 1); 147 | } 148 | } 149 | 150 | return nodeList; 151 | } 152 | 153 | /* 154 | * Used to make sure two KD-trees are the same 155 | */ 156 | static boolean compare(String pref, Node a, Node b) 157 | { 158 | if(a == null && b != null) 159 | { 160 | System.out.println(pref + " only a is null"); 161 | return true; 162 | } 163 | if(b == null && a != null) 164 | { 165 | System.out.println(pref + " only b is null"); 166 | return true; 167 | } 168 | if(a == null && b == null) 169 | { 170 | return false; 171 | } 172 | if(a.planes[0] != b.planes[0] || a.planes[1] != b.planes[1]) 173 | { 174 | System.out.println(pref + " diff value: " + a.planes[0] + " " + a.planes[1] + " " + b.planes[0] + " " + b.planes[1]); 175 | return true; 176 | } 177 | 178 | boolean leftDiff = compare(pref + "L", a.children[0], b.children[0]); 179 | if(leftDiff) 180 | { 181 | return true; 182 | } 183 | else 184 | { 185 | return compare(pref + "R", a.children[1], b.children[1]); 186 | } 187 | 188 | } 189 | 190 | /* 191 | * Gets the k nearest neighbors for a query variant 192 | */ 193 | public Variant[] kNearestNeighbor(Variant p, int k) { 194 | search = new Node(p); 195 | best = new PriorityQueue(); 196 | querySize = k; 197 | search(root, 0); 198 | Variant[] res = new Variant[best.size()]; 199 | int idx = res.length - 1; 200 | while(!best.isEmpty()) 201 | { 202 | res[idx--] = best.poll().v; 203 | } 204 | return res; 205 | } 206 | 207 | /* 208 | * Search the subtree rooted at cur for candidate points in the set of query's k-nearest neighbors 209 | */ 210 | private void search(Node ocur, int odepth) { 211 | Stack curs = new Stack(); 212 | Stack depths = new Stack(); 213 | Stack processedBest = new Stack(); 214 | curs.add(ocur); 215 | depths.add(odepth); 216 | processedBest.push(false); 217 | while(!curs.isEmpty()) 218 | { 219 | Node cur = curs.pop(); 220 | int depth = depths.pop(); 221 | boolean bestDone = processedBest.pop(); 222 | 223 | if(cur == null) continue; 224 | 225 | int betterChild = (int) Math.signum(search.planes[depth % K] - cur.planes[depth % K]) < 0 ? 
0 : 1; 226 | 227 | if(!bestDone) 228 | { 229 | curs.add(cur); 230 | depths.add(depth); 231 | processedBest.add(true); 232 | curs.add(cur.children[betterChild]); 233 | depths.add(depth+1); 234 | processedBest.add(false); 235 | continue; 236 | } 237 | Candidate toAdd = new Candidate(cur.p, cur.p.distance(search.p)); 238 | if (best == null || best.size() < querySize || toAdd.compareTo(best.peek()) > 0) 239 | { 240 | if(best.size() == querySize) 241 | { 242 | best.poll(); 243 | } 244 | best.add(toAdd); 245 | } 246 | if (best.size() < querySize || Math.abs(search.planes[depth % K] - cur.planes[depth % K]) < best.peek().dist) 247 | { 248 | curs.add(cur.children[1 - betterChild]); 249 | depths.add(depth+1); 250 | processedBest.add(false); 251 | } 252 | } 253 | } 254 | 255 | /* 256 | * A node of the KD tree 257 | * Each node has a variant, storing alongside it its values along the split planes, as well as two (possibly null) children 258 | */ 259 | private class Node { 260 | Node[] children; 261 | Variant p; 262 | double[] planes; 263 | public Node(Variant pp) 264 | { 265 | p = pp; 266 | planes = new double[K]; 267 | planes[0] = p.start; 268 | planes[1] = p.end; // add additional dimensions as necessary 269 | children = new Node[2]; 270 | children[0] = null; 271 | children[1] = null; 272 | } 273 | } 274 | 275 | /* 276 | * Candidate k-nearest neighbor of the current query point 277 | */ 278 | private static class Candidate implements Comparable 279 | { 280 | Variant v; 281 | double dist; 282 | Candidate(Variant v, double dist) 283 | { 284 | this.v = v; 285 | this.dist = dist; 286 | } 287 | public int compareTo(Candidate o) 288 | { 289 | if(Math.abs(dist - o.dist) > 1e-9) return Double.compare(o.dist, dist); 290 | if(v.hash != o.v.hash) return o.v.hash - v.hash; 291 | return o.v.id.compareTo(v.id); 292 | 293 | } 294 | } 295 | } 296 | -------------------------------------------------------------------------------- /src/Main.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Main interface for Jasmine 3 | */ 4 | import java.io.File; 5 | import java.util.ArrayList; 6 | import java.util.TreeMap; 7 | 8 | public class Main { 9 | public static void main(String[] args) throws Exception 10 | { 11 | Settings.parseArgs(args); 12 | 13 | // The input file to SV merging may change based on the steps the user wants to run 14 | String currentInputFile = Settings.FILE_LIST; 15 | 16 | if(Settings.USING_FILE_LIST) 17 | { 18 | File f = new File(currentInputFile); 19 | if(!f.exists()) 20 | { 21 | System.out.println("Warning: Input file list " + currentInputFile + " does not exist."); 22 | } 23 | } 24 | 25 | if(!Settings.POSTPROCESS_ONLY) 26 | { 27 | currentInputFile = preprocess(currentInputFile); 28 | } 29 | 30 | if(!Settings.PREPROCESS_ONLY && !Settings.POSTPROCESS_ONLY) 31 | { 32 | runJasmine(currentInputFile); 33 | } 34 | 35 | if(!Settings.PREPROCESS_ONLY) 36 | { 37 | postprocess(currentInputFile); 38 | } 39 | } 40 | static String preprocess(String currentInputFile) throws Exception 41 | { 42 | // Convert the duplications to insertions if the user wants to 43 | if(Settings.CONVERT_DUPLICATIONS) 44 | { 45 | currentInputFile = PipelineManager.convertDuplicationsToInsertions(currentInputFile); 46 | } 47 | 48 | // Mark calls with strong read support and long length as specific calls 49 | if(Settings.MARK_SPECIFIC) 50 | { 51 | currentInputFile = PipelineManager.markSpecificCalls(currentInputFile); 52 | } 53 | 54 | // Run iris if the user specifies that they want to run it 
55 | if(Settings.RUN_IRIS) 56 | { 57 | currentInputFile = PipelineManager.runIris(currentInputFile); 58 | } 59 | 60 | // Normalize the SV types if the user specifies that they want to 61 | if(Settings.PRE_NORMALIZE) 62 | { 63 | currentInputFile = PipelineManager.normalizeTypes(currentInputFile); 64 | } 65 | 66 | return currentInputFile; 67 | } 68 | static void runJasmine(String currentInputFile) throws Exception 69 | { 70 | // Get the variants and bin them into individual graphs 71 | TreeMap<String, ArrayList<Variant>> allVariants = VariantInput.readAllFiles(currentInputFile); 72 | 73 | // Initialize data structure for outputting merged variants 74 | VariantOutput output = new VariantOutput(); 75 | 76 | // Get the number of samples to know the length of the SUPP_VEC field 77 | int sampleCount = VariantInput.countFiles(currentInputFile); 78 | 79 | // Merge each graph in parallel 80 | ParallelMerger pm = new ParallelMerger(allVariants, output, sampleCount); 81 | pm.run(); 82 | 83 | System.out.println("Merging complete - outputting results"); 84 | 85 | // Print the merged variants to a file if they have enough support 86 | output.writeMergedVariants(currentInputFile, Settings.OUT_FILE); 87 | 88 | System.out.println("Number of sets with multiple variants: " + pm.totalMerged.get()); 89 | } 90 | static void postprocess(String currentInputFile) throws Exception 91 | { 92 | // Convert insertions back to duplications as needed 93 | if(Settings.CONVERT_DUPLICATIONS) 94 | { 95 | PipelineManager.convertInsertionsBackToDuplications(); 96 | } 97 | 98 | // Add genotypes 99 | if(Settings.OUTPUT_GENOTYPES) 100 | { 101 | PipelineManager.addGenotypes(currentInputFile); 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/MarkSpecificCalls.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Program for marking calls which fall in a specific callset, given the sensitive callset 3 | * Takes parameters for read support and SV length required for a variant to be specific 4 | */ 5 | 6 | import java.io.File; 7 | import java.io.FileInputStream; 8 | import java.io.PrintWriter; 9 | import java.util.ArrayList; 10 | import java.util.Scanner; 11 | 12 | public class MarkSpecificCalls { 13 | public static void main(String[] args) throws Exception 14 | { 15 | String fn = ""; 16 | String ofn = ""; 17 | int minReadSupport = 0; 18 | int minLength = 0; 19 | if(args.length == 4) 20 | { 21 | fn = args[0]; 22 | ofn = args[1]; 23 | minReadSupport = Integer.parseInt(args[2]); 24 | minLength = Integer.parseInt(args[3]); 25 | convertFile(fn, ofn, minReadSupport, minLength); 26 | } 27 | else 28 | { 29 | System.out.println("Usage: java MarkSpecificCalls vcffile outfile minreadsupport minlength"); 30 | return; 31 | } 32 | } 33 | 34 | /* 35 | * Marks specific calls in inputFile and outputs updated VCF to outputFile 36 | * A variant will be specific if its number of supporting reads is at least 37 | * minReadSupport (or unspecified) and its length (absolute value) is at least minLength 38 | */ 39 | static void convertFile(String inputFile, String outputFile, int minReadSupport, int minLength) throws Exception 40 | { 41 | Scanner input = new Scanner(new FileInputStream(new File(inputFile))); 42 | PrintWriter out = new PrintWriter(new File(outputFile)); 43 | 44 | VcfHeader header = new VcfHeader(); 45 | ArrayList<VcfEntry> entries = new ArrayList<VcfEntry>(); 46 | 47 | while(input.hasNext()) 48 | { 49 | String line = input.nextLine(); 50 | if(line.length() == 0) 51 | { 52 | continue; 
53 | } 54 | if(line.startsWith("#")) 55 | { 56 | header.addLine(line); 57 | } 58 | else 59 | { 60 | VcfEntry entry = VcfEntry.fromLine(line); 61 | boolean inSpecific = false; 62 | int readSupport = entry.getReadSupport(); 63 | 64 | boolean longEnough = entry.getType().equals("TRA") || entry.getType().equals("BND") || Math.abs(entry.getLength()) >= minLength || entry.getLength() == 0; 65 | 66 | if(readSupport >= minReadSupport && longEnough) 67 | { 68 | inSpecific = true; 69 | } 70 | 71 | entry.setInfo("IS_SPECIFIC", inSpecific ? "1" : "0"); 72 | entries.add(entry); 73 | } 74 | } 75 | 76 | header.addInfoField("IS_SPECIFIC", "1", "String", "Whether or not a variant has enough read support and length to be specific"); 77 | header.print(out); 78 | 79 | for(VcfEntry entry : entries) 80 | { 81 | out.println(entry); 82 | } 83 | 84 | input.close(); 85 | out.close(); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/NormalizeTypes.java: -------------------------------------------------------------------------------- 1 | import java.io.File; 2 | import java.io.FileInputStream; 3 | import java.io.PrintWriter; 4 | import java.util.ArrayList; 5 | import java.util.Scanner; 6 | 7 | public class NormalizeTypes { 8 | static String inputFile = ""; 9 | static String outputFile = ""; 10 | public static void main(String[] args) throws Exception 11 | { 12 | if(args.length != 2) 13 | { 14 | System.out.println("Usage: java NormalizeTypes input_vcf output_vcf"); 15 | return; 16 | } 17 | else 18 | { 19 | inputFile = args[0]; 20 | outputFile = args[1]; 21 | Settings.CHR_NAME_MAP = new ChrNameNormalization(); 22 | convertFile(inputFile, outputFile); 23 | } 24 | } 25 | 26 | /* 27 | * Convert types in inputFiles to their normalized types and outputs them to a new file 28 | */ 29 | static void convertFile(String inputFile, String outputFile) throws Exception 30 | { 31 | Scanner input = new Scanner(new FileInputStream(new File(inputFile))); 32 | 33 | PrintWriter out = new PrintWriter(new File(outputFile)); 34 | 35 | VcfHeader header = new VcfHeader(); 36 | ArrayList entries = new ArrayList(); 37 | 38 | while(input.hasNext()) 39 | { 40 | String line = input.nextLine(); 41 | if(line.startsWith("#")) 42 | { 43 | header.addLine(line); 44 | } 45 | else if(line.length() > 0) 46 | { 47 | VcfEntry ve = VcfEntry.fromLine(line); 48 | ve.normalizeType(); 49 | entries.add(ve); 50 | } 51 | } 52 | 53 | header.print(out); 54 | 55 | for(VcfEntry ve : entries) 56 | { 57 | out.println(ve); 58 | } 59 | 60 | input.close(); 61 | out.close(); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/Overlap.java: -------------------------------------------------------------------------------- 1 | /* 2 | * A program for filtering variants based on their overlap with a list of regions. 
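 * Overlap is found with a plane sweep over sorted start/end events rather than pairwise interval checks.
 * Example (mirroring the usage message below): overlap_jasmine vcf_file=merged.vcf bed_file=regions.bed out_file=filtered.vcf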
3 | */ 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.PrintWriter; 7 | import java.util.ArrayList; 8 | import java.util.Collections; 9 | import java.util.HashMap; 10 | import java.util.HashSet; 11 | import java.util.Scanner; 12 | import java.util.TreeSet; 13 | 14 | public class Overlap 15 | { 16 | static String vcfFn = ""; 17 | static String bedFn = ""; 18 | static String ofn = ""; 19 | static String FILTER_MODE = "CONTAINED_IN_REGION"; 20 | static String REPORT_MODE = "REMOVE"; 21 | static String reportInfo = ""; 22 | 23 | static ChrNameNormalization chrNorm; 24 | static void parseArgs(String[] args) 25 | { 26 | for(String arg : args) 27 | { 28 | int equalsIdx = arg.indexOf('='); 29 | if(equalsIdx == -1) 30 | { 31 | 32 | } 33 | else 34 | { 35 | String key = arg.substring(0, equalsIdx); 36 | String val = arg.substring(1 + equalsIdx); 37 | if(key.equalsIgnoreCase("vcf_file")) 38 | { 39 | vcfFn = val; 40 | } 41 | else if(key.equalsIgnoreCase("bed_file")) 42 | { 43 | bedFn = val; 44 | } 45 | else if(key.equalsIgnoreCase("out_file")) 46 | { 47 | ofn = val; 48 | } 49 | else if(key.equalsIgnoreCase("info_report")) 50 | { 51 | reportInfo = val; 52 | } 53 | } 54 | } 55 | 56 | if(reportInfo.length() > 0) 57 | { 58 | REPORT_MODE = "INFO"; 59 | } 60 | 61 | if(vcfFn.length() == 0 || bedFn.length() == 0 || ofn.length() == 0) 62 | { 63 | usage(); 64 | System.exit(0); 65 | } 66 | } 67 | static void usage() 68 | { 69 | System.out.println(); 70 | System.out.println("Jasmine Overlapping"); 71 | System.out.println("Usage: overlap_jasmine [args]"); 72 | System.out.println(" Example: overlap_jasmine vcf_file=merged.vcf bed_file=regions.bed out_file=filtered.vcf"); 73 | System.out.println(); 74 | System.out.println("Required args:"); 75 | System.out.println(" vcf_file (String) - the VCF file with merged SVs"); 76 | System.out.println(" bed_file (String) - a BED file with regions of interest"); 77 | System.out.println(" out_file (String) - the name of the output VCF filtered by regions of interest"); 78 | System.out.println(); 79 | System.out.println("Optional args:"); 80 | System.out.println(" info_report (String) [] - the INFO field to indicate presence in regions instead of removing non-overlapping variants"); 81 | System.out.println(); 82 | } 83 | 84 | public static void main(String[] args) throws Exception 85 | { 86 | parseArgs(args); 87 | 88 | Settings.DEFAULT_CHR_NORM = true; 89 | chrNorm = new ChrNameNormalization(); 90 | 91 | filterVcf(); 92 | } 93 | 94 | static ArrayList<Event> getBedEvents() throws Exception 95 | { 96 | Scanner input = new Scanner(new FileInputStream(new File(bedFn))); 97 | 98 | ArrayList<Event> events = new ArrayList<Event>(); 99 | int idNum = 0; 100 | while(input.hasNext()) 101 | { 102 | String line = input.nextLine(); 103 | if(line.startsWith("#")) 104 | { 105 | continue; 106 | } 107 | String[] tokens = line.split("\t"); 108 | String chr = tokens[0]; 109 | chr = chrNorm.normalize(chr); 110 | int start = Integer.parseInt(tokens[1]); 111 | int end = Integer.parseInt(tokens[2]); 112 | idNum++; 113 | events.add(new Event(chr, start, 1, idNum + "")); 114 | events.add(new Event(chr, end, -1, idNum + "")); 115 | } 116 | input.close(); 117 | 118 | Collections.sort(events); 119 | 120 | return events; 121 | } 122 | 123 | /* 124 | * Gets the start and end events for variants 125 | * Translocations are a special case and are broken up into two event pairs, one for each breakpoint 126 | */ 127 | static ArrayList<Event> getVcfEvents() throws Exception 128 | { 129 | Scanner input = new
Scanner(new FileInputStream(new File(vcfFn))); 130 | 131 | ArrayList events = new ArrayList(); 132 | while(input.hasNext()) 133 | { 134 | String line = input.nextLine(); 135 | if(line.startsWith("#")) 136 | { 137 | continue; 138 | } 139 | VcfEntry entry = VcfEntry.fromLine(line); 140 | String chr = entry.getChromosome(); 141 | chr = chrNorm.normalize(chr); 142 | int start = (int)(entry.getPos()); 143 | int end = (int)(entry.getEnd()); 144 | String id = entry.getId(); 145 | 146 | if(entry.getNormalizedType().equals("TRA")) 147 | { 148 | events.add(new Event(chr, start, 1, id + "_breakpoint1")); 149 | events.add(new Event(chr, start+1, -1, id + "_breakpoint1")); 150 | String chr2 = entry.getChr2(); 151 | if(chr2.length() != 0) 152 | { 153 | events.add(new Event(chr2, end, 1, id + "_breakpoint2")); 154 | events.add(new Event(chr2, end + 1, -1, id + "_breakpoint2")); 155 | } 156 | } 157 | else 158 | { 159 | events.add(new Event(chr, start, 1, id)); 160 | events.add(new Event(chr, end + 1, -1, id)); 161 | } 162 | } 163 | input.close(); 164 | 165 | Collections.sort(events); 166 | 167 | return events; 168 | } 169 | 170 | /* 171 | * Gets a list of overlaps based on variant and region start and end events 172 | * For each variant, it outputs a list of the IDs of regions with which it overlaps 173 | */ 174 | static HashMap> getOverlaps(ArrayList regions, ArrayList variants) 175 | { 176 | // The next region and variant events to consider 177 | int regionIdx = 0, variantIdx = 0; 178 | 179 | // As we do the plane sweep, the list of regions we are currently inside, if any 180 | TreeSet openRegions = new TreeSet(); 181 | // The list of variant intervals we are currently inside 182 | TreeSet openVariants = new TreeSet(); 183 | 184 | HashMap> overlaps = new HashMap>(); 185 | 186 | // Plane sweep time! 
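// The sweep relies on two invariants: each event list is sorted by (chromosome, position),
// with end events ordered before start events at the same position, and
// openRegions/openVariants always hold exactly the intervals whose start event
// has been processed but whose end event has not.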
187 | while(true) 188 | { 189 | // Stop when there are no more events 190 | if(regionIdx == regions.size() && variantIdx == variants.size()) 191 | { 192 | break; 193 | } 194 | 195 | // Whether or not the next event to process is a region event (as opposed to a variant) 196 | boolean nextEventRegion = false; 197 | 198 | // If we are out of regions, take a variant event 199 | if(regionIdx == regions.size()) 200 | { 201 | nextEventRegion = false; 202 | } 203 | 204 | // If we are out of variants, take a region event 205 | else if(variantIdx == variants.size()) 206 | { 207 | nextEventRegion = true; 208 | } 209 | 210 | // If we have both left, compare positions and choose according to overlap scheme 211 | else 212 | { 213 | Event nextRegion = regions.get(regionIdx); 214 | Event nextVariant = variants.get(variantIdx); 215 | 216 | // If region is on earlier chromosome, take that 217 | if(nextRegion.chr.compareTo(nextVariant.chr) < 0) 218 | { 219 | nextEventRegion = true; 220 | } 221 | 222 | // If region is on later chromosome, take variant 223 | else if(nextRegion.chr.compareTo(nextVariant.chr) > 0) 224 | { 225 | nextEventRegion = false; 226 | } 227 | 228 | // If region is at earlier position on same chromosome, take it 229 | else if(nextRegion.pos < nextVariant.pos) 230 | { 231 | nextEventRegion = true; 232 | } 233 | 234 | // If region is at later position on same chromosome, take variant 235 | else if(nextRegion.pos > nextVariant.pos) 236 | { 237 | nextEventRegion = false; 238 | } 239 | 240 | // Now the case where positions are the same - tie handling depends on overlap mode 241 | else if(FILTER_MODE.equalsIgnoreCase("CONTAINED_IN_REGION")) 242 | { 243 | // Order of priority is variant end, region end, region start, variant start 244 | if(nextVariant.type == 1) 245 | { 246 | nextEventRegion = false; 247 | } 248 | else 249 | { 250 | nextEventRegion = true; 251 | } 252 | } 253 | } 254 | 255 | // After deciding what kind of event to use, process it! 
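// For CONTAINED_IN_REGION, a region should only count for a variant if the variant
// lies entirely inside it: each variant starts out credited with every region that
// is open at its start, and a region that closes before the variant does is revoked
// below, so only regions spanning the whole variant survive in its overlap set.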
256 | 257 | // Case where next event is a region breakpoint 258 | if(nextEventRegion) 259 | { 260 | Event next = regions.get(regionIdx); 261 | // Start of region 262 | if(next.type == 1) 263 | { 264 | // Add to list of open regions 265 | openRegions.add(next.id); 266 | } 267 | // End of region 268 | else 269 | { 270 | // Remove from list of open regions 271 | openRegions.remove(next.id); 272 | 273 | for(String openVariant : openVariants) 274 | { 275 | if(overlaps.get(openVariant).contains(next.id)) 276 | { 277 | overlaps.get(openVariant).remove(next.id); 278 | } 279 | } 280 | } 281 | regionIdx++; 282 | } 283 | 284 | // Case where next event is a variant breakpoint 285 | else 286 | { 287 | Event next = variants.get(variantIdx); 288 | // Start of variant 289 | if(next.type == 1) 290 | { 291 | // Add to list of open variants 292 | openVariants.add(next.id); 293 | 294 | // Initialize list of overlaps to all open regions 295 | overlaps.put(next.id, new HashSet()); 296 | for(String openRegion : openRegions) 297 | { 298 | overlaps.get(next.id).add(openRegion); 299 | } 300 | } 301 | // End of variant 302 | else 303 | { 304 | // Remove from list of open variants 305 | openVariants.remove(next.id); 306 | 307 | // If all overlapping regions were removed, take this out of overlap list 308 | if(overlaps.get(next.id).size() == 0) 309 | { 310 | overlaps.remove(next.id); 311 | } 312 | } 313 | variantIdx++; 314 | } 315 | } 316 | return overlaps; 317 | } 318 | 319 | static void filterVcf() throws Exception 320 | { 321 | System.err.println("Getting regions"); 322 | ArrayList bedEvents = getBedEvents(); 323 | System.err.println("Found " + bedEvents.size() + " region breakpoints"); 324 | System.err.println("Getting variants"); 325 | ArrayList vcfEvents = getVcfEvents(); 326 | System.err.println("Found " + vcfEvents.size() + " variant breakpoints"); 327 | System.err.println("Finding overlaps"); 328 | HashMap> overlaps = getOverlaps(bedEvents, vcfEvents); 329 | System.err.println("Found " + overlaps.size() + " variants with at least one overlap"); 330 | System.err.println("Filtering variants"); 331 | Scanner input = new Scanner(new FileInputStream(new File(vcfFn))); 332 | PrintWriter out = new PrintWriter(new File(ofn)); 333 | VcfHeader header = new VcfHeader(); 334 | boolean printedHeader = false; 335 | while(input.hasNext()) 336 | { 337 | String line = input.nextLine(); 338 | if(line.length() == 0) 339 | { 340 | continue; 341 | } 342 | if(line.startsWith("#")) 343 | { 344 | header.addLine(line); 345 | } 346 | else 347 | { 348 | if(!printedHeader) 349 | { 350 | if(REPORT_MODE.equalsIgnoreCase("INFO")) 351 | { 352 | header.addInfoField(reportInfo, "1", "String", "Whether or not the variant is in the regions of interest listed in " + bedFn); 353 | } 354 | header.print(out); 355 | printedHeader = true; 356 | } 357 | VcfEntry entry = VcfEntry.fromLine(line); 358 | String id = entry.getId(); 359 | HashSet curOverlaps = overlaps.getOrDefault(id, null); 360 | boolean hasOverlap = curOverlaps != null; 361 | if(entry.getNormalizedType().equals("TRA")) 362 | { 363 | String id1 = id + "_breakpoint1", id2 = id + "_breakpoint2"; 364 | HashSet firstOverlap = overlaps.getOrDefault(id1, null); 365 | HashSet secondOverlap = overlaps.getOrDefault(id2, null); 366 | hasOverlap = firstOverlap != null && secondOverlap != null; 367 | } 368 | if(REPORT_MODE.equalsIgnoreCase("REMOVE")) 369 | { 370 | if(hasOverlap) 371 | { 372 | out.println(entry); 373 | } 374 | } 375 | if(REPORT_MODE.equalsIgnoreCase("INFO")) 376 | { 377 | 
if(hasOverlap) 378 | { 379 | entry.setInfo(reportInfo, "1"); 380 | } 381 | else 382 | { 383 | entry.setInfo(reportInfo, "0"); 384 | } 385 | out.println(entry); 386 | } 387 | } 388 | } 389 | input.close(); 390 | out.close(); 391 | } 392 | 393 | static class Event implements Comparable 394 | { 395 | String chr; 396 | int pos; 397 | int type; 398 | String id; 399 | Event(String chr, int pos, int type, String id) 400 | { 401 | this.chr = chr; 402 | this.pos = pos; 403 | this.type = type; 404 | this.id = id; 405 | } 406 | @Override 407 | public int compareTo(Event o) 408 | { 409 | if(!chr.equals(o.chr)) 410 | { 411 | return chr.compareTo(o.chr); 412 | } 413 | if(pos != o.pos) return pos - o.pos; 414 | return type - o.type; // Do ends before starts 415 | } 416 | } 417 | } 418 | -------------------------------------------------------------------------------- /src/ParallelMerger.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Multi-threading support for variant merging 3 | * Since each chromosome (and possibly type and strand) is its own graph, 4 | * the algorithm can be parallelized pretty naturally. 5 | * 6 | * The graphs that need to be processed are stored in a queue, and each thread 7 | * processes one graph at a time, querying the queue for the next graph to process 8 | */ 9 | 10 | import java.util.ArrayList; 11 | import java.util.Collections; 12 | import java.util.TreeMap; 13 | import java.util.concurrent.ConcurrentLinkedQueue; 14 | import java.util.concurrent.atomic.AtomicInteger; 15 | 16 | public class ParallelMerger { 17 | 18 | // IDs of graphs left to process 19 | ConcurrentLinkedQueue todo; 20 | 21 | // the variant graphs on which merging will be performed 22 | TreeMap> allVariants; 23 | 24 | // A data structure for holding merged variants to output 25 | VariantOutput output; 26 | 27 | // The number of threads to use 28 | int numThreads; 29 | 30 | // The total number of samples 31 | int sampleCount; 32 | 33 | AtomicInteger totalMerged = new AtomicInteger(0); 34 | 35 | ParallelMerger(TreeMap> allVariants, VariantOutput output, int sampleCount) 36 | { 37 | this.allVariants = allVariants; 38 | this.output = output; 39 | this.numThreads = Settings.THREADS; 40 | System.out.println("Number of threads: " + numThreads); 41 | this.sampleCount = sampleCount; 42 | todo = new ConcurrentLinkedQueue(); 43 | for(String s : allVariants.keySet()) 44 | { 45 | todo.add(s); 46 | } 47 | } 48 | 49 | /* 50 | * Start merging in parallel, initializing all threads 51 | */ 52 | void run() throws Exception 53 | { 54 | // The last thread in the array is the main thread, so it calls 55 | // run() instead of start() and doesn't get joined below 56 | MyThread[] threads = new MyThread[numThreads]; 57 | for(int i = 0; i variantList = allVariants.get(graphID); 87 | Collections.sort(variantList); 88 | VariantMerger vm = new VariantMerger(variantList); 89 | vm.runMerging(); 90 | ArrayList[] res = vm.getGroups(); 91 | output.addGraph(graphID, res, sampleCount); 92 | int merges = 0; 93 | for(ArrayList list : res) 94 | { 95 | if(list.size() > 1) 96 | { 97 | merges++; 98 | } 99 | } 100 | totalMerged.addAndGet(merges); 101 | } 102 | 103 | } 104 | 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/PipelineManager.java: -------------------------------------------------------------------------------- 1 | /* 2 | * A utility for managing pipeline steps for multiple VCF files 3 | * Most of the pre-processing and 
post-processing steps are done on a per-VCF basis, 4 | * so this manager performs them for all files and updates the filelist to point to a 5 | * list of updated files instead of the original ones 6 | */ 7 | 8 | import java.io.File; 9 | import java.io.FileInputStream; 10 | import java.io.PrintWriter; 11 | import java.nio.file.Files; 12 | import java.nio.file.Paths; 13 | import java.util.ArrayList; 14 | import java.util.HashSet; 15 | import java.util.Scanner; 16 | 17 | public class PipelineManager { 18 | 19 | /* 20 | * Convert duplications to insertions for all VCF files and update filelist 21 | * Returns a path to the new filelist 22 | */ 23 | static String convertDuplicationsToInsertions(String fileList) throws Exception 24 | { 25 | ArrayList vcfFiles = getFilesFromList(fileList); 26 | ArrayList newVcfFiles = new ArrayList(); 27 | 28 | HashSet basenames = new HashSet(); 29 | 30 | for(int i = 0; i vcfFiles = getFilesFromList(fileList); 56 | ArrayList newVcfFiles = new ArrayList(); 57 | 58 | HashSet basenames = new HashSet(); 59 | 60 | for(int i = 0; i vcfFiles = getFilesFromList(fileList), bamFiles = getFilesFromList(Settings.BAM_FILE_LIST); 86 | ArrayList newVcfFiles = new ArrayList(); 87 | 88 | // Get any optional arguments to be passed to Iris that the user specified 89 | String[] optionalArgs = Settings.IRIS_ARGS.split(","); 90 | 91 | HashSet basenames = new HashSet(); 92 | 93 | // Refine one VCF file at a time 94 | for(int i = 0; i vcfFiles = getFilesFromList(fileList); 144 | ArrayList newVcfFiles = new ArrayList(); 145 | 146 | HashSet basenames = new HashSet(); 147 | 148 | for(int i = 0; i newVcfFiles) throws Exception 172 | { 173 | if(Settings.USING_FILE_LIST) 174 | { 175 | String newFileList = Settings.OUT_DIR + "/" + StringUtils.addDescriptor(StringUtils.fileBaseName(oldFileList), suffix); 176 | PrintWriter newFileListOut = new PrintWriter(new File(newFileList)); 177 | for(String newVcfFile : newVcfFiles) 178 | { 179 | newFileListOut.println(newVcfFile); 180 | } 181 | newFileListOut.close(); 182 | return newFileList; 183 | } 184 | else 185 | { 186 | StringBuilder res = new StringBuilder(""); 187 | for(int i = 0; i getFilesFromList(String fileList) throws Exception 235 | { 236 | ArrayList res = new ArrayList(); 237 | 238 | if(!Settings.USING_FILE_LIST) 239 | { 240 | String[] fns = fileList.split(","); 241 | for(String fn : fns) res.add(fn); 242 | return res; 243 | } 244 | 245 | if(new File(fileList).exists()) 246 | { 247 | Scanner vcfListInput = new Scanner(new FileInputStream(new File(fileList))); 248 | 249 | while(vcfListInput.hasNext()) 250 | { 251 | String line = vcfListInput.nextLine(); 252 | if(line.length() > 0) 253 | { 254 | res.add(line); 255 | } 256 | } 257 | vcfListInput.close(); 258 | } 259 | 260 | return res; 261 | } 262 | 263 | } 264 | -------------------------------------------------------------------------------- /src/PreSplit.java: -------------------------------------------------------------------------------- 1 | import java.io.File; 2 | import java.io.FileInputStream; 3 | import java.io.PrintWriter; 4 | import java.nio.file.Path; 5 | import java.nio.file.Paths; 6 | import java.util.ArrayList; 7 | import java.util.HashMap; 8 | import java.util.HashSet; 9 | import java.util.Scanner; 10 | 11 | public class PreSplit 12 | { 13 | static String fileList = ""; 14 | static String outputDir = ""; 15 | static int segmentLength = -1; 16 | static boolean transTogether = false; 17 | 18 | static void usage() 19 | { 20 | System.out.println(); 21 | System.out.println("Usage: 
split_jasmine file_list output_dir [segment_length]"); 22 | System.out.println(" Example: split_jasmine file_list=filelist.txt output_dir=/path/to/split_dir segment_length=10m"); 23 | System.out.println(); 24 | System.out.println("Required args:"); 25 | System.out.println(" file_list (String) - A txt file with a line-separated list of VCFs to be split"); 26 | System.out.println(" output_dir (String) - The directory to write the split files to"); 27 | System.out.println(); 28 | System.out.println("Optional args:"); 29 | System.out.println(" segment_length (int) - length of segments to split chromosomes into (default whole-chromosome)"); 30 | System.out.println(" --ignore_strand - allow variants with different strands to be merged"); 31 | System.out.println(" --ignore_type - allow variants with different types to be merged"); 32 | System.out.println(" --combine_translocations - keep all translocations together to reduce number of groups"); 33 | System.out.println(); 34 | } 35 | 36 | static void parseArgs(String[] args) throws Exception 37 | { 38 | for(int i = 0; i 0 && key.charAt(0) == '-') 60 | { 61 | key = key.substring(1); 62 | } 63 | String val = args[i].substring(1 + equalIdx); 64 | 65 | switch(key) 66 | { 67 | case "segment_length": 68 | segmentLength = Settings.parseInt(val); 69 | break; 70 | case "file_list": 71 | fileList = val; 72 | break; 73 | case "output_dir": 74 | outputDir = val; 75 | break; 76 | default: 77 | break; 78 | } 79 | } 80 | } 81 | if(fileList.length() == 0 || outputDir.length() == 0) 82 | { 83 | usage(); 84 | System.exit(0); 85 | } 86 | 87 | } 88 | 89 | public static void main(String[] args) throws Exception 90 | { 91 | parseArgs(args); 92 | 93 | String[] filelists = convertAll(fileList, outputDir, segmentLength); 94 | for(String s : filelists) 95 | { 96 | System.out.println(s); 97 | } 98 | } 99 | 100 | @SuppressWarnings("unchecked") 101 | static String[] convertAll(String fileList, String outDir, int segmentLength) throws Exception 102 | { 103 | if(!outDir.startsWith("/")) 104 | { 105 | Path currentRelativePath = Paths.get(""); 106 | outDir = currentRelativePath.toAbsolutePath().toString() + "/" + outDir; 107 | } 108 | if(!new File(outDir).isDirectory()) 109 | { 110 | new File(outDir).mkdir(); 111 | } 112 | ArrayList vcfFiles = PipelineManager.getFilesFromList(fileList); 113 | int n = vcfFiles.size(); 114 | HashMap[] splitMaps = new HashMap[n]; 115 | HashSet allKeys = new HashSet(); 116 | for(int i = 0; i convertFile(String inputFile, String outputPrefix, int segmentLength) throws Exception 178 | { 179 | VcfHeader header = new VcfHeader(); 180 | Scanner input = new Scanner(new FileInputStream(new File(inputFile))); 181 | HashMap res = new HashMap(); 182 | HashMap writerMap = new HashMap(); 183 | while(input.hasNext()) 184 | { 185 | String line = input.nextLine(); 186 | if(line.length() == 0) 187 | { 188 | continue; 189 | } 190 | if(line.startsWith("#")) 191 | { 192 | header.addLine(line); 193 | continue; 194 | } 195 | VcfEntry entry = VcfEntry.fromLine(line); 196 | String graphId = VariantInput.fromVcfEntry(entry, 0).graphID; 197 | if(segmentLength != -1 && !entry.getNormalizedType().equals("TRA")) 198 | { 199 | graphId = graphId + "_" + ((entry.getPos() / segmentLength) * segmentLength); 200 | } 201 | if(entry.getNormalizedType().equals("TRA") && transTogether) 202 | { 203 | graphId = "TRA"; 204 | } 205 | if(!res.containsKey(graphId)) 206 | { 207 | String ofn = outputPrefix + "_" + graphId + ".vcf"; 208 | PrintWriter out = new PrintWriter(new File(ofn)); 209 | 
header.print(out); 210 | res.put(graphId, ofn); 211 | writerMap.put(graphId, out); 212 | } 213 | PrintWriter out = writerMap.get(graphId); 214 | out.println(line); 215 | } 216 | for(String key : writerMap.keySet()) 217 | { 218 | writerMap.get(key).close(); 219 | } 220 | return res; 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /src/StringUtils.java: -------------------------------------------------------------------------------- 1 | import java.util.HashMap; 2 | 3 | /* 4 | * A collection of String functions 5 | */ 6 | public class StringUtils { 7 | 8 | /* 9 | * The sequence identity of two strings based on their edit distance 10 | */ 11 | static double editDistanceSimilarity(String s, String t) 12 | { 13 | int n = s.length(), m = t.length(); 14 | int[][] editDistance = new int[n+1][m+1]; 15 | for(int i = 1; i<=m; i++) editDistance[0][i] = i; 16 | for(int i = 1; i<=n; i++) editDistance[i][0] = i; 17 | for(int i = 1; i<=n; i++) 18 | { 19 | for(int j = 1; j<=m; j++) 20 | { 21 | boolean sameChar = s.charAt(i-1) == t.charAt(j-1); 22 | int bestDistance = editDistance[i-1][j-1] + (sameChar ? 0 : 1); 23 | bestDistance = Math.min(bestDistance, 1 + editDistance[i-1][j]); 24 | bestDistance = Math.min(bestDistance, 1 + editDistance[i][j-1]); 25 | editDistance[i][j] = bestDistance; 26 | } 27 | } 28 | return 1.0 - 1.0 * editDistance[n][m] / Math.max(n, m); 29 | } 30 | 31 | /* 32 | * Gets the frequency of each k-mer in a string, skipping over non-base characters 33 | */ 34 | static HashMap<Integer, Integer> countKmers(String s, int k) 35 | { 36 | HashMap<Integer, Integer> kmerCount = new HashMap<Integer, Integer>(); 37 | 38 | // The number of basepair characters (ACGT) we have seen so far 39 | int baseCount = 0; 40 | 41 | // The encoded (2 bits per character) value of the current kmer so far 42 | int kmer = 0; 43 | 44 | // Use a sliding window to get all of the kmer codes and add them to the frequency map 45 | for(int i = 0; i= k) 66 | { 67 | if(kmerCount.containsKey(kmer)) 68 | { 69 | kmerCount.put(kmer, kmerCount.get(kmer)+1); 70 | } 71 | else 72 | { 73 | kmerCount.put(kmer, 1); 74 | } 75 | } 76 | } 77 | } 78 | return kmerCount; 79 | } 80 | 81 | /* 82 | * The sequence identity of two strings based on their kmer Jaccard similarity 83 | */ 84 | static double jaccardSimilarity(String s, String t) 85 | { 86 | int k = Settings.K_JACCARD; 87 | 88 | // Get the frequencies of kmers in s 89 | HashMap<Integer, Integer> sKmerFreq = countKmers(s, k); 90 | if(sKmerFreq.size() <= 0) 91 | { 92 | return 1.0; 93 | } 94 | 95 | // Get the frequencies of kmers in t 96 | HashMap<Integer, Integer> tKmerFreq = countKmers(t, k); 97 | if(tKmerFreq.size() <= 0) 98 | { 99 | return 1.0; 100 | } 101 | 102 | // Compute the min and max count of each kmer to get the intersection and union, respectively 103 | int intersection = 0, union = 0; 104 | 105 | // Iterate over everything in s - this includes both kmers distinct to s and kmers in both 106 | for(int sKmer : sKmerFreq.keySet()) 107 | { 108 | int sFrequency = sKmerFreq.get(sKmer); 109 | int tFrequency = tKmerFreq.getOrDefault(sKmer, 0); 110 | intersection += Math.min(sFrequency, tFrequency); 111 | union += Math.max(sFrequency, tFrequency); 112 | } 113 | 114 | // Add the kmers unique to t to the union 115 | for(int tKmer : tKmerFreq.keySet()) 116 | { 117 | if(!sKmerFreq.containsKey(tKmer)) 118 | { 119 | union += tKmerFreq.get(tKmer); 120 | } 121 | } 122 | 123 | // Compute the Jaccard similarity as the intersection size divided by the union size 124 | return 1.0 * intersection / union; 125 | 126 | } 127 | 128 | /* 129 |
Assumes input is a filename, and adds "_" followed by the descriptor right before the file extension 130 | */ 131 | static String addDescriptor(String input, String desc) 132 | { 133 | int idx = input.lastIndexOf("."); 134 | if(idx == -1) 135 | { 136 | return input + "_" + desc; 137 | } 138 | 139 | String before = input.substring(0, idx); 140 | String after = input.substring(idx); 141 | return before + "_" + desc + after; 142 | } 143 | 144 | /* 145 | * Gets the basename of a file from its path by removing the directory name 146 | */ 147 | static String fileBaseName(String path) 148 | { 149 | int idx = path.lastIndexOf('/'); 150 | if(idx == -1) 151 | { 152 | return path; 153 | } 154 | return path.substring(1 + idx); 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /src/TestKDTree.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Basic test for making sure my KD tree is behaving reasonably 3 | */ 4 | 5 | public class TestKDTree { 6 | public static void main(String[] args) 7 | { 8 | Variant[] data = new Variant[] { 9 | new Variant(0, "var1", 10, 5, "chr1", null), 10 | new Variant(0, "var2", 1, 5, "chr1", null), 11 | new Variant(0, "var3", 18, 5, "chr1", null), 12 | new Variant(0, "var4", 12, 7, "chr1", null), 13 | new Variant(0, "var5", 10, 5, "chr1", null), 14 | new Variant(0, "var6", 30, 30, "chr1", null), 15 | new Variant(0, "var7", 0, 0, "chr1", null) 16 | }; 17 | 18 | KDTree kdt = new KDTree(data); 19 | for(int i = 0; i -------------------------------------------------------------------------------- /src/Variant.java: -------------------------------------------------------------------------------- 5 | public class Variant implements Comparable<Variant> 6 | { 7 | // Which sample number the variant came from 8 | int sample; 9 | 10 | // Variant ID, assumed to be unique for all variants. It has the sample number and "_" added to the beginning to ensure this. 11 | String id; 12 | 13 | // End should be start+length for insertions 14 | double start, end; 15 | 16 | // Store chromosome, and optionally type and strand 17 | String graphID; 18 | 19 | // This is initialized and used internally for bookkeeping and does not come from VCF 20 | int index; 21 | 22 | // For insertions, the sequence being inserted, or null otherwise 23 | String seq; 24 | 25 | // The maximum distance a variant can be away to merge with this one 26 | int maxDist; 27 | 28 | // The minimum sequence similarity another variant needs to merge with this one if both are insertions 29 | double minSeqId; 30 | 31 | // The start and end of an interval for checking overlap 32 | double[] interval; 33 | 34 | int hash; 35 | static int hash(String infoString) 36 | { 37 | long res = 0; 38 | int mod = (int)(1e9+7); 39 | char[] cs = infoString.toCharArray(); 40 | for(char c : cs) 41 | { 42 | res = ((res * 17) + c)%mod; 43 | } 44 | return (int)res; 45 | } 46 | 47 | /* 48 | * Returns the distance from the variant's (start, end) pair to a given (x, y) point 49 | */ 50 | double distFromPoint(double x, double y) 51 | { 52 | double dStart = start - x; 53 | double dEnd = end - y; 54 | int norm = Settings.KD_TREE_NORM; 55 | if(norm == 2) 56 | { 57 | return Math.sqrt(dStart * dStart + dEnd * dEnd); 58 | } 59 | else 60 | { 61 | double powSum = Math.abs(Math.pow(dStart, norm)) + Math.abs(Math.pow(dEnd, norm)); 62 | return Math.pow(powSum, 1.0 / norm); 63 | } 64 | } 65 | 66 | Variant(int sample, String id, double start, double end, String graphID, String seq, int maxDist, double minSeqId) 67 | { 68 | this.sample = sample; 69 | this.id = id; 70 | this.start = start; 71 | this.end = end; 72 | this.graphID = graphID; 73 | if(minSeqId > 0) this.seq = seq; 74 | this.maxDist = maxDist; 75 | this.minSeqId = minSeqId; 76 | hash = 0; 77 |
interval = null; 78 | } 79 | 80 | Variant(int sample, String id, double start, double end, String graphID, String seq) 81 | { 82 | this.sample = sample; 83 | this.id = id; 84 | this.start = start; 85 | this.end = end; 86 | this.graphID = graphID; 87 | if(minSeqId > 0) this.seq = seq; 88 | this.maxDist = Settings.MAX_DIST; 89 | this.minSeqId = Settings.MIN_SEQUENCE_SIMILARITY; 90 | interval = null; 91 | } 92 | 93 | /* 94 | * The distance between two variants based on differences in their start/end coordinates 95 | * The metric used is a generalization of Euclidean distance 96 | */ 97 | double distance(Variant v) 98 | { 99 | return distFromPoint(v.start, v.end); 100 | } 101 | 102 | /* 103 | * The similarity score of two variants, which is based on sequence similarity for pairs of insertions and 1 otherwise 104 | */ 105 | double stringSimilarity(Variant v) 106 | { 107 | // If either sequence is null, the variant is either non-insertion, or has no sequence, which we don't want to penalize for 108 | if(seq == null || v.seq == null) 109 | { 110 | return 1.0; 111 | } 112 | 113 | if(Settings.USE_EDIT_DISTANCE) 114 | { 115 | return StringUtils.editDistanceSimilarity(seq, v.seq); 116 | } 117 | else 118 | { 119 | return StringUtils.jaccardSimilarity(seq, v.seq); 120 | } 121 | } 122 | 123 | /* 124 | * Whether or not the sequence similarity of two variants is high enough for them to be merged 125 | */ 126 | boolean passesStringSimilarity(Variant v) 127 | { 128 | if(seq == null || v.seq == null) 129 | { 130 | return true; 131 | } 132 | 133 | String s = seq, t = v.seq; 134 | 135 | double similarityNeeded = Math.min(minSeqId, v.minSeqId); 136 | 137 | // If there is no sequence identity requirement, don't compute the score and just return true 138 | if(similarityNeeded <= 0) 139 | { 140 | return true; 141 | } 142 | 143 | int minLength = Math.min(s.length(), t.length()); 144 | int maxLength = s.length() + t.length() - minLength; 145 | 146 | if(minLength < maxLength * similarityNeeded - 1e-9) 147 | { 148 | return false; 149 | } 150 | 151 | return stringSimilarity(v) >= similarityNeeded - 1e-9; 152 | } 153 | 154 | /* 155 | * Human-readable format for printing some of the variant information 156 | */ 157 | public String toString() 158 | { 159 | return "id: " + id + ", sample: " + sample + ", start: " + start + ", end: " + end; 160 | } 161 | 162 | public int compareTo(Variant o) 163 | { 164 | if(hash != o.hash) return Long.compare(hash, o.hash); 165 | if(start != o.start) return Double.compare(start, o.start); 166 | return id.compareTo(o.id); 167 | } 168 | 169 | public boolean passesOverlap(Variant v) 170 | { 171 | if(interval == null || v.interval == null) 172 | { 173 | return true; 174 | } 175 | double maxStart = Math.max(interval[0], v.interval[0]); 176 | double minEnd = Math.min(interval[1], v.interval[1]); 177 | if(minEnd <= maxStart + 1E-9) 178 | { 179 | return false; 180 | } 181 | double maxIntervalSize = Math.max(interval[1] - interval[0], v.interval[1] - v.interval[0]); 182 | return minEnd - maxStart + 1e-9 >= maxIntervalSize * Settings.OVERLAP_REQUIRED; 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /src/VariantInput.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Methods for reading VCF entries from VCF files and dividing 3 | * the entries into separate groups by graph ID 4 | */ 5 | 6 | import java.io.File; 7 | import java.io.FileInputStream; 8 | import java.util.ArrayList; 9 | import 
java.util.HashMap; 10 | import java.util.HashSet; 11 | import java.util.Scanner; 12 | import java.util.TreeMap; 13 | 14 | public class VariantInput { 15 | 16 | // How many samples were merged to produce each input file 17 | static HashMap previouslyMergedSamples = new HashMap(); 18 | 19 | /* 20 | * Count the number of VCF files in a list 21 | */ 22 | public static int countFiles(String fileList) throws Exception 23 | { 24 | return PipelineManager.getFilesFromList(fileList).size(); 25 | } 26 | 27 | /* 28 | * Get a list of all variants from a group of files, binning them by graphID 29 | */ 30 | @SuppressWarnings("unchecked") 31 | public static TreeMap> readAllFiles(String fileList) throws Exception 32 | { 33 | ArrayList fileNames = PipelineManager.getFilesFromList(fileList); 34 | 35 | TreeMap>[] variantsPerFile = new TreeMap[fileNames.size()]; 36 | for(int i = 0; i> res = new TreeMap>(); 41 | for(int i = 0; i()); 48 | } 49 | for(Variant v : variantsPerFile[i].get(s)) 50 | { 51 | res.get(s).add(v); 52 | } 53 | } 54 | } 55 | return res; 56 | } 57 | 58 | /* 59 | * Get a list of variants binned by graphID for a single VCF file 60 | */ 61 | private static TreeMap> getSingleList(String filename, int sample) throws Exception 62 | { 63 | if(filename.endsWith(".gz")) 64 | { 65 | System.err.println("Warning: " + filename + " ends with .gz, but (b)gzipped VCFs are not accepted"); 66 | } 67 | Scanner input = new Scanner(new FileInputStream(new File(filename))); 68 | ArrayList allVariants = new ArrayList(); 69 | HashSet ids = new HashSet(); 70 | if(!previouslyMergedSamples.containsKey(sample)) 71 | { 72 | previouslyMergedSamples.put(sample, 1); 73 | } 74 | while(input.hasNext()) 75 | { 76 | String line = input.nextLine(); 77 | if(line.length() == 0 || line.startsWith("#")) 78 | { 79 | continue; 80 | } 81 | if(line.length() >=2 && line.charAt(0) == 31 && (line.charAt(1) == 65533 || line.charAt(1) == 139)) 82 | { 83 | throw new Exception(filename + " is a gzipped file, but only unzipped VCFs are accepted"); 84 | } 85 | VcfEntry entry = VcfEntry.fromLine(line); 86 | if(!previouslyMergedSamples.containsKey(sample)) 87 | { 88 | if(entry.getInfo("SUPP_VEC_EXT").length() > 0) 89 | { 90 | previouslyMergedSamples.put(sample, entry.getInfo("SUPP_VEC_EXT").length()); 91 | } 92 | else if(entry.getInfo("SUPP_VEC").length() > 0) 93 | { 94 | previouslyMergedSamples.put(sample, entry.getInfo("SUPP_VEC").length()); 95 | } 96 | else 97 | { 98 | previouslyMergedSamples.put(sample, 1); 99 | } 100 | } 101 | if(ids.contains(entry.getId())) 102 | { 103 | String oldId = entry.getId(); 104 | int index = 1; 105 | while(true) 106 | { 107 | String newId = oldId + "_duplicate" + index; 108 | if(!ids.contains(newId)) 109 | { 110 | entry.setId(newId); 111 | break; 112 | } 113 | else 114 | { 115 | index++; 116 | } 117 | } 118 | System.err.println("Warning: Duplicate variant ID " + oldId + " in " + filename + "; Replacing with " + entry.getId()); 119 | } 120 | ids.add(entry.getId()); 121 | allVariants.add(fromVcfEntry(entry, sample)); 122 | 123 | } 124 | 125 | System.out.println(filename + " has " + allVariants.size() + " variants"); 126 | input.close(); 127 | 128 | return divideIntoGraphs(allVariants); 129 | } 130 | 131 | /* 132 | * Take a list of variants and bin them by graphID 133 | */ 134 | private static TreeMap> divideIntoGraphs(ArrayList data) 135 | { 136 | TreeMap> groups = new TreeMap>(); 137 | for(Variant v : data) 138 | { 139 | String graphID = v.graphID; 140 | if(!groups.containsKey(graphID)) 141 | { 142 | 
groups.put(graphID, new ArrayList<Variant>()); 143 | } 144 | groups.get(graphID).add(v); 145 | } 146 | return groups; 147 | } 148 | 149 | /* 150 | * From a line of a VCF file, extract the information needed for merging 151 | * and return it as a Variant object 152 | */ 153 | public static Variant fromVcfEntry(VcfEntry entry, int sample) throws Exception 154 | { 155 | double start = entry.getFirstCoord(); 156 | double end = entry.getSecondCoord(); 157 | 158 | entry.setId(sample + "_" + entry.getId()); 159 | 160 | String id = entry.getGraphID(); 161 | 162 | String seq = null; 163 | if(entry.getType().equals("INS")) 164 | { 165 | String entrySeq = entry.getSeq(); 166 | if(entrySeq.length() > 0) 167 | { 168 | seq = entrySeq; 169 | } 170 | } 171 | 172 | // Default distance threshold model is constant, so set to that first 173 | int maxDist = Settings.MAX_DIST; 174 | double minSeqId = Settings.MIN_SEQUENCE_SIMILARITY; 175 | 176 | // Then, check if there is a per-variant distance threshold 177 | String maxDistInfo = entry.getInfo("JASMINE_DIST"); 178 | if(maxDistInfo.length() > 0) 179 | { 180 | maxDist = Integer.parseInt(maxDistInfo); 181 | } 182 | 183 | // Next check if a per-sample distance threshold was set 184 | else if(Settings.PER_SAMPLE_DISTS != null && Settings.PER_SAMPLE_DISTS.length > sample) 185 | { 186 | maxDist = Settings.PER_SAMPLE_DISTS[sample]; 187 | } 188 | 189 | // Next, check if there is a length-based threshold 190 | else if(Settings.USE_LINEAR_THRESHOLD && Settings.MAX_DIST_LINEAR > 0) 191 | { 192 | maxDist = (int)(Settings.MAX_DIST_LINEAR * Math.abs(entry.getLength()) + 0.5); 193 | if(Settings.MAX_DIST_SET) 194 | { 195 | maxDist = Math.min(maxDist, Settings.MAX_DIST); 196 | } 197 | if(Settings.MIN_DIST != -1) 198 | { 199 | maxDist = Math.max(maxDist, Settings.MIN_DIST); 200 | } 201 | } 202 | 203 | // Check for per-variant sequence ID thresholds 204 | String minIdInfo = entry.getInfo("JASMINE_ID"); 205 | if(minIdInfo.length() > 0) 206 | { 207 | minSeqId = Double.parseDouble(minIdInfo); 208 | } 209 | 210 | Variant res = new Variant(sample, entry.getId(), start, end, id, seq, maxDist, minSeqId); 211 | res.hash = Variant.hash(entry.tabTokens[7]); 212 | if(Settings.OVERLAP_REQUIRED > 0 && (entry.getType().equals("DEL") || entry.getType().equals("INV") || entry.getType().equals("DUP"))) 213 | { 214 | res.interval = new double[] {entry.getPos(), entry.getEnd()}; 215 | } 216 | return res; 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /src/VariantMergeTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Class for testing variant merging 3 | */ 4 | 5 | import java.util.ArrayList; 6 | 7 | public class VariantMergeTest { 8 | public static void main(String[] args) 9 | { 10 | Variant[] data = new Variant[] { 11 | new Variant(0, "var1", 10, 5, "chr1", null), 12 | new Variant(0, "var2", 1, 5, "chr1", null), 13 | new Variant(0, "var3", 18, 5, "chr1", null), 14 | new Variant(1, "var4", 12, 7, "chr1", null), 15 | new Variant(1, "var5", 10, 5, "chr1", null), 16 | new Variant(1, "var6", 30, 30, "chr1", null), 17 | new Variant(1, "var7", 0, 0, "chr1", null), 18 | new Variant(2, "var8", 12, 12, "chr1", null), 19 | new Variant(2, "var9", 15, 15, "chr1", null), 20 | new Variant(2, "var10", 20, 20, "chr1", null), 21 | new Variant(2, "var11", 28, 28, "chr1", null), 22 | new Variant(3, "var12", 25, 25, "chr1", null), 23 | new Variant(4, "var13", 22, 22, "chr1", null) 24 | }; 25 | 26 | Settings.MAX_DIST =
5; 27 | VariantMerger vm = new VariantMerger(data); 28 | vm.runMerging(); 29 | ArrayList[] res = vm.getGroups(); 30 | 31 | System.out.println(); 32 | for(ArrayList list : res) 33 | { 34 | if(list.size() > 1) 35 | { 36 | System.out.println("Variant:"); 37 | for(Variant v : list) 38 | { 39 | System.out.println(v); 40 | } 41 | System.out.println(); 42 | } 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/VariantMerger.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Main interface for merging variants. This assumes all variants are on the 3 | * same chromosome, and have the same type/strand if that separation is desired. 4 | */ 5 | 6 | import java.util.ArrayList; 7 | import java.util.PriorityQueue; 8 | 9 | public class VariantMerger 10 | { 11 | // An array of all of the variants to be considered 12 | Variant[] data; 13 | 14 | // The number of total variants 15 | int n; 16 | 17 | // A forest in which connected components will represent merged groups 18 | Forest forest; 19 | 20 | // A KD-tree data structure for fast k-nearest-neighbors queries 21 | KDTree knn; 22 | 23 | // Indices of variants in each group, used for more advanced distance checks like clique and centroid 24 | ArrayList[] merged; 25 | 26 | @SuppressWarnings("unchecked") 27 | public VariantMerger(Variant[] data) 28 | { 29 | n = data.length; 30 | 31 | forest = new Forest(data); 32 | knn = new KDTree(data, false); 33 | 34 | for(int i = 0; i(); 43 | merged[i].add(i); 44 | } 45 | } 46 | } 47 | 48 | /* 49 | * Helper function to convert an ArrayList to an array to make the constructor more flexible 50 | */ 51 | static Variant[] listToArray(ArrayList data) 52 | { 53 | int length = data.size(); 54 | Variant[] asArray = new Variant[length]; 55 | for(int i = 0; i data) 66 | { 67 | this(listToArray(data)); 68 | } 69 | 70 | /* 71 | * Runs the core algorithm for building the implicit merging graph and 72 | * performing merging 73 | */ 74 | void runMerging() 75 | { 76 | if(n == 1) 77 | { 78 | return; 79 | } 80 | 81 | // For each variant v, how many of its nearest neighbors have had their edges 82 | // from v considered already. 83 | int[] countEdgesProcessed = new int[n]; 84 | 85 | // nearestNeighbors will be used as a cache to store the next few nearest neighbors 86 | // The purpose of this is to prevent performing a new KNN-query every time an edge 87 | // is considered, but instead a logarithmic number of times. 
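// Each variant's cache starts with its 4 nearest neighbors, and whenever the cache
// is used up it is rebuilt with twice as many neighbors, so a variant that ends up
// needing d candidate edges only issues O(log d) KD-tree queries.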
88 | Variant[][] nearestNeighbors = new Variant[n][]; 89 | 90 | // A heap of edges to be processed in non-decreasing order of distance 91 | PriorityQueue toProcess = new PriorityQueue(); 92 | 93 | // Get the first 4 nearest neighbors for every variant, and add their first edges to 94 | // the heap 95 | for(int i = 0; i maxDistAllowed + 1e-9) 135 | { 136 | valid = false; 137 | } 138 | } 139 | } 140 | } 141 | // Make sure everything being merged can be merged with their overall centroid 142 | else if(Settings.CENTROID_MERGE) 143 | { 144 | double avgStart = 0.0, avgEnd = 0.0; 145 | for(int i = 0; i= nearestNeighbors[e.from].length) 204 | { 205 | nearestNeighbors[e.from] = knn.kNearestNeighbor(data[e.from], 2 * nearestNeighbors[e.from].length); 206 | } 207 | 208 | // If we tried to get more and didn't find anymore, then we are done with this variant 209 | if(countEdgesProcessed[e.from] >= nearestNeighbors[e.from].length) 210 | { 211 | break; 212 | } 213 | Variant candidateTo = nearestNeighbors[e.from][countEdgesProcessed[e.from]]; 214 | 215 | // This edge was invalid because of distance from the query, so stop looking at any edges 216 | // since they'll only get farther away 217 | int maxDistAllowed = Math.max(data[e.from].maxDist, candidateTo.maxDist); 218 | if(Settings.REQUIRE_MUTUAL_DISTANCE) 219 | { 220 | maxDistAllowed = Math.min(data[e.from].maxDist, candidateTo.maxDist); 221 | } 222 | 223 | if(data[e.from].distance(candidateTo) > data[e.from].maxDist + 1e-9) 224 | { 225 | break; 226 | } 227 | 228 | else if(data[e.from].distance(candidateTo) > maxDistAllowed + 1e-9) 229 | { 230 | countEdgesProcessed[e.from]++; 231 | continue; 232 | } 233 | 234 | // If edge was invalid because of coming from the same sample, ignore it and try the next one 235 | else if(!Settings.ALLOW_INTRASAMPLE && data[e.from].sample == candidateTo.sample) 236 | { 237 | toProcess.add(new Edge(e.from, candidateTo.index, data[e.from].distance(candidateTo))); 238 | countEdgesProcessed[e.from]++; 239 | break; 240 | } 241 | 242 | // If sequences weren't similar enough for two insertions, ignore and try again 243 | else if(!data[e.from].passesStringSimilarity(candidateTo)) 244 | { 245 | countEdgesProcessed[e.from]++; 246 | continue; 247 | } 248 | 249 | else if(!data[e.from].passesOverlap(candidateTo)) 250 | { 251 | countEdgesProcessed[e.from]++; 252 | continue; 253 | } 254 | 255 | // The next edge is something we want to consider since it is close enough and goes to a 256 | // different sample 257 | else 258 | { 259 | toProcess.add(new Edge(e.from, candidateTo.index, data[e.from].distance(candidateTo))); 260 | countEdgesProcessed[e.from]++; 261 | break; 262 | } 263 | } 264 | } 265 | } 266 | 267 | /* 268 | * Get an array of all of the groups of variants 269 | */ 270 | @SuppressWarnings("unchecked") 271 | ArrayList[] getGroups() 272 | { 273 | ArrayList[] res = new ArrayList[n]; 274 | for(int i = 0; i(); 277 | } 278 | for(int i = 0; i 297 | { 298 | int from, to; 299 | double dist; 300 | Edge(int from, int to, double dist) 301 | { 302 | this.from = from; 303 | this.to = to; 304 | this.dist = dist; 305 | } 306 | @Override 307 | public int compareTo(Edge o) { 308 | if(Math.abs(dist - o.dist) > 1e-9) 309 | { 310 | return Double.compare(dist, o.dist); 311 | } 312 | if(data[from].hash != data[o.from].hash) return data[from].hash - (data[o.from].hash); 313 | if(data[to].hash != data[o.to].hash) return data[to].hash - (data[o.to].hash); 314 | if(from != o.from) return data[from].id.compareTo(data[o.from].id); 315 | return 
data[to].id.compareTo(data[o.to].id); 316 | } 317 | } 318 | } 319 | -------------------------------------------------------------------------------- /src/VcfHeader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * The header of a VCF file, including a list of INFO fields 3 | * The main purpose of this is to manage INFO field description lines and avoid duplicates 4 | */ 5 | 6 | import java.io.PrintWriter; 7 | import java.util.ArrayList; 8 | import java.util.HashSet; 9 | 10 | public class VcfHeader { 11 | 12 | static String infoKey = "##INFO= lines; 17 | 18 | // The names of all INFO fields present in the VCF file 19 | HashSet infoFields; 20 | HashSet formatFields; 21 | 22 | // The index of the last INFO field line, or the line before where the next INFO field should go 23 | int lastInfoFieldIndex; 24 | 25 | // The index of the last FORMAT field line, or the line before where the next FORMAT field should go 26 | int lastFormatFieldIndex; 27 | 28 | // Constructor - just initializes the data structures 29 | VcfHeader() 30 | { 31 | lines = new ArrayList(); 32 | infoFields = new HashSet(); 33 | formatFields = new HashSet(); 34 | lastInfoFieldIndex = -1; 35 | lastFormatFieldIndex = -1; 36 | } 37 | 38 | /* 39 | * Print all lines of the header 40 | */ 41 | void print(PrintWriter out) 42 | { 43 | for(int i = 0; i", id, number, type, desc); 96 | infoFields.add(id); 97 | lines.add(lastInfoFieldIndex + 1, line); 98 | lastInfoFieldIndex++; 99 | lastFormatFieldIndex++; 100 | } 101 | 102 | /* 103 | * Remove all format fields from the header 104 | */ 105 | void resetFormatFields() 106 | { 107 | formatFields = new HashSet(); 108 | ArrayList newLines = new ArrayList(); 109 | int oldIndex = lastFormatFieldIndex; 110 | lastFormatFieldIndex = -1; 111 | for(int i = 0; i", id, number, type, desc); 141 | formatFields.add(id); 142 | lines.add(lastFormatFieldIndex + 1, line); 143 | lastFormatFieldIndex++; 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/VcfHeaderTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Test to make sure VCF header is properly handling adding INFO fields and adding/resetting FORMAT fields 3 | * Output should be (in order): Test, SVTYPE, INFO1-3, FORMAT1-3, and #CHR 4 | */ 5 | import java.io.PrintWriter; 6 | 7 | public class VcfHeaderTest { 8 | public static void main(String[] args) 9 | { 10 | VcfHeader header = new VcfHeader(); 11 | header.addLine("#Test"); 12 | header.addLine("##INFO="); 13 | header.addLine("##FORMAT="); 14 | header.addLine("##CHR etc."); 15 | header.addInfoField("INFO1", "1", "String", "desc1"); 16 | header.addInfoField("INFO2", "1", "String", "desc2"); 17 | header.addFormatField("FORMAT3", "1", "String", "descf3"); 18 | header.resetFormatFields(); 19 | header.addFormatField("FORMAT1", "1", "String", "descf1"); 20 | header.addInfoField("INFO3", "1", "String", "desc3"); 21 | header.addFormatField("FORMAT2", "1", "String", "descf2"); 22 | header.addFormatField("FORMAT3", "1", "String", "descf3"); 23 | header.addFormatField("FORMAT1", "1", "String", "descf1"); 24 | PrintWriter out = new PrintWriter(System.out); 25 | header.print(out); 26 | out.close(); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/VisualizationPrep.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Given the results of both Jasmine and 
SURVIVOR, extract out shared and different merges, 3 | * producing a list of points and line segments which can be plotted to visualize the results. 4 | * 5 | * For now, this only works on datasets with two samples (i.e., two VCFs input to the merging software). 6 | */ 7 | import java.io.File; 8 | import java.io.FileInputStream; 9 | import java.io.PrintWriter; 10 | import java.util.ArrayList; 11 | import java.util.HashMap; 12 | import java.util.Scanner; 13 | import java.util.TreeSet; 14 | public class VisualizationPrep { 15 | 16 | // Empty string if whole genome or chromosome name for plotting that chromosome 17 | static String chrToPlot = "1"; 18 | 19 | // True iff we did one of the two with the samples in reverse order 20 | static boolean secondRev = false; 21 | 22 | // Whether or not each file was produced by SURVIVOR and needs to be parsed differently 23 | static boolean firstSurvivor = true; 24 | static boolean secondSurvivor = false; 25 | 26 | // Whether or not to print merges unique to one output file 27 | static boolean printUnique = false; 28 | 29 | static int sampleCount = 0; 30 | 31 | @SuppressWarnings("unchecked") 32 | public static void main(String[] args) throws Exception 33 | { 34 | // File containing a list of VCF files 35 | String fileList = "/home/mkirsche/eichler/filelist.txt"; 36 | 37 | // The resulting merged VCF files from both Jasmine and SURVIVOR 38 | String firstOutput = "/home/mkirsche/eichler/survmerged.vcf"; 39 | String secondOutput = "/home/mkirsche/eichler/merged.vcf"; 40 | 41 | // Get the list of VCF files 42 | ArrayList vcfsAsList = PipelineManager.getFilesFromList(fileList); 43 | String[] vcfs = new String[vcfsAsList.size()]; 44 | for(int i = 0; i[] positions = new ArrayList[vcfs.length]; 64 | ArrayList[] colors = new ArrayList[vcfs.length]; 65 | int[] colorCounts = new int[4]; 66 | HashMap[] idToEntry = new HashMap[vcfs.length]; 67 | 68 | // Hard-code the colors of the common variant types, so we know which color is which in downstream plotting 69 | // There may be other colors in the case of other types. 
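// Any additional type encountered later is assigned the next unused color index on the fly.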
70 | HashMap typeToInt = new HashMap(); 71 | typeToInt.put("INS", 0); 72 | typeToInt.put("DEL", 1); 73 | typeToInt.put("DUP", 2); 74 | typeToInt.put("INV", 3); 75 | 76 | // Iterate over input VCF files to record entries 77 | for(int i = 0; i(); 80 | colors[i] = new ArrayList(); 81 | idToEntry[i] = new HashMap(); 82 | Scanner input = new Scanner(new FileInputStream(new File(vcfs[i]))); 83 | 84 | // Read entries one at a time and store information about them 85 | while(input.hasNext()) 86 | { 87 | String line = input.nextLine(); 88 | if(line.length() == 0 || line.startsWith("#")) 89 | { 90 | continue; 91 | } 92 | VcfEntry entry = new VcfEntry(line); 93 | if(chrToPlot.length() > 0 && !entry.getChromosome().equals(chrToPlot)) continue; 94 | 95 | // Below is an example of how to restrict the plot to certain positions 96 | if(entry.getPos() > 10000000) continue; 97 | 98 | int pos = (int)entry.getPos(); 99 | positions[i].add(pos); 100 | idToEntry[i].put(entry.getId(), entry); 101 | if(!typeToInt.containsKey(entry.getType())) 102 | { 103 | typeToInt.put(entry.getType(), typeToInt.size()); 104 | } 105 | colors[i].add(typeToInt.get(entry.getType())); 106 | } 107 | input.close(); 108 | } 109 | 110 | // We now have the variant points, so output those for plotting 111 | for(int i = 0; i firstEdges = getJoinedPairs(firstOutput, firstSurvivor, false); 122 | System.out.println("Merges in first output: " + firstEdges.size()); 123 | TreeSet secondEdges = getJoinedPairs(secondOutput, secondSurvivor, secondRev); 124 | System.out.println("Merges in second output: " + secondEdges.size()); 125 | 126 | // Store the union of the merge-sets so we get every line segment 127 | TreeSet union = new TreeSet(); 128 | for(Merge s : secondEdges) union.add(s); 129 | for(Merge s : firstEdges) union.add(s); 130 | 131 | // For each line segment, color it based on which output it came from (possibly both) 132 | for(Merge edge : union) 133 | { 134 | String[] ids = new String[] {edge.id1, edge.id2}; 135 | int[] samples = new int[] {edge.sample1, edge.sample2}; 136 | boolean okay = true; 137 | int[] curPositions = new int[2]; 138 | for(int i = 0; i<2; i++) 139 | { 140 | if(idToEntry[samples[i]].containsKey(ids[i])) 141 | { 142 | curPositions[i] = (int)idToEntry[samples[i]].get(ids[i]).getPos(); 143 | } 144 | else 145 | { 146 | okay = false; 147 | break; 148 | } 149 | } 150 | if(!okay) 151 | { 152 | continue; 153 | } 154 | int color = 0; 155 | if(secondEdges.contains(edge)) color |= 2; 156 | if(firstEdges.contains(edge)) color |= 1; 157 | 158 | String firstSoftware = firstSurvivor ? "survivor" : "Jasmine"; 159 | String secondSoftware = secondSurvivor ? "survivor" : "Jasmine"; 160 | if(secondRev) secondSoftware += "rev"; 161 | 162 | colorCounts[color]++; 163 | 164 | // If the pair was only merged by one software, print out information about it 165 | if(color == 1 || color == 2) 166 | { 167 | VcfEntry first = idToEntry[samples[0]].get(ids[0]); 168 | VcfEntry second = idToEntry[samples[1]].get(ids[1]); 169 | 170 | if(printUnique) 171 | { 172 | System.out.println("Merge unique to " + (color == 1 ? 
firstSoftware : secondSoftware)); 173 | System.out.println(" " + ids[0] + " " + first.getType() + " " + first.getStrand() + " at " + first.getPos() + " (length " + first.getLength() + ")"); 174 | System.out.println(" " + ids[1] + " " + second.getType() + " " + second.getStrand() + " at " + second.getPos() + " (length " + second.getLength() + ")"); 175 | System.out.println(" " + edge.line); 176 | System.out.println(" Samples: " + edge.sample1 + " " + edge.sample2); 177 | Variant a = VariantInput.fromVcfEntry(first, 0), b = VariantInput.fromVcfEntry(second, 0); 178 | System.out.println(" Distance according to Jasmine: " + a.distance(b)); 179 | } 180 | } 181 | 182 | // Print the line segment 183 | out.println(curPositions[0]+" "+ys[edge.sample1]+" "+curPositions[1]+" "+ys[edge.sample2]+" "+color); 184 | } 185 | System.out.println("First output unique merges: " + colorCounts[1]); 186 | System.out.println("Second output unique merges: " + colorCounts[2]); 187 | System.out.println("Shared merges: " + colorCounts[3]); 188 | out.close(); 189 | 190 | 191 | } 192 | 193 | /* 194 | * For a given merged VCF file, get the list of all pairs of variants which were joined 195 | * For now, assumes only 2 samples, and the survivor flag is true if SURVIVOR was used 196 | * and false if Jasmine was used instead. 197 | */ 198 | static TreeSet getJoinedPairs(String fn, boolean survivor, boolean rev) throws Exception 199 | { 200 | Scanner input = new Scanner(new FileInputStream(new File(fn))); 201 | TreeSet res = new TreeSet(); 202 | while(input.hasNext()) 203 | { 204 | String line = input.nextLine(); 205 | if(line.length() == 0 || line.startsWith("#")) continue; 206 | VcfEntry entry = new VcfEntry(line); 207 | if(chrToPlot.length() > 0 && !entry.getChromosome().equals(chrToPlot)) continue; 208 | String supportVector = entry.getInfo("SUPP_VEC"); 209 | ArrayList samples = new ArrayList(); 210 | for(int i = 0; i ids = new ArrayList(); 221 | for(int i = 9; i 260 | { 261 | String id1, id2; 262 | int sample1, sample2; 263 | String line; 264 | Merge(String ii1, String ii2, int ss1, int ss2, String ll) 265 | { 266 | line = ll; 267 | id1 = ii1; 268 | id2 = ii2; 269 | sample1 = ss1; 270 | sample2 = ss2; 271 | } 272 | @Override 273 | public int compareTo(Merge o) { 274 | if(sample1 != o.sample1) 275 | { 276 | return sample1 - o.sample1; 277 | } 278 | if(sample2 != o.sample2) 279 | { 280 | return sample2 - o.sample2; 281 | } 282 | if(!id1.equals(o.id1)) 283 | { 284 | return id1.compareTo(o.id1); 285 | } 286 | return id2.compareTo(o.id2); 287 | } 288 | } 289 | } 290 | -------------------------------------------------------------------------------- /test_data/a.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##contig= 3 | ##contig= 4 | ##INFO= 5 | ##INFO= 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##FORMAT= 13 | 1 1000000 1 CACGTACGTACGTACGTACGTACGTACTGACGTACGT C . PASS PRECISE;CHR2=1;END=1000036;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-36;STRANDS=+-;RE=12;IS_SPECIFIC=1 GT 1/1 14 | 1 1999960 2 C CACGTACGTACGTACGTACGTACGTACTGACGTACGT . PASS PRECISE;CHR2=1;END=1999960;SVTYPE=INS;SUPTYPE=AL;SVLEN=36;STRANDS=+-;RE=10;IS_SPECIFIC=1 GT 1/1 15 | 1 3000000 3 N ]2:1000000]N . PASS PRECISE;CHR2=2;END=3000000;SVTYPE=BND;SUPTYPE=AL;SVLEN=1;STRANDS=-+;RE=15;IS_SPECIFIC=1 GT 1/1 16 | 1 4000200 4 CACGTACGTACGTACGTACGTACGTACTGACGT C . 
PASS PRECISE;CHR2=1;END=4000232;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;RE=12;IS_SPECIFIC=1 GT 1/1 17 | 18 | -------------------------------------------------------------------------------- /test_data/b.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##contig= 3 | ##contig= 4 | ##INFO= 5 | ##INFO= 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##FORMAT= 13 | 1 1000000 1 CACGTACGTACGTACGTACGTACGTACTGACGT C . PASS PRECISE;CHR2=1;END=1000032;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;RE=12;IS_SPECIFIC=1 GT 1/1 14 | 1 2000045 2 C CACGTACGTACGTACGTACGTACGTACTGACGTACGTACGT . PASS PRECISE;CHR2=1;END=2000045;SVTYPE=INS;SUPTYPE=AL;SVLEN=40;STRANDS=+-;RE=10;IS_SPECIFIC=1 GT 1/1 15 | 1 3000000 3 N ]2:1000000]N . PASS PRECISE;CHR2=2;END=3000000;SVTYPE=BND;SUPTYPE=AL;SVLEN=1;STRANDS=-+;RE=15;IS_SPECIFIC=1 GT 1/1 16 | 1 4000000 4 CACGTACGTACGTACGTACGTACGTACTGACGT C . PASS PRECISE;CHR2=1;END=4000032;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;RE=12;IS_SPECIFIC=1 GT 1/1 17 | 18 | -------------------------------------------------------------------------------- /test_data/c.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##contig= 3 | ##contig= 4 | ##INFO= 5 | ##INFO= 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##INFO= 15 | ##INFO= 16 | ##INFO= 17 | ##INFO= 18 | ##INFO= 19 | ##INFO= 20 | ##INFO= 21 | ##INFO= 22 | ##INFO= 23 | ##INFO= 24 | ##FORMAT= 25 | 1 4000200 0_4 CACGTACGTACGTACGTACGTACGTACTGACGT C . PASS PRECISE;CHR2=1;END=4000232;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;RE=12;IS_SPECIFIC=1;STARTVARIANCE=0.000000;ENDVARIANCE=0.000000;AVG_LEN=-32.000000;AVG_START=4000200.000000;AVG_END=4000232.000000;SUPP_VEC_EXT=10;IDLIST_EXT=4;SUPP_EXT=1;SUPP_VEC=10;SUPP=1;SVMETHOD=JASMINE;IDLIST=4 GT 1/1 26 | 1 1000000 0_1 CACGTACGTACGTACGTACGTACGTACTGACGTACGT C . PASS PRECISE;CHR2=1;END=1000036;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-36;STRANDS=+-;RE=12;IS_SPECIFIC=1;STARTVARIANCE=0.000000;ENDVARIANCE=4.000000;AVG_LEN=-34.000000;AVG_START=1000000.000000;AVG_END=1000034.000000;SUPP_VEC_EXT=11;IDLIST_EXT=1,1;SUPP_EXT=2;SUPP_VEC=11;SUPP=2;SVMETHOD=JASMINE;IDLIST=1,1 GT 1/1 27 | 1 1999960 0_2 C CACGTACGTACGTACGTACGTACGTACTGACGTACGT . PASS PRECISE;CHR2=1;END=1999960;SVTYPE=INS;SUPTYPE=AL;SVLEN=36;STRANDS=+-;RE=10;IS_SPECIFIC=1;STARTVARIANCE=1806.250000;ENDVARIANCE=1806.250000;AVG_LEN=38.000000;AVG_START=2000002.500000;AVG_END=2000002.500000;SUPP_VEC_EXT=11;IDLIST_EXT=2,2;SUPP_EXT=2;SUPP_VEC=11;SUPP=2;SVMETHOD=JASMINE;IDLIST=2,2 GT 1/1 28 | 1 3000000 0_3 N ]2:1000000]N . PASS PRECISE;CHR2=2;END=3000000;SVTYPE=BND;SUPTYPE=AL;SVLEN=1;STRANDS=-+;RE=15;IS_SPECIFIC=1;STARTVARIANCE=0.000000;ENDVARIANCE=0.000000;AVG_LEN=1.000000;AVG_START=3000000.000000;AVG_END=3000000.000000;SUPP_VEC_EXT=11;IDLIST_EXT=3,3;SUPP_EXT=2;SUPP_VEC=11;SUPP=2;SVMETHOD=JASMINE;IDLIST=3,3 GT 1/1 29 | 1 4000000 1_4 CACGTACGTACGTACGTACGTACGTACTGACGT C . PASS PRECISE;CHR2=1;END=4000032;SVTYPE=DEL;SUPTYPE=AL;SVLEN=-32;STRANDS=+-;RE=12;IS_SPECIFIC=1;STARTVARIANCE=0.000000;ENDVARIANCE=0.000000;AVG_LEN=-32.000000;AVG_START=4000000.000000;AVG_END=4000032.000000;SUPP_VEC_EXT=01;IDLIST_EXT=4;SUPP_EXT=1;SUPP_VEC=01;SUPP=1;SVMETHOD=JASMINE;IDLIST=4 GT 1/1 30 | --------------------------------------------------------------------------------
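Note on the test data: c.vcf is the expected result of merging a.vcf and b.vcf with Jasmine. Variants 1, 2, and 3 pair up across the two samples (SUPP_VEC=11), while the two nearby deletions at positions 4000200 and 4000000 stay separate (SUPP_VEC=10 and 01) because their breakpoints are 200bp apart, beyond the merging distance used. A minimal sketch of reproducing this, assuming the jasmine wrapper script from the repository root is on the PATH and using an illustrative file list name:

  ls test_data/a.vcf test_data/b.vcf > filelist.txt
  jasmine file_list=filelist.txt out_file=merged.vcf

smalltest.sh in the repository root runs an equivalent end-to-end check.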