├── .gitignore ├── .gitmodules ├── .travis.yml ├── LICENSE ├── README.md ├── README_NIM.md ├── SVNibbler.png ├── makefile ├── multimedia ├── nibSV.jpg └── nibSV_presentation.pptx ├── nib.nimble ├── nim.cfg ├── src ├── nibpkg │ ├── captain.nim │ ├── classify.nim │ ├── compose.nim │ ├── kmers.nim │ ├── read.nim │ ├── refmers.nim │ ├── reporter.nim │ ├── svidx.nim │ ├── util.nim │ └── welcome.nim └── nibsv.nim ├── test-data ├── GIAB-chr22.vcf ├── GIAB_PBSV_TRIO_CALLS.vcf ├── GIAB_PBSV_TRIO_CALLS_TEST2.vcf ├── GIAB_PBSV_TRIO_CALLS_TEST2_regions.bed ├── README.md ├── event_four.bam ├── event_four.bam.bai ├── event_one.bam ├── event_one.bam.bai ├── event_three.bam ├── event_three.bam.bai ├── event_two.bam └── event_two.bam.bai ├── tests ├── .gitignore ├── all.nim ├── config.nims ├── foo.fasta ├── foo.fasta.fai ├── makefile ├── nim.cfg ├── t_composer.nim ├── t_kmers.nim ├── t_read.nim ├── t_refmers.nim ├── t_svidx.nim ├── t_util.nim └── t_welcome.nim └── vendor └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /nimbleDir 2 | /nibsv 3 | /src/nibsv 4 | .DS_Store 5 | *.dSYM 6 | *.msgpck 7 | tests/test_read 8 | tests/test_svidx 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "vendor/STRling"] 2 | path = vendor/STRling 3 | url = https://github.com/quinlan-lab/STRling.git 4 | ignore = dirty 5 | [submodule "vendor/threadpools"] 6 | path = vendor/threadpools 7 | url = https://github.com/yglukhov/threadpools.git 8 | ignore = dirty 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | services: 3 | - docker 4 | before_install: 5 | - docker pull nimlang/nim 6 | script: 7 | - docker run nimlang/nim nim --version 8 | - docker run -v "$(pwd):/project" -w /project nimlang/nim sh -c "make tests" 9 | # - docker run -v "$(pwd):/project" -w /project nimlang/nim sh -c "find src/ -name '*.nim' -type f -exec nim doc {} \;" 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 collaborativebioinformatics 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # DEPRICATED! Moved to : https://github.com/fritzsedlazeck/nibSV 4 | 5 | 6 | 7 | # NibblerSV 8 | 9 | ## Contributors 10 | 11 | Brent Pederson1, Christopher Dunn2, Eric Dawson3, Fritz Sedlazeck4, Peter Xie5, and Zev Kronenberg2 12 | 13 | 1 University of Utah; 2PacBio; 3Nvidia Corporation; 4Baylor College of Medicine; 5JBrowse (UC Berkeley); 14 | 15 | ## Intro statement 16 | Structural variation (SV) are the largest source of genetic variation within the human population. Long read DNA sequencing is becoming the preferred method for discovering structural variants. Structural variation can be longer than a short-read (<500bp) DNA trace, meaning the SV allele is not contained, which causes challenges and problems in the detection. 17 | 18 | Long read sequencing has proven superior to identify Structural Variations in individuals. Nevertheless, it is important to obtain accurate allele frequencies of these complex alleles across a population to rank and identify potential pathogenic variations. Thus, it is important to be able to genotype SV events in a large set of previously short read based sequenced samples (e.g. 1000genomes, Topmed, CCDG, etc.). Two main approaches has been recently shown to achieve this with high accuracy even for insertions: Paragraph and VG. However, these methods still consume hours per sample and even more depending on the number of SV to be genotyped along the genome or in regions. Furthermore and maybe more crucially rely on precise breakpoints that do not change in other samples. This assumption might be flawed over repetitive regions. In addition the problem currently arises that some data sets are mapped to different genomic version than others (e.g hg19 vs. GRCH38 vs. CHM13) and will require a different VCF catalog to be genotyped. 19 | 20 | # Why NibblerSV 21 | NibblerSV can overcome these challenges. NibblerSV relies on a k-mer based strategy to identify SV breakpoints in short read data set. Due to innovative k-mer design and efficient implementation, NibblerSV is able to run on a 30x cram file within minutes with low memory requirements. Its k-mer strategy of spaced k-mers allow a relaxed constrain on the precision of the breakpoint. In addition, utilizing k-mers NibblerSV is independent of the genomic reference the short reads were aligned to and can even work on raw fastq reads. This makes NibblerSV a lightweight, scalable and easy to apply methods to identify the frequency of Structural Variatons. 22 | 23 | 24 | Who doesn't like to nibble on SV? 25 | # How does it work ? 26 | NibblerSV is a light weighted framework to identify the presence and absence of Structural Variations across a large set of Illumina sequenced samples. To achieve this we take a VCF file including all the SV that should be genotyped. Next, we extract the reference and alternative allele kmers. This is done such that we include the flanking regions. Subsequently, we count the occurrence of these k-mers in the reference fasta file. This is necessary to not miscount certain k-mers. To enable large scaling of NibblerSV the results of these two steps are written into a temporary file, which is all that is needed for the actual genotyping step. 27 | 28 | During the genotyping step NibblerSV uses the small temporary file and the bam/cram file of the sample. NibblerSV then identifies the presence /absence of the reference and alternative k-mer across the entire sample. This is very fast and requires only minimal resources of memory as the number of k-mers is limited. Once NibblerSV finished the scanning of the bam/cram file it reports out which SV have been re-identified by adding a tag in the output VCF file of this sample. The VCF per sample can then be merged to obtain population frequencies. 29 | 30 | ![alt text](multimedia/nibSV.jpg) 31 | 32 | # How to use 33 | 34 | To run nibblerSV just execute this example which uses the test data provided. You should have a copy of GRCh38 available to run this. 35 | ``` 36 | ./src/nibsv main -v test-data/GIAB_PBSV_TRIO_CALLS_TEST2.vcf -r hg38.fa.gz --reads-fn test-data/event_one.bam -p HG02 37 | ``` 38 | 39 | Full usage: 40 | ``` 41 | (base) ZKRONENBERG-MAC:nibSV zkronenberg$ ./src/nibsv main -h 42 | Usage: 43 | main [required&optional-params] 44 | Generate a SV kmer database, and genotype new samples. If a file called "{prefix}.sv_kmers.msgpack" exists, use it. Otherwise, 45 | generate it. 46 | Options: 47 | -h, --help print this cligen-erated help 48 | --help-syntax advanced: prepend,plurals,.. 49 | -v=, --variants-fn= string REQUIRED long read VCF SV calls 50 | -r=, --refSeq-fn= string REQUIRED reference genome FASTA, compressed OK 51 | --reads-fn= string REQUIRED input short-reads in BAM/SAM/CRAM/FASTQ 52 | -p=, --prefix= string "test" output prefix 53 | -k=, --kmer-size= int 25 kmer size, for spaced seeds use <=16 otherwise <=32 54 | -s, --spaced-seeds bool false turn on spaced seeds 55 | --space= int 50 width between spaced kmers 56 | -f=, --flank= int 100 number of bases on either side of ALT/REF in VCF records 57 | -m=, --max-ref-kmer-count= uint32 0 max number of reference kmers allowed in SV event 58 | ``` 59 | 60 | 61 | # Quickstart 62 | ## Input 63 | 1. A Strucutural variant VCF 64 | 2. An indexed FASTA file of the reference genome 65 | 3. A BAM/CRAM file (new genome) 66 | 67 | ## Output 68 | A VCF file with a tag in INFO field identifying the present/ absance for each SV. 69 | 70 | # Testing 71 | We have tested NibblerSV on HG002 from GIAB and various other control data sets. 72 | 73 | # Installation 74 | 75 | ## Install Nim 76 | * https://nim-lang.org/install.html 77 | 78 | See also [README_NIM.md](README_NIM.md) 79 | 80 | ## Install htslib 81 | This needs to be available as a dynamically loadable library 82 | on your system. 83 | 84 | * http://www.htslib.org/download/ 85 | 86 | ## Setup and build 87 | ```sh 88 | make setup 89 | make build 90 | 91 | # Or for faster executable 92 | make release 93 | ``` 94 | -------------------------------------------------------------------------------- /README_NIM.md: -------------------------------------------------------------------------------- 1 | ### Installing Nim 2 | 3 | * https://nim-lang.org/install.html 4 | 5 | Then, if you want to control which version you have: 6 | 7 | ``` 8 | nimble install nimble 9 | export PATH=~/.nimble/bin:$PATH 10 | choosenim stable 11 | ``` 12 | -------------------------------------------------------------------------------- /SVNibbler.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collaborativebioinformatics/nibSV/3fec3eb8fda8ee1c21879287f4c5660dd3961998/SVNibbler.png -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | #NIMBLE_DIR?=${CURDIR}/nimbleDir 2 | #export NIMBLE_DIR 3 | # Alternatively, use --nimbleDir:${NIMBLE_DIR} everywhere 4 | UNAME=$(shell uname) 5 | ifeq (${UNAME},Darwin) 6 | install=install_name_tool -add_rpath /opt/local/lib 7 | else 8 | install=echo 9 | endif 10 | 11 | build: 12 | nim c src/nibsv.nim 13 | ${install} src/nibsv 14 | release: 15 | nim c -d:release -d:danger src/nibsv.nim 16 | all: 17 | ${MAKE} install 18 | quick: 19 | nim c -r tests/t_kmers.nim 20 | nim c -r tests/t_util.nim 21 | help: 22 | nimble -h 23 | nimble tasks 24 | tests: 25 | @# much faster than nimble 26 | ${MAKE} -C tests 27 | test: 28 | nimble test # uses "tests/" directory by default 29 | integ-test: 30 | @echo 'integ-test TBD' 31 | install: 32 | nimble install -y 33 | pretty: 34 | find src -name '*.nim' | xargs -L1 nimpretty --maxLineLen=1024 35 | find tests -name '*.nim' | xargs -L1 nimpretty --maxLineLen=1024 36 | setup: #vendor/threadpools vendor/STRling 37 | nimble install --verbose -y hts kmer bitvector cligen msgpack4nim 38 | #cd vendor/threadpools; nimble install --verbose -y 39 | #cd vendor/STRling; nimble install --verbose -y 40 | vendor/threadpools vendor/STRling: 41 | git submodule update --init 42 | rsync: # not used for now 43 | mkdir -p ${NIMBLE_DIR}/pkgs/ 44 | rsync -av vendor/STRling/ ${NIMBLE_DIR}/pkgs/strling-0.3.0/ 45 | rsync -av vendor/threadpools/ ${NIMBLE_DIR}/pkgs/threadpools-0.1.0/ 46 | 47 | .PHONY: test tests 48 | -------------------------------------------------------------------------------- /multimedia/nibSV.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collaborativebioinformatics/nibSV/3fec3eb8fda8ee1c21879287f4c5660dd3961998/multimedia/nibSV.jpg -------------------------------------------------------------------------------- /multimedia/nibSV_presentation.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collaborativebioinformatics/nibSV/3fec3eb8fda8ee1c21879287f4c5660dd3961998/multimedia/nibSV_presentation.pptx -------------------------------------------------------------------------------- /nib.nimble: -------------------------------------------------------------------------------- 1 | # Package 2 | 3 | version = "0.2.0" 4 | author = "Zev Kronenberg" 5 | author = "Christopher Dunn" 6 | author = "(Add your name here)" 7 | description = "Structural Variant nibbles" 8 | license = "BSD-3-Clause" 9 | srcDir = "src" 10 | installDirs = @["nibpkg"] 11 | bin = @["nibsv"] 12 | 13 | 14 | # Dependencies 15 | 16 | requires "nim >= 1.2.0", "hts", "kmer", "bitvector >= 0.4.10", "cligen", "msgpack4nim" 17 | -------------------------------------------------------------------------------- /nim.cfg: -------------------------------------------------------------------------------- 1 | --hint[Conf]:off 2 | --hint[XDeclaredButNotUsed]:off 3 | --hint[Processing]:off 4 | --hint[Name]:off 5 | #--warning[UnusedImport]:off 6 | -------------------------------------------------------------------------------- /src/nibpkg/captain.nim: -------------------------------------------------------------------------------- 1 | # vim: sw=4 ts=4 sts=4 tw=0 et: 2 | import refmers 3 | import svidx 4 | import strformat 5 | import classify 6 | import reporter 7 | #from ./read import `$` 8 | from os import nil 9 | from tables import len 10 | 11 | proc main_runner*(variants_fn, refSeq_fn, reads_fn: string, prefix = "test", kmer_size: int = 25, spaced_seeds : bool = false, space: int = 0, flank: int = 100, maxRefKmerCount : uint32 = 1 ) = 12 | ## Generate a SV kmer database, and genotype new samples. 13 | ## If a file called "{prefix}.sv_kmers.msgpack" exists, use it. 14 | ## Otherwise, generate it. 15 | var index_fn = "{prefix}.sv_kmers.msgpck".fmt 16 | var idx: SvIndex 17 | 18 | if not os.fileExists(index_fn): 19 | echo "building an SV kmer DB." 20 | let sp = if spaced_seeds: 21 | space 22 | else: 23 | 0 24 | idx = buildSvIndex(refSeq_fn, variants_fn, flank, kmer_size, sp) 25 | echo "updating reference kmer counts." 26 | updateSvIndex(refSeq_fn, idx, kmer_size, 1000000, sp) 27 | echo "dumpIndexToFile:'", index_fn, "'" 28 | dumpIndexToFile(idx, index_fn) 29 | else: 30 | echo "loadIndexFromFile:'", index_fn, "'" 31 | idx = loadIndexFromFile(index_fn, kmer_size) 32 | 33 | echo "final idx contains: {idx.len} forward and reverse SV kmers.".fmt 34 | 35 | 36 | filterRefKmers(idx, maxRefKmerCount) 37 | 38 | 39 | #echo dumpIndexToJson(idx) 40 | 41 | 42 | let classifyCount = classify_file(reads_fn, idx, kmer_size, spaced_seeds, space) 43 | 44 | #echo "classifyCount:" 45 | #echo classifyCount 46 | 47 | 48 | echo "reporting variants." 49 | 50 | report(variants_fn, classifyCount, idx, prefix) 51 | 52 | echo "nibbleSV finished without problems, goodbye!" 53 | 54 | 55 | when isMainModule: 56 | import cligen 57 | dispatch(main_runner) 58 | -------------------------------------------------------------------------------- /src/nibpkg/classify.nim: -------------------------------------------------------------------------------- 1 | import strutils 2 | import tables 3 | import hts 4 | import ./read 5 | import ./svidx 6 | from ./compose import nil 7 | 8 | proc buildSvIndex*(reference_path: string, vcf_path: string, flank: int = 100, k: int = 25, space: int = 0): SvIndex = 9 | ## Open FASTA index 10 | var fai: Fai 11 | doAssert fai.open(reference_path), "Failed to open FASTA file: " & reference_path 12 | 13 | var variants: VCF 14 | doAssert(open(variants, vcf_path)) 15 | 16 | result.kmerSize = k.uint8 17 | 18 | var sv_idx = 0 19 | echo "flank:", flank 20 | for v in variants: 21 | let sv_chrom = $v.CHROM 22 | 23 | let flanks = compose.retrieve_flanking_sequences_from_fai(fai, $v.CHROM, v.start.int, v.stop.int, flank) 24 | var p = compose.composePositioned(v, flanks.left, flanks.right, k, space) 25 | 26 | result.insert(p.sequences.alt_seq, k, sv_idx) 27 | # The insert function allows us to add to the ref count, but refmer also 28 | # adds the same counts, so for now i'm commenting this out to minimize the 29 | # lines of code we are debugging. --Zev 30 | # result.insert(p.sequences.ref_seq, k, -1) 31 | 32 | sv_idx.inc 33 | 34 | proc classify_bam(filename: string, idx: SvIndex, k: int = 25, spacedSeeds: bool = false, space: int = 50, threads: int = 2): CountTableRef[uint32] = 35 | new(result) 36 | 37 | var bamfile: Bam 38 | open(bamfile, filename, index = false, threads=threads) 39 | var sequence: string 40 | 41 | for record in bamfile: 42 | # NOTE: we may also want to filter record.flag.dup in the future, but 43 | # that will make results differ between bam and fastq 44 | if record.flag.secondary or record.flag.supplementary: continue 45 | record.sequence(sequence) 46 | 47 | var read_classification = process_read(sequence, idx, k, spacedSeeds, space) 48 | 49 | #if read_classification.compatible_SVs.len != 0: 50 | # echo read_classification 51 | 52 | filter_read_matches(read_classification, winner_takes_all=false) 53 | for svId, count in read_classification.compatible_SVs: 54 | result.inc(svId) 55 | 56 | #echo result 57 | 58 | 59 | proc classify_file*(filename: string, idx: SvIndex, k: int = 25, spacedSeeds: bool = false, space: int = 50): CountTableRef[uint32] = 60 | if endsWith(filename, ".bam"): 61 | return classify_bam(filename, idx, k, spacedSeeds, space) 62 | else: 63 | quit("Error: only BAM input currently supported.") 64 | 65 | proc main_classify*(read_file: string, vcf_file: string, ref_file: string, k: int = 25, flank: int = 100) = 66 | var idx: SvIndex = buildSvIndex(ref_file, vcf_file, flank, k) 67 | var svCounts: CountTableRef[uint32] = classify_file(read_file, idx, k) 68 | -------------------------------------------------------------------------------- /src/nibpkg/compose.nim: -------------------------------------------------------------------------------- 1 | # vim: sw=2 ts=2 sts=2 tw=0 et ft=python: 2 | import hts 3 | import kmers 4 | 5 | type 6 | FlankSeq* = object 7 | left*, right*: string 8 | 9 | type 10 | PositionedSequence* = object 11 | sequences*: tuple[ref_seq: string, alt_seq: string] 12 | kmers: tuple[ref_kmers: seq[seed_t], alt_kmers: seq[seed_t]] 13 | chrom: string 14 | position: int32 15 | 16 | proc retrieve_flanking_sequences_from_fai*(fastaIdx: Fai, chrom: string, 17 | start_pos: int, end_pos: int, flank: int): FlankSeq = 18 | ## this function lacks a return 19 | result.left = fastaIdx.get(chrom, max(0, start_pos - flank), start_pos) 20 | result.right = fastaIdx.get(chrom, end_pos, end_pos + flank) 21 | 22 | proc kmerize(s: string, k: int = 25, space: int = 0): seq[seed_t] = 23 | var kmers = Dna(s).dna_to_kmers(k) 24 | if space > 0: 25 | kmers = spacing_kmer(kmers, space) 26 | return kmers.seeds 27 | 28 | proc composePositioned*(variant: Variant, left_flank: string, 29 | right_flank: string, k: int = 25 ; space: int = 0): PositionedSequence = 30 | ## Takes in a VCF variant, the 5' and 3' reference flanking sequences, 31 | ## and a kmer size. Produces a PositionedSequence, which holds the ref/alt 32 | ## sequences as well as the kmers of those sequences (in addition to 33 | ## minimal position information) 34 | var variant_type: string 35 | doAssert variant.info.get("SVTYPE", variant_type) == Status.OK 36 | if variant_type == "DEL": 37 | var deleted_bases: string = $variant.REF ## Chop the reference base prefix in the REF allele. 38 | result.sequences.ref_seq = left_flank & deleted_bases & right_flank 39 | result.sequences.alt_seq = left_flank & right_flank 40 | if k > 0: 41 | result.kmers.ref_kmers = kmerize(result.sequences.ref_seq, k, space) 42 | result.kmers.alt_kmers = kmerize(result.sequences.alt_seq, k, space) 43 | elif variant_type == "INS": 44 | # the first base in the alt string is ref (silly VCF format). ^1 prevents going off the end of the seq (which ^0 did) 45 | var inserted_seq: string = variant.ALT[0][1 .. ^1] ## Chop the reference base prefix in the ALT allele. 46 | result.sequences.ref_seq = left_flank & right_flank 47 | result.sequences.alt_seq = left_flank & inserted_seq & right_flank 48 | if k > 0: 49 | result.kmers.ref_kmers = kmerize(result.sequences.ref_seq, k, space) 50 | result.kmers.alt_kmers = kmerize(result.sequences.alt_seq, k, space) 51 | elif variant_type == "INV": 52 | return 53 | #raise newException(ValueError, 54 | #"Error: Inversion processing not implemented.") 55 | 56 | result.position = int32(variant.start) - int32(len(right_flank)) 57 | result.chrom = $variant.CHROM 58 | 59 | 60 | proc compose_variants*(variant_file: string, reference_file: string; k: int = 31, space: int = 0): seq[ 61 | PositionedSequence] = 62 | ## function to compose variants from their sequence / FASTA flanking regions 63 | ## Returns a Sequence of strings representing the DNA sequence of the flanking 64 | ## regions and variant sequence. 65 | 66 | var composed_seqs = newSeq[PositionedSequence]() 67 | 68 | ## Open FASTA index 69 | var fai: Fai 70 | if not fai.open(reference_file): 71 | quit ("Failed to open FASTA file: " & reference_file) 72 | 73 | var variants: VCF 74 | doAssert(open(variants, variant_file)) 75 | 76 | 77 | for v in variants: 78 | var variant_type: string 79 | if v.info.get("SVTYPE", variant_type) != Status.OK: 80 | continue 81 | let sv_chrom = $v.CHROM 82 | ## Retrieve flanks, either from FAI or string cache 83 | let flanks = retrieve_flanking_sequences_from_fai(fai, sv_chrom, int( 84 | v.start), int(v.stop), 100) 85 | ## Generate a single sequence from variant seq + flank, 86 | ## taking into account the variant type. 87 | var variant_seq = composePositioned(v, flanks.left, flanks.right, k, space) 88 | composed_seqs.add(variant_seq) 89 | 90 | return composed_seqs 91 | 92 | when isMainModule: 93 | import cligen 94 | dispatch(compose_variants) 95 | -------------------------------------------------------------------------------- /src/nibpkg/kmers.nim: -------------------------------------------------------------------------------- 1 | # vim: sw=4 ts=4 sts=4 tw=0 et: 2 | import deques 3 | import tables 4 | #from sets import nil 5 | from algorithm import sort 6 | from hashes import nil 7 | from strutils import format 8 | from ./util import raiseEx, PbError 9 | 10 | export PbError 11 | 12 | type 13 | Dna* = string # someday, this might be an array 14 | Bin* = uint64 # compact bitvector of DNA 15 | ## In bitvector, A is 0, C is 1, G is two, and T is 3. 16 | 17 | Min* = uint64 # minimizer 18 | Strand* = enum 19 | forward, reverse 20 | 21 | ## kmer - a uint64 supporting a maximum of 32 DNA bases. 22 | ## pos - position along the sequence 23 | seed_t* = object 24 | kmer*: Bin 25 | pos*: uint32 26 | strand*: Strand 27 | 28 | minimizer_t* = object 29 | minimizer*: Min 30 | pos*: uint32 31 | strand*: Strand 32 | 33 | ## a & b are two seed_t's designed for matching in the hash lookup 34 | seed_pair_t* = object 35 | a*: seed_t 36 | b*: seed_t 37 | 38 | Hash* = int 39 | 40 | ## seeds - a pointer to the kmers 41 | pot_t* = ref object of RootObj 42 | word_size*: uint8 # <=32 43 | seeds*: seq[seed_t] 44 | 45 | ## searchable seed-pot 46 | spot_t* = ref object of pot_t 47 | ht*: tables.TableRef[Bin, int] 48 | 49 | var seq_nt4_table: array[256, int] = [ 50 | 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 51 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 52 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 53 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 54 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 55 | 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 56 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 57 | 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 58 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 59 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 60 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 61 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 62 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 63 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 64 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 65 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4] 66 | 67 | 68 | ## @return uninitialized 69 | # 70 | proc newDna(size: int): Dna = 71 | return newString(size) 72 | 73 | # hashes for sets and tables 74 | 75 | proc hash*(s: kmers.seed_t): hashes.Hash = 76 | #hashes.hash(s.pos) + hashes.hash(s.kmer shl 8) + hashes.hash(s.strand) 77 | hashes.hash([s.pos.int64, s.kmer.int64, s.strand.int64]) 78 | 79 | proc hash*(p: kmers.seed_pair_t): hashes.Hash = 80 | hashes.hash([hash(p.a), hash(p.b)]) 81 | 82 | # convenience for C coders 83 | 84 | template `<<`(a, b: uint64): uint64 = 85 | a shl b 86 | 87 | template `>>`(a, b: uint64): uint64 = 88 | a shr b 89 | 90 | ## Return binary version of kmer exactly matching Dna. 91 | ## Mostly for testing, as this is not efficient for sliding windows. 92 | ## NOT FINISHED. 93 | # 94 | proc encode(sq: Dna) = 95 | assert sq.len() <= 32 96 | let k = sq.len() 97 | let 98 | shift1: uint64 = 2'u64 * (k - 1).uint64 99 | mask: uint64 = (1'u64 << (2 * k).uint64) - 1 100 | var 101 | forward_bin: Bin = 0 102 | reverse_bin: Bin = 0 103 | for i in 0 ..< k: 104 | let ch = cast[uint8](sq[i]) 105 | let c = seq_nt4_table[ch].uint64 106 | assert c < 4 107 | forward_bin = (forward_bin << 2 or c) and mask 108 | reverse_bin = (reverse_bin >> 2) or ( 109 | 3'u64 xor c) << shift1 110 | 111 | ## Converts a char * into a set of seed_t objects. 112 | ## @param sq - sequence 113 | ## @param k - kmer size (<=32) 114 | ## @return pot 115 | # 116 | proc dna_to_kmers*(sq: Dna; k: int): pot_t = 117 | if k > 32: 118 | raiseEx("k > 32") 119 | 120 | let 121 | shift1: uint64 = 2'u64 * (k - 1).uint64 122 | mask: uint64 = (1'u64 << (2 * k).uint64) - 1 123 | #echo format("shift1=$# mask=$#", shift1, mask) 124 | 125 | var forward_kmer: seed_t 126 | var reverse_kmer: seed_t 127 | 128 | forward_kmer.kmer = 0 129 | forward_kmer.pos = 0 130 | reverse_kmer.kmer = 0 131 | reverse_kmer.pos = 0 132 | forward_kmer.strand = forward 133 | reverse_kmer.strand = reverse 134 | 135 | var kmers: pot_t 136 | new(kmers) 137 | kmers.seeds = newSeqOfCap[seed_t](max(sq.len - int(k) + 1,0)) 138 | kmers.word_size = k.uint8 139 | 140 | ## lk is the length of the kmers being built on the fly. The variable n is the total number of 141 | var 142 | i: int 143 | lk: int 144 | n: int 145 | i = 0 146 | lk = 0 147 | n = 0 148 | 149 | while i < sq.len(): 150 | let ch = cast[uint8](sq[i]) 151 | let c = seq_nt4_table[ch].uint64 152 | if c < 4: 153 | forward_kmer.kmer = (forward_kmer.kmer << 2 or c) and mask 154 | reverse_kmer.kmer = (reverse_kmer.kmer >> 2) or ( 155 | 3'u64 xor c) << shift1 156 | #echo format("[$#]=$# $#==$#($# $#) f:$# r:$#", 157 | # i, sq[i], ch, c, (3'u8 xor c), (3'u8 xor c).uint64 shl shift1, forward_kmer.kmer, reverse_kmer.kmer) 158 | inc(lk) 159 | else: 160 | ## advance the window beyond the unknown character 161 | lk = 0 162 | inc(i, k) 163 | inc(forward_kmer.pos, k) 164 | forward_kmer.kmer = 0 165 | inc(reverse_kmer.pos, k) 166 | reverse_kmer.kmer = 0 167 | 168 | if lk >= k: 169 | inc(n, 2) 170 | kmers.seeds.add(forward_kmer) 171 | kmers.seeds.add(reverse_kmer) 172 | inc(forward_kmer.pos, 1) 173 | inc(reverse_kmer.pos, 1) 174 | inc(i) 175 | 176 | 177 | 178 | 179 | return kmers 180 | 181 | ## A function to convert the binary DNA back into character 182 | ## @param kmer up to 32 2-bit bases 183 | ## @param k kmer length 184 | ## @param strand If reverse, start at kth bit and go backwards. 185 | ## 186 | ## Zero is A, one is C, G is two, and T is 3 187 | # 188 | proc bin_to_dna*(kmer: Bin; k: uint8; strand: Strand = forward): Dna = 189 | var lookup: array[4, char] = ['A', 'C', 'G', 'T'] 190 | var mask: uint64 = 3 191 | var i: uint8 = 0 192 | var tmp: uint64 = 0 193 | var offset: uint64 = 0 194 | 195 | var dna = newDna(k.int) 196 | i = 0 197 | while i < k: 198 | if strand == forward: 199 | offset = (k - i - 1) * 2 200 | tmp = kmer >> offset 201 | dna[i.int] = lookup[mask and tmp] 202 | else: 203 | offset = i * 2 204 | tmp = kmer >> offset 205 | dna[i.int] = lookup[mask and not tmp] 206 | inc(i) 207 | 208 | return dna 209 | 210 | proc nkmers*(pot: pot_t): int = 211 | return len(pot.seeds) 212 | 213 | ## Prints the pot structure to STDOUT 214 | ## @param pot a ref to the pot 215 | # 216 | proc print_pot*(pot: pot_t) = 217 | var i: int = 0 218 | 219 | while i < pot.seeds.len(): 220 | let dna = bin_to_dna(pot.seeds[i].kmer, pot.word_size, 221 | pot.seeds[i].strand) 222 | echo format("pos:$# strand:$# seq:$# bin:$#", 223 | pot.seeds[i].pos, pot.seeds[i].strand, dna, pot.seeds[i].kmer) 224 | inc(i, 1) 225 | 226 | proc get_dnas*(pot: pot_t): seq[Dna] = 227 | for i in 0 ..< pot.seeds.len(): 228 | let dna = bin_to_dna(pot.seeds[i].kmer, pot.word_size, 229 | pot.seeds[i].strand) 230 | result.add(dna) 231 | 232 | proc cmp_seeds(a, b: seed_t): int = 233 | let c = a.kmer 234 | let d = b.kmer 235 | 236 | if c < d: 237 | return -1 238 | 239 | if c == d: 240 | if a.pos < b.pos: 241 | return -1 242 | else: 243 | return 0 244 | 245 | return 1 246 | 247 | # Actual implementation, private. 248 | # 249 | proc make_searchable(seeds: var seq[seed_t]; ht: var tables.TableRef[Bin, int]) = 250 | seeds.sort(cmp_seeds) 251 | ht = newTable[Bin, int]() 252 | #let dups = sets.initHashSet[Bin]() 253 | var ndups = 0 254 | 255 | var i: int = 0 256 | while i < seeds.len(): 257 | let key = seeds[i].kmer 258 | if ht.hasKeyOrPut(key, i): 259 | ndups += 1 260 | #echo format("WARNING: Duplicate seed $# @$#, not re-adding @$#", 261 | # key, i, ht[key]) 262 | inc(i) 263 | #[ 264 | if ndups > 0: 265 | echo format("WARNING: $# duplicates in kmer table", ndups) 266 | 267 | ]# 268 | 269 | ## Construct searchable-pot from pot. 270 | ## Move construct seeds (i.e. original is emptied). 271 | ## 272 | ## Sort the seeds and load the kmers into a hash table. 273 | ## For any dups, the table refers to the first seed with that kmer. 274 | # 275 | proc initSpot*(kms: var pot_t): spot_t = 276 | new(result) 277 | result.word_size = kms.word_size 278 | shallowCopy(result.seeds, kms.seeds) 279 | #kms.seeds = @[] 280 | kms = nil # simpler, obvious move-construction 281 | make_searchable(result.seeds, result.ht) 282 | 283 | ## Check for the presence or absence of a kmer in a 284 | ## pot regardless of the position. 285 | ## @param pot_t * - a pointer to a pot_t 286 | ## @return false if kmer doesn't exist 287 | # 288 | proc haskmer*(target: spot_t; query: Bin): bool = 289 | if target.ht.hasKey(query): 290 | return true 291 | return false 292 | 293 | ## Counts the number of shared unique kmers 294 | ## @param pot_t * - a pointer to a pot_t 295 | ## @param pot_t * - a pointer to a pot_t 296 | ## @return int - number of shared kmers 297 | # 298 | proc uniqueShared*(a, b: spot_t): int = 299 | result = 0 300 | 301 | for k in a.ht.keys(): 302 | if(haskmer(b, k)): 303 | inc(result) 304 | 305 | ## Find (target - remove), without altering target. 306 | # 307 | proc difference*(target: pot_t; remove: spot_t): pot_t = 308 | new(result) 309 | result.word_size = target.word_size 310 | 311 | var kmer_stack = newSeq[seed_t]() 312 | 313 | for i in 0 ..< target.seeds.len(): 314 | if(not haskmer(remove, target.seeds[i].kmer)): 315 | kmer_stack.add(target.seeds[i]) 316 | 317 | result.seeds = kmer_stack 318 | 319 | ## Return the seeds in the intersection of target and query. 320 | # 321 | proc search*(target: spot_t; query: pot_t): deques.Deque[seed_pair_t] = 322 | echo format("Searching through $# kmers", query.seeds.len()) 323 | var hit_stack = deques.initDeque[seed_pair_t](128) 324 | var hit: seed_pair_t 325 | var hit_index: int 326 | 327 | var i: int = 0 328 | #echo format("target.ht=$#", target.ht) 329 | #echo format("query.ht=$#", query.ht) 330 | while i < query.seeds.len(): 331 | let key = query.seeds[i].kmer 332 | if key in target.ht: 333 | hit_index = target.ht[key] 334 | #echo format("For $# ($#), ql=$# tl=$#, hit_index=$#", i, key, query.seeds.len(), target.seeds.len(), hit_index) 335 | while (hit_index < target.seeds.len() and key == target.seeds[ 336 | hit_index].kmer): 337 | #echo format("--For $# ($#), ql=$# tl=$#, hit_index=$#", i, key, query.seeds.len(), target.seeds.len(), hit_index) 338 | hit.a = query.seeds[i] 339 | hit.b = target.seeds[hit_index] 340 | deques.addLast(hit_stack, hit) 341 | inc(hit_index, 1) 342 | inc(i) 343 | 344 | return hit_stack 345 | 346 | ## This function counts the number of uniq kmers in the pot if searchable if not 347 | ## the function calls make searchable. 348 | ## @param pot_t - a ref to a pot_t 349 | ## TODO: add test coverage 350 | # 351 | proc nuniq*(pot: spot_t): int = 352 | return len(pot.ht) 353 | 354 | proc spacing_kmer*(pot: pot_t; space: int): pot_t = 355 | #doAssert(space > int(pot.word_size)) # typical, but not necessary 356 | doAssert(pot.word_size <= 16) 357 | 358 | new(result) #default return knwos the type from function header 359 | result.word_size = pot.word_size*2 360 | 361 | for i in (0 ..< pot.seeds.len - 2*(space + pot.word_size.int)): 362 | let j = i + 2*(space + pot.word_size.int) 363 | assert(j < pot.seeds.len) 364 | 365 | let k1 = pot.seeds[i] 366 | let k2 = pot.seeds[j] 367 | assert(k1.strand == k2.strand) 368 | 369 | # new kmer 370 | var k: seed_t 371 | let (left, right) = if k1.strand == forward: 372 | (k1, k2) 373 | else: 374 | (k2, k1) 375 | k.kmer = left.kmer 376 | k.kmer = k.kmer << 2*pot.word_size 377 | k.kmer = k.kmer or right.kmer 378 | k.strand = left.strand 379 | k.pos = left.pos 380 | result.seeds.add(k) 381 | -------------------------------------------------------------------------------- /src/nibpkg/read.nim: -------------------------------------------------------------------------------- 1 | import ./kmers 2 | import tables 3 | export tables 4 | import ./svidx 5 | 6 | type Read* = object 7 | ## key of svid, count of supporting kmers 8 | compatible_SVs*: CountTable[uint32] 9 | 10 | proc process_read*(s: string, idx: SvIndex, k: int = 25, spacedSeeds: bool = false, space: int = 50): Read = 11 | # find SVs with kmers intersecting with those from this read. 12 | var kmers = Dna(s).dna_to_kmers(k) 13 | if(spacedSeeds): 14 | kmers = spacing_kmer(kmers, space) 15 | for kmer in kmers.seeds: 16 | var matching_svs = idx.lookupKmer(kmer) 17 | for svId in matching_svs: 18 | result.compatible_SVs.inc(svId) 19 | 20 | 21 | proc filter_read_matches*(read: var Read, min_matches: int = 2, winner_takes_all: bool = false) = 22 | ## track sv with most kmer matches 23 | var removables: seq[uint32] 24 | var max_sv = int.high 25 | var max_kcnt = 0 26 | for sv, kcnt in read.compatible_SVs: 27 | if kcnt < min_matches: 28 | removables.add(sv) 29 | if kcnt > max_kcnt: 30 | max_sv = sv.int 31 | max_kcnt = kcnt 32 | 33 | if winner_takes_all: 34 | clear(read.compatible_SVs) 35 | read.compatible_SVs.inc(max_sv.uint32, max_kcnt) 36 | else: 37 | for r in removables: 38 | read.compatible_SVs.del(r) 39 | -------------------------------------------------------------------------------- /src/nibpkg/refmers.nim: -------------------------------------------------------------------------------- 1 | # vim: sw=4 ts=4 sts=4 tw=0 et: 2 | import hts 3 | import kmers 4 | import tables 5 | import svidx 6 | 7 | type 8 | Chunk = object 9 | chrom_name: string 10 | chrom_start: int 11 | chrom_end: int 12 | 13 | 14 | iterator createdChunks(fai: Fai, chunk_size: int): Chunk = 15 | for i in 0.. 0. (Try 50.) 28 | var convertedKmers: pot_t = dna_to_kmers(full_sequence, kmer_size) 29 | if space > 0: 30 | convertedKmers = spacing_kmer(convertedKmers, space) 31 | #for seed in convertedKmers.seeds: 32 | # echo "btd:", bin_to_dna(seed.kmer, convertedKmers.word_size, seed.strand), ' ', seed.kmer 33 | 34 | for km in convertedKmers.seeds: 35 | if km.kmer in svKmers.counts: 36 | svKmers.counts[km.kmer].refCount.inc 37 | 38 | proc updateChunk(svKmers: var SvIndex, fai: Fai, chunk: Chunk, kmer_size: int, space: int = 0) = 39 | var sub_seq = fai.get(chunk.chrom_name, chunk.chrom_start, chunk.chrom_end) 40 | addRefCount(svKmers, sub_seq, kmer_size, space) 41 | 42 | proc updateSvIndex*(input_ref_fn: string, svKmers: var SvIndex, kmer_size: int = 25, chunk_size: int = 1_000_000, space: int = 0) = 43 | ## Walk over reference sequences and count kmers. 44 | ## Update any existing svIdx entries with these counts. 45 | ## Use spaced-seeds if space > 0. (Try 50.) 46 | var fai: Fai 47 | if not fai.open(input_ref_fn): 48 | quit "couldn't open fasta" 49 | 50 | for i in createdChunks(fai, chunk_size): 51 | echo " chunk i=", i 52 | updateChunk(svKmers, fai, i, kmer_size, space) 53 | 54 | when isMainModule: 55 | import hts 56 | var fai: Fai 57 | import times 58 | 59 | if not fai.open("/data/human/g1k_v37_decoy.fa"): 60 | quit "bad" 61 | 62 | var s = fai.get("22") 63 | var svkmers: svIdx 64 | new(svkmers) 65 | echo "starting" 66 | for i in countup(0, 100_000_000, 10): 67 | svkmers[i.uint64] = (0'u32, 0'u32, newSeq[uint32]()) 68 | 69 | var t0 = cpuTime() 70 | svKmers.addRefCount(s) 71 | echo "time:", cpuTime() - t0 72 | -------------------------------------------------------------------------------- /src/nibpkg/reporter.nim: -------------------------------------------------------------------------------- 1 | import tables 2 | import hts 3 | import svidx 4 | 5 | 6 | ## N.B.: Add a function that takes a BAM path and returns the sample name 7 | ## 8 | ## TODO: Add a function that handles genotypes using the svIdx's ref/alt count fields. 9 | proc report*(vcf_name : string, sv_read_supports : CountTableRef[uint32], sv_index : SvIndex, sample_name : string="SAMPLE") = 10 | ## Query SV supports for each SV in a VCF, appending the sample name to a field in the INFO fileds if 11 | ## the SV is present in the sample (i.e., SV support count > 1) 12 | var variants:VCF 13 | doAssert open(variants, vcf_name) 14 | echo "Writing report to output.vcf" 15 | 16 | var sv_to_kmer = initTable[uint32, seq[uint64]]() 17 | for kmer, support in sv_index.counts: 18 | doAssert(support.svs.len != 0) 19 | for svId in support.svs: 20 | var a = sv_to_kmer.getOrDefault(svId) 21 | a.add(kmer) 22 | sv_to_kmer[svId] = a 23 | 24 | var outputVCF:VCF 25 | doAssert open(outputVCF, "output.vcf", "w") 26 | ## Note: this will overwrite the existing entry if any exist in the VCF 27 | discard variants.header.add_info("NIB_SAMPLES_WITH_SV", ".", "String", "Sample name is present if SV is present in sample.") 28 | discard variants.header.add_info("NIB_READ_SUPPORTS", ".", "Integer", "The number of reads supporting a given SV.") 29 | discard variants.header.add_info("NIB_SV_REF_KMERIDX_COUNT", "1", "Integer", "Number of REF kmers in SV index for SV.") 30 | discard variants.header.add_info("NIB_SV_ALT_KMERIDX_COUNT", "1", "Integer", "Number of ALT kmers in SV index for SV.") 31 | 32 | #discard variants.header.add_info("NIB_ALT_SUPPORTS", ".", "Integer", "The number of reads supporting a given SV alt.") 33 | outputVCF.copy_header(variants.header) 34 | discard outputVCF.write_header() 35 | 36 | var sample_name = sample_name 37 | var sv_id :uint32= 0 38 | for v in variants: 39 | var sv_support_count = sv_read_supports.getOrDefault(sv_id, -1) 40 | var sv_ref_k_count = 0 41 | var sv_alt_k_count = 0 42 | for km in sv_to_kmer.getOrDefault(sv_id): 43 | sv_ref_k_count += sv_index.counts[km].refCount.int 44 | sv_alt_k_count += sv_index.counts[km].altCount.int 45 | 46 | doAssert v.info.set("NIB_SV_REF_KMERIDX_COUNT", sv_ref_k_count) == Status.OK 47 | doAssert v.info.set("NIB_SV_ALT_KMERIDX_COUNT", sv_alt_k_count) == Status.OK 48 | if sv_support_count > 0: 49 | doAssert v.info.set("NIB_SAMPLES_WITH_SV", sample_name) == Status.OK 50 | doAssert v.info.set("NIB_READ_SUPPORTS", sv_support_count) == Status.OK 51 | 52 | 53 | doAssert outputVCF.write_variant(v) 54 | 55 | sv_id.inc 56 | 57 | close(outputVCF) 58 | close(variants) 59 | -------------------------------------------------------------------------------- /src/nibpkg/svidx.nim: -------------------------------------------------------------------------------- 1 | # vim: sw=4 ts=4 sts=4 tw=0 et: 2 | import tables 3 | from strutils import nil 4 | from strformat import fmt 5 | import msgpack4nim, streams, json 6 | import ./kmers 7 | 8 | type 9 | #SvValue* = tuple[refCount: uint32, altCount: uint32, svs: seq[uint32]] 10 | SvValue* = object 11 | refCount*: uint32 12 | altCount*: uint32 13 | svs*: seq[uint32] 14 | 15 | ## A map from KMER ID -> (number of time kmer appears in a ref seq, number of times kmer appears in an alt seq, list(SVs) that kmer is contained in ) 16 | #svIdx* = TableRef[uint64, SvValue] 17 | SvIndex* = object 18 | counts*: Table[uint64, SvValue] 19 | kmerSize*: uint8 20 | 21 | proc len*(idx: SvIndex): int = 22 | return idx.counts.len 23 | 24 | 25 | #Cost savings on allocations? 26 | var empty: seq[uint32] 27 | 28 | proc lookupKmer*(idx: SvIndex, kmer: seed_t): seq[uint32] {.noInit.} = 29 | if kmer.kmer in idx.counts: 30 | return idx.counts[kmer.kmer].svs 31 | return empty 32 | 33 | proc dumpIndexToFile*(idx: SvIndex, fn: string) = 34 | let strm = openFileStream(fn, fmWrite) 35 | strm.pack(idx) 36 | strm.close() 37 | 38 | proc loadIndexFromFile*(fn: string, kmerSize: int): SvIndex = 39 | let strm = openFileStream(fn, fmRead) 40 | strm.unpack(result) 41 | strm.close() 42 | if kmerSize != result.kmerSize.int: 43 | echo "ERROR: Inconsistent SvIndex file '{fn}'\nkmerSize={kmerSize} != SvIndex.kmerSize={result.kmerSize}".fmt 44 | doAssert(kmerSize == result.kmerSize.int) 45 | 46 | proc `%`(idx: SvIndex): JsonNode = 47 | result = json.newJObject() 48 | result["kmerSize"] = %idx.kmerSize 49 | result["counts"] = json.newJObject() 50 | for k, v in idx.counts.pairs(): 51 | let val = SvValue(refCount: v.refCount, altCount: v.altCount, svs: v.svs) 52 | result["counts"][$k] = %val 53 | 54 | proc dumpIndexToJson*(idx: SvIndex): string = 55 | return json.pretty(%idx) 56 | 57 | proc loadIndexFromJson*(js: string): SvIndex = 58 | ## This painful method might become simple if SvIndex values 59 | ## switched from tuple to object. 60 | let j = json.parseJson(js) 61 | result.kmerSize = j["kmerSize"].getInt().uint8 62 | for key, val in j["counts"]: 63 | let k: uint64 = strutils.parseBiggestUint(key) 64 | let v = json.to(val, SvValue) 65 | result.counts[k] = v 66 | 67 | proc insert*(idx: var SvIndex, sequence: string, k: int, sv_idx: int = -1, space: int = 0) = 68 | ## when inserting reference sequences leave sv_idx as -1 69 | #doAssert(k == idx.kmerSize.int); 70 | var l = Dna(sequence).dna_to_kmers(k.int) 71 | if space > 0: 72 | l = spacing_kmer(l, space) 73 | 74 | # inserting alternates 75 | if sv_idx >= 0: 76 | for kmer in l.seeds: 77 | var kc = idx.counts.getOrDefault(kmer.kmer) 78 | kc.altCount.inc 79 | kc.svs.add(sv_idx.uint32) 80 | idx.counts[kmer.kmer] = kc 81 | 82 | return 83 | 84 | # inserting reference counts iff the kmer was already found as alternate. 85 | for kmer in l.seeds: 86 | # note: sometimes doing double lookup. 87 | if kmer.kmer notin idx.counts: continue 88 | idx.counts[kmer.kmer].refCount.inc 89 | 90 | proc filterRefKmers*(svKmers: var SvIndex, maxRefCount: uint32) = 91 | ## Remove entries in the SV index that have a ref count higher than specified 92 | echo "before:", svKmers.len, " maxRefCount:", maxRefCount 93 | var toRemove: seq[uint64] 94 | for k, v in pairs(svKmers.counts): 95 | if v.refCount > maxRefCount: 96 | toRemove.add(k) 97 | for k in toRemove: 98 | svKmers.counts.del(k) 99 | echo "after:", svKmers.len, " maxRefCount:", maxRefCount 100 | -------------------------------------------------------------------------------- /src/nibpkg/util.nim: -------------------------------------------------------------------------------- 1 | # vim: sts=4:ts=4:sw=4:et:tw=0 2 | #from cpuinfo import nil 3 | from math import nil 4 | from os import nil 5 | #from threadpool import nil 6 | from streams import nil 7 | from strformat import fmt 8 | from strutils import nil 9 | import heapqueue 10 | import osproc 11 | import times 12 | 13 | type PbError* = object of CatchableError 14 | type GenomeCoverageError* = object of PbError 15 | type FieldTooLongError* = object of PbError 16 | type TooFewFieldsError* = object of PbError 17 | 18 | proc raiseEx*(msg: string) {.discardable.} = 19 | raise newException(PbError, msg) 20 | 21 | proc isEmptyFile*(fn: string): bool = 22 | var finfo = os.getFileInfo(fn) 23 | if finfo.size == 0: 24 | return true 25 | return false 26 | 27 | #from strformat import fmt 28 | proc isOlderFile*(afn, bfn: string): bool = 29 | ## Return true iff afn is older than bnf. 30 | let 31 | at = os.getLastModificationTime(afn) 32 | bt = os.getLastModificationTime(bfn) 33 | #af = at.format("yyyy-MM-dd'T'HH:mm:ss,ffffffzzz") 34 | #bf = bt.format("yyyy-MM-dd'T'HH:mm:ss,ffffffzzz") 35 | #echo "glmt {afn}: {af}, {bfn}: {bf}".fmt 36 | return at < bt 37 | 38 | template withcd*(newdir: string, statements: untyped) = 39 | let olddir = os.getCurrentDir() 40 | os.setCurrentDir(newdir) 41 | defer: os.setCurrentDir(olddir) 42 | statements 43 | 44 | proc log*(words: varargs[string, `$`]) = 45 | for word in words: 46 | write(stderr, word) 47 | write(stderr, '\l') 48 | 49 | proc logt*(words: varargs[string, `$`]) = 50 | var then {.global.} = times.now() 51 | let 52 | since = times.initDuration(seconds = times.inSeconds(times.now() - then)) 53 | dp = times.toParts(since) 54 | prefix = strformat.fmt("{dp[Hours]}:{dp[Minutes]:02d}:{dp[Seconds]:02d}s ") 55 | write(stderr, prefix) 56 | log(words) 57 | 58 | proc adjustThreadPool*(n: int) = 59 | ## n==0 => use ncpus 60 | ## n==-1 => do not alter threadpool size (to avoid a weird problem for now) 61 | log("(ThreadPool is currently not used.)") 62 | #var size = n 63 | #if n == 0: 64 | # size = cpuinfo.countProcessors() 65 | #if size > threadpool.MaxThreadPoolSize: 66 | # size = threadpool.MaxThreadPoolSize 67 | #if size == -1: 68 | # log("ThreadPoolsize=", size, 69 | # " (i.e. do not change)", 70 | # ", MaxThreadPoolSize=", threadpool.MaxThreadPoolSize, 71 | # ", NumCpus=", cpuinfo.countProcessors()) 72 | # return 73 | #log("ThreadPoolsize=", size, 74 | # ", MaxThreadPoolSize=", threadpool.MaxThreadPoolSize, 75 | # ", NumCpus=", cpuinfo.countProcessors()) 76 | #threadpool.setMaxPoolSize(size) 77 | 78 | iterator walk*(dir: string, followlinks = false, relative = false): string = 79 | ## similar to python os.walk(), but always topdown and no "onerror" 80 | # Slow! 30x slower than Unix find. 81 | let followFilter = if followLinks: {os.pcDir, os.pcLinkToDir} else: {os.pcDir} 82 | let yieldFilter = {os.pcFile, os.pcLinkToFile} 83 | for p in os.walkDirRec(dir, yieldFilter = yieldFilter, 84 | followFilter = followFilter, relative = relative): 85 | yield p 86 | 87 | iterator readProc*(cmd: string): string = 88 | ## Stream from Unix subprocess, e.g. "find .". 89 | ## But if cmd=="-", stream directly from stdin. 90 | if cmd == "-": 91 | log("Reading from stdin...") 92 | for line in lines(stdin): 93 | yield line 94 | else: 95 | log("Reading from '" & cmd & "'...") 96 | var p = osproc.startProcess(cmd, options = {poEvalCommand}) 97 | if osproc.peekExitCode(p) > 0: 98 | let msg = "Immedate failure in readProc startProcess('" & cmd & "')" 99 | raiseEx(msg) 100 | defer: osproc.close(p) 101 | for line in streams.lines(osproc.outputStream(p)): 102 | yield line 103 | 104 | iterator readProcInMemory(cmd: string): string = 105 | ## Read from Unix subprocess, e.g. "find .", into memory. 106 | ## But if cmd=="-", stream directly from stdin. 107 | if cmd == "-": 108 | log("Reading from stdin...") 109 | for line in lines(stdin): 110 | yield line 111 | else: 112 | log("Reading from '" & cmd & "'...") 113 | let found = osproc.execProcess(cmd, options = {poEvalCommand}) 114 | var sin = streams.newStringStream(found) 115 | for line in streams.lines(sin): 116 | yield line 117 | 118 | proc removeFile*(fn: string, failIfMissing = false) = 119 | if failIfMissing and not os.fileExists(fn): 120 | raiseEx("Cannot remove non-existent file '" & fn & "'") 121 | log("rm -f ", fn) 122 | os.removeFile(fn) 123 | 124 | proc removeFiles*(fns: openarray[string], failIfMissing = false) = 125 | for fn in fns: 126 | removeFile(fn, failIfMissing) 127 | 128 | proc which*(exe: string) = 129 | let cmd = "which " & exe 130 | log(cmd) 131 | discard execCmd(cmd) 132 | 133 | proc thousands*(v: SomeInteger): string = 134 | if v == 0: 135 | return "0" 136 | var i: type(v) = v 137 | let negative = (i < 0) 138 | i = abs(i) 139 | #result = strformat.fmt"{i mod 1000:03}" 140 | #i = i div 1000 141 | while i > 0: 142 | result = strformat.fmt"{i mod 1000:03}," & result 143 | i = i div 1000 144 | # Drop tailing comma. 145 | assert result[^1] == ',' 146 | result = result[0 .. ^2] 147 | # Drop leading 0s. 148 | while result[0] == '0': 149 | result = result[1 .. ^1] 150 | if negative: 151 | result = '-' & result 152 | 153 | proc splitWeighted*(n: int, sizes: seq[int]): seq[int] = 154 | # Split sizes into n contiguous subsets, weighted by each size. 155 | # Each elem of result will represent a range of elems of sizes. 156 | # len(result) will be <= n 157 | 158 | if n == 0: 159 | return 160 | var sums: seq[int] 161 | var totalSize = math.sum(sizes) 162 | var remSize = totalSize 163 | var curr = 0 164 | var remN = min(n, len(sizes)) 165 | while len(sizes) > curr: 166 | #assert len(sizes) > curr, "not enough elements in sizes {len(sizes)} <= {curr}".fmt 167 | result.add(0) 168 | let approx = int(math.ceil(remSize / remN)) 169 | #echo "approx={approx}, remaining={remN}, tot={remSize}".fmt 170 | sums.add(0) 171 | while sums[^1] < approx: 172 | result[^1] += 1 173 | sums[^1] += sizes[curr] 174 | curr += 1 175 | remN -= 1 176 | remSize -= sums[^1] 177 | assert math.sum(result) == len(sizes) 178 | assert math.sum(sizes) == totalSize 179 | assert len(result) <= n 180 | 181 | type 182 | BinSum = object 183 | indices: seq[int] 184 | sum: int64 185 | order: int 186 | WeightedIndex = tuple[index: int, size: int] 187 | 188 | proc `<`(a, b: BinSum): bool = 189 | return a.sum < b.sum or (a.sum == b.sum and a.indices.len() < b.indices.len()) or 190 | (a.sum == b.sum and a.indices.len() == b.indices.len() and a.order < b.order) 191 | proc `<`(a, b: WeightedIndex): bool = 192 | return a.size > b.size or (a.size == b.size and a.index > b.index) 193 | 194 | proc partitionWeighted*(n: int, sizes: seq[int]): seq[seq[int]] = 195 | ## {sizes} is an index; other seqs refer to its indices. 196 | ## The splits for this version are not required to be contiguous. 197 | ## The result has at most n index-seqs, none of which are empty. 198 | var biggest = initHeapQueue[WeightedIndex]() 199 | for i in 0 ..< len(sizes): 200 | let wi: WeightedIndex = (index: i, size: sizes[i]) 201 | biggest.push(wi) 202 | var smallest_bin = initHeapQueue[BinSum]() 203 | for x in 0 ..< n: 204 | var bin: BinSum = BinSum(sum: 0, order: x) 205 | smallest_bin.push(bin) 206 | while biggest.len() > 0: 207 | let wi = biggest.pop() 208 | var bin = smallest_bin.pop() 209 | bin.indices.add(wi.index) 210 | bin.sum += wi.size 211 | smallest_bin.push(bin) 212 | while smallest_bin.len() > 0: 213 | let bin = smallest_bin.pop() 214 | if bin.indices.len() > 0: 215 | result.add(bin.indices) 216 | return result 217 | 218 | proc combineToTarget*(target: int64, weights: seq[int64]): seq[seq[int]] = 219 | # Given a seq of weights, 220 | # combine consecutive groups of them until they meet target. 221 | # Return a seq of seqs of those indices. For now, 222 | # the results will always be consecutive, e.g. 223 | # [ [0,1,2], [2,3], [4], [5,6] ] 224 | var 225 | total = target 226 | n = -1 227 | for i in 0 ..< len(weights): 228 | let next_weight = weights[i] 229 | #echo "i:{i} next:{next_weight} total:{total} n:{n}".fmt 230 | if total >= target: 231 | # new group 232 | result.add(@[i]) 233 | n = len(result) - 1 234 | total = next_weight 235 | else: 236 | # current group 237 | result[n].add(i) 238 | total += next_weight 239 | 240 | const 241 | MAX_HEADROOM* = 1024 242 | type 243 | Headroom* = array[MAX_HEADROOM, cchar] 244 | 245 | proc sscanf*(s: cstring, frmt: cstring): cint {.varargs, importc, 246 | header: "".} 247 | 248 | proc strlen(s: cstring): cint {.importc: "strlen", nodecl.} 249 | 250 | proc strlen(a: var Headroom): int = 251 | let n = strlen(cast[cstring](addr a)) 252 | return n 253 | 254 | proc toString*(ins: var Headroom, outs: var string, source: string = "") = 255 | var n = strlen(ins) 256 | if n >= (MAX_HEADROOM - 1): 257 | # Why is max-1 illegal? B/c this is used after sscanf, and that has no way to report 258 | # a buffer-overflow. So a 0 at end-of-buffer is considered too long. 259 | let msg = strformat.fmt"Too many characters in substring (>{MAX_HEADROOM - 1}) from '{source}'" 260 | raise newException(util.FieldTooLongError, msg) 261 | outs.setLen(n) 262 | for i in 0 ..< n: 263 | outs[i] = ins[i] 264 | 265 | proc getNthWord*(line: string, n: Natural, delim: char): string = 266 | ## n is 0-based 267 | var 268 | start = 0 269 | count = 0 270 | found = -1 271 | while count < n: 272 | found = strutils.find(line, delim, start) 273 | if found == -1: 274 | let msg = "Found only {count} < {n} instances of '{delim}' in '{line}'".fmt 275 | raiseEx(msg) 276 | start = found + 1 277 | count += 1 278 | var wordEnd = strutils.find(line, delim, start) 279 | if wordEnd == -1: 280 | wordEnd = line.len() 281 | return line[start..(wordEnd-1)] 282 | -------------------------------------------------------------------------------- /src/nibpkg/welcome.nim: -------------------------------------------------------------------------------- 1 | # vim: sw=4 ts=4 sts=4 tw=0 et: 2 | 3 | proc getWelcomeMessage*(): string = 4 | "Hello, World!" 5 | -------------------------------------------------------------------------------- /src/nibsv.nim: -------------------------------------------------------------------------------- 1 | from nibpkg/compose import nil 2 | from nibpkg/classify import nil 3 | from nibpkg/captain import nil 4 | 5 | when isMainModule: 6 | import cligen 7 | dispatchMulti( 8 | [compose.compose_variants, cmdName = "compose"], 9 | [classify.buildSvIndex, cmdName = "lookup"], 10 | [classify.main_classify, cmdName = "classify"], 11 | [captain.main_runner, cmdName = "main", 12 | help={ 13 | "variants-fn": "long read VCF SV calls", 14 | "refSeq-fn": "reference genome FASTA, compressed OK", 15 | "reads-fn": "input short-reads in BAM/SAM/CRAM/FASTQ", 16 | "prefix" : "output prefix", 17 | "kmer-size" : "kmer size, for spaced seeds use <=16 otherwise <=32", 18 | "spaced-seeds" : "turn on spaced seeds", 19 | "space" : "width between spaced kmers", 20 | "flank" : "number of bases on either side of ALT/REF in VCF records", 21 | "max-ref-kmer-count" : "max number of reference kmers allowed in SV event" 22 | } 23 | ], 24 | ) 25 | -------------------------------------------------------------------------------- /test-data/GIAB-chr22.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##fileDate=2020-03-04T19:05:39.98Z 3 | ##source=pbsv 2.3.0 (commit v2.3.0) 4 | ##PG="pbsv call -j 16 -t DEL,INS,INV -m 20 -A 3 -O 3 --call-min-read-perc-one-sample 20 /pbi/dept/secondary/siv/references/human_GRCh38_no_alt_analysis_set/sequence/human_GRCh38_no_alt_analysis_set.fasta /pbi/dept/bifx/awenger/prj/giab/20200303_PacBio_pbsv/svsig/AJTrio_GRCh38.fofn /pbi/dept/bifx/awenger/prj/giab/20200303_PacBio_pbsv/vcf/AJTrio_GRCh38.pbsv.vcf" 5 | ##INFO= 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##ALT= 15 | ##ALT= 16 | ##ALT= 17 | ##FILTER= 18 | ##FILTER== 50 Ns) in the reference assembly"> 19 | ##FILTER= 20 | ##FILTER= 21 | ##FILTER= 22 | ##FORMAT= 23 | ##FORMAT= 24 | ##FORMAT= 25 | ##FORMAT= 26 | ##FORMAT= 27 | ##reference=file:///pbi/dept/secondary/siv/references/human_GRCh38_no_alt_analysis_set/sequence/human_GRCh38_no_alt_analysis_set.fasta 28 | ##contig= 29 | ##contig= 30 | ##contig= 31 | ##contig= 32 | ##contig= 33 | ##contig= 34 | ##contig= 35 | ##contig= 36 | ##contig= 37 | ##contig= 38 | ##contig= 39 | ##contig= 40 | ##contig= 41 | ##contig= 42 | ##contig= 43 | ##contig= 44 | ##contig= 45 | ##contig= 46 | ##contig= 47 | ##contig= 48 | ##contig= 49 | ##contig= 50 | ##contig= 51 | ##contig= 52 | ##contig= 53 | ##contig= 54 | ##contig= 55 | ##contig= 56 | ##contig= 57 | ##contig= 58 | ##contig= 59 | ##contig= 60 | ##contig= 61 | ##contig= 62 | ##contig= 63 | ##contig= 64 | ##contig= 65 | ##contig= 66 | ##contig= 67 | ##contig= 68 | ##contig= 69 | ##contig= 70 | ##contig= 71 | ##contig= 72 | ##contig= 73 | ##contig= 74 | ##contig= 75 | ##contig= 76 | ##contig= 77 | ##contig= 78 | ##contig= 79 | ##contig= 80 | ##contig= 81 | ##contig= 82 | ##contig= 83 | ##contig= 84 | ##contig= 85 | ##contig= 86 | ##contig= 87 | ##contig= 88 | ##contig= 89 | ##contig= 90 | ##contig= 91 | ##contig= 92 | ##contig= 93 | ##contig= 94 | ##contig= 95 | ##contig= 96 | ##contig= 97 | ##contig= 98 | ##contig= 99 | ##contig= 100 | ##contig= 101 | ##contig= 102 | ##contig= 103 | ##contig= 104 | ##contig= 105 | ##contig= 106 | ##contig= 107 | ##contig= 108 | ##contig= 109 | ##contig= 110 | ##contig= 111 | ##contig= 112 | ##contig= 113 | ##contig= 114 | ##contig= 115 | ##contig= 116 | ##contig= 117 | ##contig= 118 | ##contig= 119 | ##contig= 120 | ##contig= 121 | ##contig= 122 | ##contig= 123 | ##contig= 124 | ##contig= 125 | ##contig= 126 | ##contig= 127 | ##contig= 128 | ##contig= 129 | ##contig= 130 | ##contig= 131 | ##contig= 132 | ##contig= 133 | ##contig= 134 | ##contig= 135 | ##contig= 136 | ##contig= 137 | ##contig= 138 | ##contig= 139 | ##contig= 140 | ##contig= 141 | ##contig= 142 | ##contig= 143 | ##contig= 144 | ##contig= 145 | ##contig= 146 | ##contig= 147 | ##contig= 148 | ##contig= 149 | ##contig= 150 | ##contig= 151 | ##contig= 152 | ##contig= 153 | ##contig= 154 | ##contig= 155 | ##contig= 156 | ##contig= 157 | ##contig= 158 | ##contig= 159 | ##contig= 160 | ##contig= 161 | ##contig= 162 | ##contig= 163 | ##contig= 164 | ##contig= 165 | ##contig= 166 | ##contig= 167 | ##contig= 168 | ##contig= 169 | ##contig= 170 | ##contig= 171 | ##contig= 172 | ##contig= 173 | ##contig= 174 | ##contig= 175 | ##contig= 176 | ##contig= 177 | ##contig= 178 | ##contig= 179 | ##contig= 180 | ##contig= 181 | ##contig= 182 | ##contig= 183 | ##contig= 184 | ##contig= 185 | ##contig= 186 | ##contig= 187 | ##contig= 188 | ##contig= 189 | ##contig= 190 | ##contig= 191 | ##contig= 192 | ##contig= 193 | ##contig= 194 | ##contig= 195 | ##contig= 196 | ##contig= 197 | ##contig= 198 | ##contig= 199 | ##contig= 200 | ##contig= 201 | ##contig= 202 | ##contig= 203 | ##contig= 204 | ##contig= 205 | ##contig= 206 | ##contig= 207 | ##contig= 208 | ##contig= 209 | ##contig= 210 | ##contig= 211 | ##contig= 212 | ##contig= 213 | ##contig= 214 | ##contig= 215 | ##contig= 216 | ##contig= 217 | ##contig= 218 | ##contig= 219 | ##contig= 220 | ##contig= 221 | ##contig= 222 | ##contig= 223 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG002 HG003 HG004 224 | chr22 48910763 pbsv.DEL.66387 CCCCAGATTCTGAAATCTTTCATTGTGGTTGAAGTCTCCCCTCCCGA C . PASS SVTYPE=DEL;END=48910809;SVLEN=-46;SVANN=TANDEM GT:AD:DP 0/1:17,11:28 0/1:16,13:29 0/0:25,0:25 225 | -------------------------------------------------------------------------------- /test-data/GIAB_PBSV_TRIO_CALLS.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##fileDate=2020-03-04T19:05:39.98Z 3 | ##source=pbsv 2.3.0 (commit v2.3.0) 4 | ##PG="pbsv call -j 16 -t DEL,INS,INV -m 20 -A 3 -O 3 --call-min-read-perc-one-sample 20 /pbi/dept/secondary/siv/references/human_GRCh38_no_alt_analysis_set/sequence/human_GRCh38_no_alt_analysis_set.fasta /pbi/dept/bifx/awenger/prj/giab/20200303_PacBio_pbsv/svsig/AJTrio_GRCh38.fofn /pbi/dept/bifx/awenger/prj/giab/20200303_PacBio_pbsv/vcf/AJTrio_GRCh38.pbsv.vcf" 5 | ##INFO= 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##ALT= 15 | ##ALT= 16 | ##ALT= 17 | ##FILTER= 18 | ##FILTER== 50 Ns) in the reference assembly"> 19 | ##FILTER= 20 | ##FILTER= 21 | ##FILTER= 22 | ##FORMAT= 23 | ##FORMAT= 24 | ##FORMAT= 25 | ##FORMAT= 26 | ##FORMAT= 27 | ##reference=file:///pbi/dept/secondary/siv/references/human_GRCh38_no_alt_analysis_set/sequence/human_GRCh38_no_alt_analysis_set.fasta 28 | ##contig= 29 | ##contig= 30 | ##contig= 31 | ##contig= 32 | ##contig= 33 | ##contig= 34 | ##contig= 35 | ##contig= 36 | ##contig= 37 | ##contig= 38 | ##contig= 39 | ##contig= 40 | ##contig= 41 | ##contig= 42 | ##contig= 43 | ##contig= 44 | ##contig= 45 | ##contig= 46 | ##contig= 47 | ##contig= 48 | ##contig= 49 | ##contig= 50 | ##contig= 51 | ##contig= 52 | ##contig= 53 | ##contig= 54 | ##contig= 55 | ##contig= 56 | ##contig= 57 | ##contig= 58 | ##contig= 59 | ##contig= 60 | ##contig= 61 | ##contig= 62 | ##contig= 63 | ##contig= 64 | ##contig= 65 | ##contig= 66 | ##contig= 67 | ##contig= 68 | ##contig= 69 | ##contig= 70 | ##contig= 71 | ##contig= 72 | ##contig= 73 | ##contig= 74 | ##contig= 75 | ##contig= 76 | ##contig= 77 | ##contig= 78 | ##contig= 79 | ##contig= 80 | ##contig= 81 | ##contig= 82 | ##contig= 83 | ##contig= 84 | ##contig= 85 | ##contig= 86 | ##contig= 87 | ##contig= 88 | ##contig= 89 | ##contig= 90 | ##contig= 91 | ##contig= 92 | ##contig= 93 | ##contig= 94 | ##contig= 95 | ##contig= 96 | ##contig= 97 | ##contig= 98 | ##contig= 99 | ##contig= 100 | ##contig= 101 | ##contig= 102 | ##contig= 103 | ##contig= 104 | ##contig= 105 | ##contig= 106 | ##contig= 107 | ##contig= 108 | ##contig= 109 | ##contig= 110 | ##contig= 111 | ##contig= 112 | ##contig= 113 | ##contig= 114 | ##contig= 115 | ##contig= 116 | ##contig= 117 | ##contig= 118 | ##contig= 119 | ##contig= 120 | ##contig= 121 | ##contig= 122 | ##contig= 123 | ##contig= 124 | ##contig= 125 | ##contig= 126 | ##contig= 127 | ##contig= 128 | ##contig= 129 | ##contig= 130 | ##contig= 131 | ##contig= 132 | ##contig= 133 | ##contig= 134 | ##contig= 135 | ##contig= 136 | ##contig= 137 | ##contig= 138 | ##contig= 139 | ##contig= 140 | ##contig= 141 | ##contig= 142 | ##contig= 143 | ##contig= 144 | ##contig= 145 | ##contig= 146 | ##contig= 147 | ##contig= 148 | ##contig= 149 | ##contig= 150 | ##contig= 151 | ##contig= 152 | ##contig= 153 | ##contig= 154 | ##contig= 155 | ##contig= 156 | ##contig= 157 | ##contig= 158 | ##contig= 159 | ##contig= 160 | ##contig= 161 | ##contig= 162 | ##contig= 163 | ##contig= 164 | ##contig= 165 | ##contig= 166 | ##contig= 167 | ##contig= 168 | ##contig= 169 | ##contig= 170 | ##contig= 171 | ##contig= 172 | ##contig= 173 | ##contig= 174 | ##contig= 175 | ##contig= 176 | ##contig= 177 | ##contig= 178 | ##contig= 179 | ##contig= 180 | ##contig= 181 | ##contig= 182 | ##contig= 183 | ##contig= 184 | ##contig= 185 | ##contig= 186 | ##contig= 187 | ##contig= 188 | ##contig= 189 | ##contig= 190 | ##contig= 191 | ##contig= 192 | ##contig= 193 | ##contig= 194 | ##contig= 195 | ##contig= 196 | ##contig= 197 | ##contig= 198 | ##contig= 199 | ##contig= 200 | ##contig= 201 | ##contig= 202 | ##contig= 203 | ##contig= 204 | ##contig= 205 | ##contig= 206 | ##contig= 207 | ##contig= 208 | ##contig= 209 | ##contig= 210 | ##contig= 211 | ##contig= 212 | ##contig= 213 | ##contig= 214 | ##contig= 215 | ##contig= 216 | ##contig= 217 | ##contig= 218 | ##contig= 219 | ##contig= 220 | ##contig= 221 | ##contig= 222 | ##contig= 223 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG002 HG003 HG004 224 | chr1 48483892 pbsv.DEL.1413 TATATATAAAATATATATACACACATATATATAAAGTATATATATACACACATATATATAAAATATATATATACAC T . PASS SVTYPE=DEL;END=48483967;SVLEN=-75;SVANN=TANDEM GT:AD:DP 0/1:23,1:24 0/1:8,18:26 0/1:14,17:31 225 | chr1 143503208 pbsv.DEL.2833 AATGAAATCGTGAGATGATGAAATGATGAGATGAAGTGAAATGATGAAATGATGAAATGTGATGAAATGGAATGATGAAATGAAATGATGAAATGAAATTGTGAAATGAAATGAGGAAATGAAATGGAATGATGAAATGAAATGAAAAGATCAAATGGTGAAGTGAAGAAATGATATGAAATGATGAAATGAAATGAAATGAGGAAATGAAGTTAAATGATTAAATGATGAAATAATGAAATGAAATGAAATGATGAAATGATGAATTGATGAAATGATCAAATGAAATGACGAGATGAAAAGATGAAATGAAATGATGAAATGTAATGACGAGATGAAAAGATGAAATGAGATGAAATGATGAGATGAAATGAAATCATGAGATGATGAAATGATGAGATGAAGTGAAATGATGAAATGAAATGAAATGTTGAGATGAAATGATGAAATGAAATGAAAGAATGAAGTGAAATGATGAAATGAGATGAAATGAAATCATGAGATGAAATGATGAAATGATGAGATGAAGTGAAATGATGAAATGATGAAATGTAATGAAATGATGAAATGGAATGATGAAATGAAATGAGGAAATGAAATGGTGAAATGAAATGAGGAAATGAAATGATGAAATGAAATGAAGTGAAATGATGAAATGATGAAATGAAATGAGAAGATCAAATGGTGAAATGAAGAAATGATATGAAATGATGAAATGAAATGAAGTGATGAAATGAAGTTAAATGATTAAATGATGAAATAATGAAATGAAATGATGAAATGATGAATTGATGAAATGATCAAATGAAATGAGATGAAAAGATGAGATGAAATGAAATGATGAAATGAAATGACGAGATGAAAAGATGAAACGAGATGAAATGAG A . PASS SVTYPE=DEL;END=143504093;SVLEN=-885;SVANN=TANDEM GT:AD:DP 0/1:31,18:49 0/1:26,23:49 0/1:38,15:53 226 | chr1 163119059 pbsv.INS.3230 A AGGAAATTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGGGATCTCGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGACTACAGGCGCCCGCCACTACGCCCGGCTAATTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCGTTTTAGCCAGGATGGTCTCGATCTCCTGACCTCGTGATCTGCCCGCCTCGGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGCGCCCGGCC . PASS SVTYPE=INS;END=163119059;SVLEN=319;SVANN=ALU GT:AD:DP 0/1:12,15:27 0/1:14,16:30 0/0:31,0:31 227 | chr1 229870923 pbsv.INS.4539 C CACACACACAACACACAACACACAACATACACATAACAACACACATATATACACAACACACAGAACACATACACATAACACAACATATGCATACAATACAGGTAACAATATACATAAACAACACAGAAACATACATATAACACAACACATGTATACATACAATACACATGACATGCAACACACAACACACATGACACACACAGCACACAACACAATACATATGCCTACAACACACACCTGCACACATAACACAGGTGAAACACATGCAAACACATATACACTCAACACACATA . PASS SVTYPE=INS;END=229870923;SVLEN=292;SVANN=TANDEM GT:AD:DP 1/1:3,25:28 1/1:2,33:35 1/1:4,28:32 228 | chr2 57115594 pbsv.DEL.6721 GGCTAGTAGCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCCAGGTCGGACTGCGGACTGCAGTGGCGCAATCTCGGCTCACTGCAAGCTCCGCTTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCGCCCGCCACCGCGCCCGGCTAATTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCTTGTTAGCCAGGATGGTCTCGATCTCCTGACCTCATGATCCACCCGCCTCGGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGCGCCCGGCC G . PASS SVTYPE=DEL;END=57115921;SVLEN=-327;SVANN=TANDEM GT:AD:DP 0/0:27,0:27 0/1:22,12:34 0/1:17,13:30 229 | chr2 87203861 pbsv.DEL.7235 CTTTTCAGTATAGGATGGGGTATTGT C . PASS SVTYPE=DEL;END=87203886;SVLEN=-25;SVANN=TANDEM GT:AD:DP 1/1:0,25:25 1/1:0,26:26 1/1:0,29:29 230 | chr2 133702693 pbsv.DEL.8096 ACTCGTCTCTTTTTCTTTTTTTTTTTTTGAGACGGAGTCTTGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGGGATCTCGGCTCACTGCAAGCTCCGCCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGACTACAGGCGCCCGCCACTACGCCCGGCTAATTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCGTTTTAGCCGGGATGGTCTCGATCTCCTGACCTCGTGATCCGCCCGCCTCGGCCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGCGCCCGGC A . PASS SVTYPE=DEL;END=133703000;SVLEN=-307 GT:AD:DP 1/1:1,29:30 1/1:0,36:36 1/1:2,34:36 231 | chr2 209383417 pbsv.INS.9470 G GTGTGTGTGTGTGTGTGTGTGTGT . PASS IMPRECISE;SVTYPE=INS;END=209383417;SVLEN=23;SVANN=TANDEM GT:AD:DP 0/1:9,17:26 0/1:13,19:32 0/1:18,14:32 232 | chr2 213419058 pbsv.INS.9551 A ATTCTCTACTCAGAAACCTGAAACATCGGCATTATC . PASS SVTYPE=INS;END=213419058;SVLEN=35 GT:AD:DP 1/1:0,26:26 1/1:0,31:31 1/1:0,30:30 233 | chr2 227081515 pbsv.DEL.9849 TCAAAAGTAGGAAAGCCACTACTTTT T . PASS SVTYPE=DEL;END=227081540;SVLEN=-25 GT:AD:DP 0/1:27,17:44 0/1:30,21:51 0/0:33,0:33 234 | chr3 1842442 pbsv.INS.10580 T TAATCTGACTACGGAAAAACTGCCAGAGACAGGCAACATCTTTCCTGGCTATGGTGAGAGAGTAATCTGACTACGGAAAAACTGCCAGAGACAGGCAACATCTTTCCTGGCTATGGTGAGAGAGT . PASS SVTYPE=INS;END=1842442;SVLEN=124 GT:AD:DP 1/1:2,26:28 1/1:0,32:32 0/1:21,17:38 235 | chr3 60660979 pbsv.INS.11728 T TTTTTTTTTTTTTTTTTTTTTTTTTTTTT . PASS IMPRECISE;SVTYPE=INS;END=60660979;SVLEN=28 GT:AD:DP 1/1:0,26:26 1/1:0,28:28 1/1:0,33:33 236 | chr3 76141060 pbsv.INS.12028 C CATATATATATATATATATATATATATAT . PASS SVTYPE=INS;END=76141060;SVLEN=28;SVANN=TANDEM GT:AD:DP 0/1:14,15:29 0/0:31,0:31 1/1:1,28:29 237 | chr3 120413799 pbsv.INS.12779 C CTCTCCCCCTCCCCCTCCCCCTCCCCCTCCCTCTCCCCCTCCCTCTCCCTCTCCCCACGG . PASS SVTYPE=INS;END=120413799;SVLEN=59 GT:AD:DP 0/1:20,12:32 0/1:14,14:28 0/1:16,13:29 238 | chr3 154378944 pbsv.DEL.13416 TAAAAGTCCTTTATCGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGTGGATCATGAGGTCAGGAGATCAAGACCATCCTGGCTAACAAGGTGAAACCCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGCGGTGGCGGGCGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATGGCGTGAACCCGGGAAGCGGAGCTTGCAGTGAGCCGAGATTGCGCCACTGCAGTCCGCAGTCCGACCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAAAAAAAAAAAAAA T . PASS SVTYPE=DEL;END=154379264;SVLEN=-320;SVANN=TANDEM GT:AD:DP 1/1:0,27:27 1/1:1,30:31 1/1:1,33:34 239 | chr3 197592614 pbsv.DEL.14421 CTTCGTCCTTTACTCTTACTTTCCTGGAACTCACTTCCT C . PASS SVTYPE=DEL;END=197592652;SVLEN=-38;SVANN=TANDEM GT:AD:DP 0/1:10,12:22 0/1:19,12:31 1/1:0,32:32 240 | chr4 6067579 pbsv.INS.14816 G GCACTCAAGCTCTTCTGCACACACAGGTCACCCCCCTGAGCTCCAGGTT . PASS SVTYPE=INS;END=6067579;SVLEN=48;SVANN=TANDEM GT:AD:DP 0/1:17,10:27 0/1:17,12:29 0/0:37,0:37 241 | chr4 26028512 pbsv.INS.15344 G GGGTGATTTTACATAGAGAGGGGACGGGCCTGGGTGATTTTACATAGAGAGGGGACGGGCCTGATGATTTTATATAGAGAGGGGACGGGCCTGGGTGATTTTATATAGTGAGGGGACGGGCCTGGGTGATTTTATATAGAGAGGGGACGGGCCTGG . PASS SVTYPE=INS;END=26028512;SVLEN=155;SVANN=TANDEM GT:AD:DP 0/1:16,16:32 1/1:4,30:34 0/1:8,21:29 242 | chr4 66672213 pbsv.DEL.16219 TAAAAATGCCATTAAATTGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCCGGCTAAAACGGTGAAACCCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTAGTGGCGGGCGCCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCGGGAGAATGGCGTGAACCCGGGAGGCGGAGCTTGCAGTGAGCCGAGATCCCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAAAAAAAAAAAAAAA T . PASS SVTYPE=DEL;END=66672530;SVLEN=-317;SVANN=TANDEM GT:AD:DP 0/0:31,0:31 0/0:34,0:34 0/1:17,12:29 243 | chr4 114774914 pbsv.DEL.17079 TGATAGATAGATAGATGATAGATA T . PASS SVTYPE=DEL;END=114774937;SVLEN=-23;SVANN=TANDEM GT:AD:DP 1/1:1,25:26 0/1:20,13:33 1/1:0,35:35 244 | chr4 120926533 pbsv.DEL.17195 CTTATTTTATTTTATTTTATT C . PASS SVTYPE=DEL;END=120926553;SVLEN=-20;SVANN=TANDEM GT:AD:DP 1/1:0,33:33 0/1:18,16:34 1/1:0,29:29 245 | chr4 183045677 pbsv.INS.18469 A ACTGTGTCTAGCTCAAGGTTTGTAAATACACCAATCAGCACT . PASS SVTYPE=INS;END=183045677;SVLEN=41;SVANN=TANDEM GT:AD:DP 1/1:2,27:29 0/1:14,15:29 1/1:1,32:33 246 | chr4 188269003 pbsv.DEL.18657 ACTATGCCCAGGCTACCACCTTCAGCCTTCCACCCGCCTCCTCCCTCCTCCCTATGCCCAGGCTACCACCTTCAGCCTTCCACCCGCCTCCTCCCTCCTCCCTATGCCCAGGCTACCACCTTCAGCCTTCCACCCGCCTCCTCCCTCCTCCCTATGCCCAGGCTACCACCTTCAGCCTTCCACCCGCCTCCTCCCTCCTCCCTATGCCCAGGCTACCACCTTCAGCCTTCCACCCGCCTCCTCCCTCCTCCCTATGCCCAGGCTACCACCTTCAGCCTTCCACCCGCCTCCTCCCTCCTCCCTATGCCCAGGCTACCACCTTCAGCCTTCCACCCGCCTCCTCCCTCCTCC A . PASS SVTYPE=DEL;END=188269353;SVLEN=-350;SVANN=TANDEM GT:AD:DP 0/0:24,0:24 0/1:20,12:32 0/0:26,0:26 247 | chr5 711884 pbsv.INS.18963 C CCAGTACTGTGCTCCCATTTCCCAATACTGTGCTCCCATTTTGCAGTGCTGTAAGCCGTTTCCCAGTACTGTGCTCCCATTTCCCAGTACTGTGCTCCCAATTCCCAGTACTGTGCTCCCATTTCT . PASS SVTYPE=INS;END=711884;SVLEN=125;SVANN=TANDEM GT:AD:DP 0/1:16,18:34 0/1:16,12:28 1/1:5,33:38 248 | chr5 24091498 pbsv.DEL.19726 TAAGAAAACTGAGGCAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCCGGCTAAAACGGTGAAACCCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTAGTGGCGGGCGCCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCGTGAACCCGGGAGGCGGAGCTTGCAGTGAGCCGAGATTGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAA T . PASS SVTYPE=DEL;END=24091823;SVLEN=-325;SVANN=ALU GT:AD:DP 1/1:1,25:26 1/1:0,30:30 1/1:1,30:31 249 | chr5 84943797 pbsv.INS.20866 T TTTTTTTTTTTTTTTTTTTTTAATTTTTTTTTTTTTTTATTATACTCTAAGTTTTAGGGTACATGTGCACATTGTGCAGGTTAGTTACATATGTATACATGTGCCATGCTGGTGCGCTGCACCCACTAACGTGTCATCTAGCATTAGGTATATCTCCCAATGCTATCCCTCCCCCCTCCCCCAACCCCACCACAGTCCCCAGAGTGTGATATTCCCCTTCCTGTGTCCATGTGATCTCATTGTTCAATTCCCACCTATGAGTGAGAATATGCGGTGTTTGGTTTTTTGTTCTTGCGATAGTTTACTGAGAATGATGGTTTCCAATTTCATCCATGTCCCTACAAAGGACATGAACTCATCATTTTTTATGGCTGCATAGTATTCCATGGTGTATATGTGCCACATTTTCTTAATCCAGTCTATCATTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTGAATAGTGCCGCAATAAACATACGTGTGCATGTGTCTTTAGAGCAGCATGATTTATAGTCCTTTGGGTATATACCCAGTAATGGGATGGCTGGGTCAAATGGTATTTCTAGTTCTAGATCCCTGAGGAATCGCCACACTGACTTCCACAATGGTTGAACTAGTTTACAGTCCCACCAACAGTGTAAAAGTGTTCCTATTTCTCCACATCCTCTCCAGCACCTGTTGTTTCCTGACTTTTTAATGATTGCCATTCTAA . PASS SVTYPE=INS;END=84943797;SVLEN=721 GT:AD:DP 1/1:0,16:16 1/1:0,29:29 0/1:13,12:25 250 | chr5 97506478 pbsv.INS.21059 A ACTGCATTCCAGCCTGGGCGACAGAGCAAGACT . PASS SVTYPE=INS;END=97506478;SVLEN=32 GT:AD:DP 0/0:30,0:30 0/1:16,17:33 0/1:14,18:32 251 | chr5 175266581 pbsv.DEL.22468 AAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGA A . PASS SVTYPE=DEL;END=175266620;SVLEN=-39;SVANN=TANDEM GT:AD:DP 0/1:15,11:26 0/1:12,22:34 0/1:8,24:32 252 | chr6 29958054 pbsv.INS.23594 C CCACACATATTTACGCTGTTCAGAT . PASS SVTYPE=INS;END=29958054;SVLEN=24 GT:AD:DP 0/0:29,0:29 0/1:15,19:34 0/1:23,18:41 253 | chr6 32521799 pbsv.DEL.23702 TTCCTCTCTCTCTCACACACACACACACACA T . PASS SVTYPE=DEL;END=32521829;SVLEN=-30;SVANN=TANDEM GT:AD:DP 0/1:29,5:34 0/1:40,33:73 0/1:23,6:29 254 | chr6 32719658 pbsv.DEL.23745 AGTGTGTGTGTGTGTGTGTGTGTGTGTGT A . PASS SVTYPE=DEL;END=32719686;SVLEN=-28;SVANN=TANDEM GT:AD:DP 0/0:30,0:30 0/1:15,15:30 0/1:15,19:34 255 | chr6 33606234 pbsv.INS.23847 C CGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCCGGCTAAAACGGTGAAACCCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTAGTGGCGGGCGCCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCGTGAACCCGGGAGGCGGAGCTTGCAGTGAGCCGAGATCCCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAAAAAAAAAAGAGCTATACTGGCCGGGT . PASS SVTYPE=INS;END=33606234;SVLEN=305 GT:AD:DP 0/1:18,11:29 0/0:33,0:33 1/1:2,27:29 256 | chr6 63456346 pbsv.DEL.24426 AATATACATATATATATATATAT A . PASS SVTYPE=DEL;END=63456368;SVLEN=-22;SVANN=TANDEM GT:AD:DP 1/1:0,30:30 0/1:15,15:30 0/1:18,15:33 257 | chr6 67214962 pbsv.INS.24536 A ATATATATATTTATATATATAT . PASS SVTYPE=INS;END=67214962;SVLEN=21;SVANN=TANDEM GT:AD:DP 0/0:33,0:33 0/0:29,0:29 0/1:19,13:32 258 | chr6 130802673 pbsv.INS.25647 C CCAGGAGGGAGGTGGGGGGGTCAGCCCCCCGCCCGGCCAGCCACCCCATCCGGCAGGTGAGGGGTGCCTCTGCCTGGCCGCCCCTACTGGGAAGTGAGGAGCCCCTCTGCCCGGCCAGCCACCCCGTCCGGGAGGGAGGTGGGGGGGTCAGCCCCCCGCCCGGCCAGCCGCCCCGTCCGGGAGGGAGGTGGGGGGGTCAGCCCCCCGCCCGGCCAGCCACCCCGTT . PASS SVTYPE=INS;END=130802673;SVLEN=225;SVANN=TANDEM GT:AD:DP 0/0:31,0:31 0/0:32,0:32 0/1:13,16:29 259 | chr6 154383233 pbsv.INS.26090 G GTGAAAAAGCAACAAACATTTATTTTCAT . PASS SVTYPE=INS;END=154383233;SVLEN=28 GT:AD:DP 0/1:14,12:26 0/1:6,21:27 0/1:12,24:36 260 | chr6 167009115 pbsv.DEL.26495 GGCGCGGTGGCTCATGCCTGT G . PASS SVTYPE=DEL;END=167009135;SVLEN=-20;SVANN=TANDEM GT:AD:DP 0/1:13,14:27 0/1:13,10:23 0/1:16,18:34 261 | chr7 25602192 pbsv.INS.27758 A ACGTCATAAACAGTGATATGATTTATATGGTCAATAGTATCAATAAGTAGTATAGTCATA . PASS SVTYPE=INS;END=25602192;SVLEN=59;SVANN=TANDEM GT:AD:DP 0/1:14,13:27 1/1:4,28:32 0/0:32,0:32 262 | chr7 37848002 pbsv.INS.28007 G GAGGAAGAAGAGGAGGAAGAGGAGGAGGAGGAAGAGGAAGAGGAAGAAGAGGAAGAGGAAGAGGAAGAAGAAGAGAAAGAAGAAGAAGAAAAGAAGAAGAAAGAAGAAAAGAAGAGGAAAAGAAGAAGAAAAGAAGAAGAAAAGAAGAAGAAGAAGGAAGAAGAGGAAGAGGAAGAGGAAGAAGAAGAAGAAGAAAG . PASS SVTYPE=INS;END=37848002;SVLEN=196;SVANN=TANDEM GT:AD:DP 1/1:5,21:26 0/1:16,13:29 0/1:13,14:27 263 | chr7 39194536 pbsv.DEL.28037 ACTCTGTAAAGTGGACCAATCAGCG A . PASS SVTYPE=DEL;END=39194560;SVLEN=-24;SVANN=TANDEM GT:AD:DP 1/1:0,31:31 1/1:0,31:31 1/1:0,34:34 264 | chr7 65872985 pbsv.DEL.28591 GCCCATCCCCACCCCTGCCCCA G . PASS SVTYPE=DEL;END=65873006;SVLEN=-21 GT:AD:DP 0/1:11,13:24 0/0:34,0:34 0/1:19,13:32 265 | chr7 70871919 pbsv.INS.28765 G GGAAGGAAGGAAGGAAAGAAAGGAAG . PASS SVTYPE=INS;END=70871919;SVLEN=25;SVANN=TANDEM GT:AD:DP 0/0:23,0:23 0/1:13,7:20 0/0:21,0:21 266 | chr7 103160155 pbsv.DEL.29505 CCTTCACCATTCCCCAGTGCCCCTCACCATTTCCCAGTGTCCCTCACCATTTCCCAGTGCCCCTCACCATTTCCCAGTGCCCTTCACCATTCCCCAGTGCCCTTCACCATTTCCCAGTGCCCTTCACCATTTCCCAGTGCCCTTCACCATTTCCCAGTGCCCTTCACCATTTCCCAGTGCCCTTCACCATTTCCCAGTGCCCTTCACCATTTCCCAGTGCCCTTCACCATTCCCCAGTGTCCCTCACCATTTCCCAGTGCCTCTCACCATTCCCCAGTGCCCTTCACCATTCCCCAGTGCCTCTCACCATTCCCCAGTGCCCTTCACCATTTCCCAGTGCCCCTCACCATTCCCCAGTACCCTTCACCATTCCCCAGTGTCCCTCACCATTTCCCAGTGCCTCTCACCATTTCCCAGTGCCCTTCACCATTTCCCAGTGCCCTTCACCATTTCCCAGTGCCCTTCACCATTCCCCAGTGTCCCTCACCATTTCCCAGTGCCCTTCACCATTCCCCAGTGCCCTTCACCATTTCCCAGTGCCCTTCACCATTCCCCAGTGTCCCTCACCATTTCCCAGTGCCTCTCACCATTCCCCAGTGCCCTTCACCATTCCCCAGTGCCTCTCACCATTTCCCAGTGCCTCTCACCATTTCCCAGTGCCCTTCACCATTTCCCAGTGCT C . PASS SVTYPE=DEL;END=103160835;SVLEN=-680;SVANN=TANDEM GT:AD:DP 1/1:1,29:30 1/1:2,32:34 1/1:0,27:27 267 | chr7 157850245 pbsv.DEL.30762 ATAAGGGATCCCAGGTCTCCAAATTTCCGACATGGGGTGCCTCAGGCTCGCG A . PASS SVTYPE=DEL;END=157850296;SVLEN=-51;SVANN=TANDEM GT:AD:DP 0/0:32,0:32 0/1:14,16:30 0/0:27,0:27 268 | chr8 9313106 pbsv.INS.31568 T TTTTTTTTTTTTTTCTTTTTTCTTTTTCTTTC . PASS SVTYPE=INS;END=9313106;SVLEN=31;SVANN=TANDEM GT:AD:DP 0/1:11,13:24 1/1:2,29:31 0/0:30,0:30 269 | chr8 55911130 pbsv.INS.32494 C CATACATATATATACGTGTATATATATGTACATATATATACACGTATATATATACATATATATACGTGTATATATATGTACATATATATACACGTATATAT . PASS SVTYPE=INS;END=55911130;SVLEN=100;SVANN=TANDEM GT:AD:DP 0/0:31,0:31 0/1:16,14:30 0/0:32,0:32 270 | chr8 56272401 pbsv.INS.32496 G GGCTCCTCACTTCTCAGACGGGGCGGTCGGGCAGAGAC . PASS SVTYPE=INS;END=56272401;SVLEN=37;SVANN=TANDEM GT:AD:DP 0/1:9,15:24 0/1:10,17:27 0/0:24,0:24 271 | chr8 88236752 pbsv.DEL.33016 CAACAACAAACAGAAGACCAGCCCTATGATCAAAGCTTTTCACCTTCTCTCACCTTGCGATGACTCACTTTTCAGAGCTGGGCCACTGCTCATCTCACCTGTTCCAACATTTACTCATCAGCTACA C . PASS SVTYPE=DEL;END=88236877;SVLEN=-125 GT:AD:DP 0/0:29,0:29 0/1:19,17:36 0/1:17,15:32 272 | chr8 100317127 pbsv.INS.33231 G GCCGCACGCAGCCCAGCCCGGGTTC . PASS SVTYPE=INS;END=100317127;SVLEN=24 GT:AD:DP 0/0:30,0:30 0/1:13,16:29 0/0:37,0:37 273 | chr8 103587690 pbsv.INS.33284 A AAAAGAAAGAAAGAAAGAAAGAAA . PASS IMPRECISE;SVTYPE=INS;END=103587690;SVLEN=23;SVANN=TANDEM GT:AD:DP 0/1:21,9:30 0/1:13,19:32 0/1:27,6:33 274 | chr9 93095871 pbsv.DEL.35940 CGAGTAAGGGGCGGGGCCCGGTGGGGGGTTCCTT C . PASS SVTYPE=DEL;END=93095904;SVLEN=-33;SVANN=TANDEM GT:AD:DP 1/1:1,27:28 1/1:0,35:35 1/1:1,37:38 275 | chr9 94661588 pbsv.DEL.35969 AGTTCCACCTCCCGGGTTCACGCCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCGCCCGCCACTATGCCTGGCTAATTTTTTTGTATTTTTAGTAGGGACAGC A . PASS SVTYPE=DEL;END=94661701;SVLEN=-113;SVANN=TANDEM GT:AD:DP 0/0:28,0:28 0/1:11,22:33 0/0:34,0:34 276 | chr9 104833906 pbsv.DEL.36149 CTGCACAGATGTGCTGCCGGAAAAACAGCAGCTGCAAACTCAGCAAAAAGGAGAGGCATTTGCAACTCCTGTCACAGCTTCGACAGAGAAGATGAAGCTCTGGTTCTACCGAATTGGAGTCTCCAAGGGTCTGAAAACTGCTGCTTTATAAAAAAATCAGTCTTAGTTTCACTAACATTATAAGCTTTTCCCCCCTTGCGCCTTTGTATTTATTATTTCAATGGCAGCATAATGATTGTATAGAGTATATGTAATTTCATTAATTATTCCTATAGTGCTGAACATTTAGTCTATTTCCAATTTCTCACAATAACAATATAGCAATAACATTTTCTCTCATATGGTGACTTCTTTAAAATTATTTCAGTAGGCTACATCCCTGGGGGTGGGTGGAATGGAGGTACGCAGGATAGCATATTTTAAAATGGTTCTTTAAATTTTCATTCCAAATTGCTTTCCAATAGAGAATAGCAATTTATACCATCACCAGGAAGGTATGTATGTATTTTTGTTAACATAACAGACACAAAATTGTACCTCACAAGTTCACAGAGGCTAGGGTTAGCAATTCTGAAGCCACTTTATTTGGAACAAACAAAAGCAGAGTGAAGAGACGCAGCCTTTAGGGTGAAGTCGGCCCTGCCCAGTATGGTTTCTGAACAACATGGCCAAAAAGAATAGCAAGACTAGAAAGATGCGCCCACTGAGATCACTGAGCACGTGGCCTCTCTGGGGTGCTCGTGAGGGGCAGGAGACCTAAGACTGCCCGCAGGGGTGGAGGCAGAGAGAAAACAGGTCCCGAGGATGGGTGGCTAAAAGCAGCTCAGAAAAGAGAGCAAATCAGTGTCAGAGGGTTCCAAGACTTGCCCGAGAGAGGTGCCCTGGGACCTGTGGGGTGAGGCAACATGGGTAGACAG C . PASS SVTYPE=DEL;END=104834822;SVLEN=-916 GT:AD:DP 0/1:21,4:25 0/0:32,0:32 0/1:19,20:39 277 | chr10 3060727 pbsv.DEL.37394 TTCATTCACTCACTCATTCGGTCACTCAC T . PASS SVTYPE=DEL;END=3060755;SVLEN=-28;SVANN=TANDEM GT:AD:DP 0/1:14,10:24 0/0:32,0:32 0/1:12,16:28 278 | chr10 3222966 pbsv.INS.37438 C CCATGAAGATAGTCACTTTGACCAAA . PASS SVTYPE=INS;END=3222966;SVLEN=25 GT:AD:DP 1/1:0,34:34 1/1:1,27:28 1/1:0,31:31 279 | chr10 7719923 pbsv.INS.37635 A AAAGTGTGGGAGGGTGGGAGTGGGGAGACTTGGGAGAAGTGTGGGAGGGTGGGAGTGGGGAGACTTGGGAG . PASS SVTYPE=INS;END=7719923;SVLEN=70;SVANN=TANDEM GT:AD:DP 1/1:4,26:30 1/1:1,26:27 0/1:18,13:31 280 | chr10 32675290 pbsv.DEL.38239 ATTACCATAAAGTTAAAATTACAGTTACGA A . PASS SVTYPE=DEL;END=32675319;SVLEN=-29 GT:AD:DP 1/1:0,26:26 1/1:0,28:28 0/1:20,14:34 281 | chr10 55958447 pbsv.INS.38892 C CACACACACACAGAGAGAGAGAGAGAG . PASS SVTYPE=INS;END=55958447;SVLEN=26;SVANN=TANDEM GT:AD:DP 0/1:18,14:32 1/1:1,31:32 0/1:19,15:34 282 | chr10 56989023 pbsv.DEL.38902 AGGAGGAGGAGGAAGAGGAGGG A . PASS SVTYPE=DEL;END=56989044;SVLEN=-21;SVANN=TANDEM GT:AD:DP 0/1:16,13:29 0/0:32,0:32 1/1:0,30:30 283 | chr10 95672160 pbsv.INS.39591 T TTGTGTGTGTGTGTGTGTGTG . PASS SVTYPE=INS;END=95672160;SVLEN=20 GT:AD:DP 1/1:1,30:31 1/1:1,32:33 1/1:0,35:35 284 | chr10 133618547 pbsv.DEL.40597 CGTTCACTGTGGCCAGCCTGGGT C . PASS SVTYPE=DEL;END=133618569;SVLEN=-22;SVANN=TANDEM GT:AD:DP 0/0:35,0:35 0/1:18,14:32 0/0:35,0:35 285 | chr11 71773144 pbsv.DEL.42485 AATATAATATAATATAATATAATATATTATA A . PASS SVTYPE=DEL;END=71773174;SVLEN=-30;SVANN=TANDEM GT:AD:DP 0/0:21,0:21 0/0:33,0:33 0/1:17,16:33 286 | chr11 109839233 pbsv.DEL.43256 CATATATATATATATATATATATAT C . PASS SVTYPE=DEL;END=109839257;SVLEN=-24;SVANN=TANDEM GT:AD:DP 1/1:0,31:31 1/1:0,31:31 1/1:0,32:32 287 | chr11 112460033 pbsv.DEL.43297 GATAGATATAGATATAGATATAGAT G . PASS SVTYPE=DEL;END=112460057;SVLEN=-24;SVANN=TANDEM GT:AD:DP 1/1:2,31:33 1/1:0,29:29 1/1:0,31:31 288 | chr12 97091471 pbsv.INS.45925 A AAAACAGAAAGAAATTTAATC . PASS SVTYPE=INS;END=97091471;SVLEN=20 GT:AD:DP 0/1:17,14:31 0/1:16,14:30 0/0:32,0:32 289 | chr12 127018092 pbsv.DEL.46582 GCCCCCGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGCGGTGGCGGGTGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATGGCGTGAACCCGGGAAGCAGAGCTTGCAGTGAGCCGAGATTGCGCCACTGCAGTCCGCAGTCCGGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAACCCAGCTG G . PASS SVTYPE=DEL;END=127018320;SVLEN=-228 GT:AD:DP 0/1:12,16:28 0/1:20,13:33 0/0:34,0:34 290 | chr12 132727539 pbsv.INS.47071 C CGCCCGGGCGGAGGAGAGGCATTGCTGAACCCTGGCACCGCGGGCACATGCGCCCGGACGGAGGAGACGCACTGCTGAACCCTGGCACCGTGGGCACACGCGCCCGGGCGGAGGAGAGGCACTGCTGAACCCTGGCACCGCGGGCACACGCGCCCGGATGGAGGAGATGCACTGCTGAACCCTGGCACCGTGGGCACACGT . PASS SVTYPE=INS;END=132727539;SVLEN=200;SVANN=TANDEM GT:AD:DP 1/1:1,17:18 0/1:18,7:25 0/1:16,6:22 291 | chr13 68233699 pbsv.INS.48198 A ACACACATATATATATACACACACATATATATGTGTATATATATACACGTATATATACACACACATATATAAGTGTGTGTATATATATATATACACACACATATTATACACACACATATATATGTGTGTG . PASS SVTYPE=INS;END=68233699;SVLEN=129;SVANN=TANDEM GT:AD:DP 1/1:4,21:25 1/1:1,37:38 1/1:1,33:34 292 | chr13 97326099 pbsv.DEL.48777 AGTCCAGGTGATGCAGTGGAAT A . PASS SVTYPE=DEL;END=97326120;SVLEN=-21 GT:AD:DP 0/0:25,0:25 0/1:17,13:30 0/0:33,0:33 293 | chr15 32204311 pbsv.INS.51862 C CGAGAGAGGGTGAGAGAGAGAGCGAGAGAGAGGGTGAGAGAGAGAGCGAGAGAGAGAGCGAGAGAGAGGGTGAGAGAGGGTGAGAGAGAGGGCGAGAGAGCGAGAGAGAGAGCGAGAGAGAGGGTGAGAGAGCGAGAGAGAGGGTGAGAGAGAGCGAGAGAGAGGGTGAGAGAGAGCGAGAGAGAGGGTGAGAGAGAGCAGAGCGAGAGAGAGGGCGAGAGAGAGGGCGAGAGGGAGGGTGAGAGAGAGGGTGAGAGAGAGAGTGAGAGAGGGTGAGAGAGAGGGTGAGAGAGGGTGAGAGAGGGTGAGAGAGAGGGTGAGAGAGGGTGAGAGAGAGGGTGAGAGAGGGTGAGAGAGAGGGTGAGAGAGAGAGCAGAGCGAGAGAGAGGGCGAGAGAGAGGGCGAGAGAGAGAGCGAGAGGGAGGGTGAGAGAGAGGGTGAGAGAGAGCGAGAGAGAGGGCGAGAGAGAGGGTGAGAGAGGGTGAGAGAGAGCGAGAGAGAGGGTGAGAGAGAGAGCAGAGCGAGAGAGAGGGCGAGAGGGCGAGAGAGAGAGCGAGAGGGAGGGTGAGAGAGAGGGTGAGAGAGAGAGTGAGAGAGGGTGAGAGAGAGGGTGAGAGAGAGAGTGAGAGAGGGTGAGAGAGAGGGTGAGAGAGAGGGTGAGAGAGGGTGAGAGAGAGAGGGAGGGTGAGAGAGAGAGTGAGAGAGGGTGAGAGAGAGGGT . PASS SVTYPE=INS;END=32204311;SVLEN=719;SVANN=TANDEM GT:AD:DP 0/0:3,0:3 0/1:3,1:4 0/1:2,3:5 294 | chr15 63744136 pbsv.DEL.52419 GTGTGTGTGTGTGTGTGTGTGTGTGTGTGTCTCTC G . PASS SVTYPE=DEL;END=63744170;SVLEN=-34;SVANN=TANDEM GT:AD:DP 0/1:18,15:33 0/1:19,10:29 1/1:0,34:34 295 | chr15 66478417 pbsv.INS.52484 C CACAGGTATATATATATACACACAGGTATATATATAT . PASS SVTYPE=INS;END=66478417;SVLEN=36;SVANN=TANDEM GT:AD:DP 0/0:28,0:28 0/0:28,0:28 0/1:19,10:29 296 | chr15 99089497 pbsv.INS.53176 G GGCAATTATTAAGTTCAAGAAGCC . PASS SVTYPE=INS;END=99089497;SVLEN=23 GT:AD:DP 0/1:15,17:32 0/1:13,19:32 0/0:33,0:33 297 | chr16 14353905 pbsv.INS.53799 A AGAAAGTGGGGAGAGGAGGGG . PASS SVTYPE=INS;END=14353905;SVLEN=20;SVANN=TANDEM GT:AD:DP 0/1:10,12:22 0/1:4,11:15 0/1:14,4:18 298 | chr16 78968086 pbsv.DEL.54938 ATGGTCCGCATGGCACAGCGCG A . PASS SVTYPE=DEL;END=78968107;SVLEN=-21;SVANN=TANDEM GT:AD:DP 0/1:18,14:32 0/1:16,13:29 1/1:0,30:30 299 | chr16 89901414 pbsv.DEL.55423 TGAGGGGCTGTGAGGGGTGAGGGTGAAATCCCTCCTTAAGACGGGCCTCCGGCCGGGCGCGGTGGCTCACGCCTGTAATCCTAGCACTTTGGGAGGCCGAGGTGGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGACTAACACGGTGAAACCACGTCTCTACTAAAAATACAAAAAATTAGCCGGGCGTGTTGGCGGGCACCTGTAGTCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATGGCATGAACCCAGGAGGCAGAGCTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCAACATAGTGAGACTCCGTCTCAAAAAAAAAGAAAAAAAAAAAAAAGACGGGCCTCCTCCGAGGGGCTGTGAGGGGTGAGGGTGAAATCCCTCCTTAAGACGGGCCTCCTCCGAGGGGCTGTGAGGGGTGAGGGTGAAATCCCTCCTTAAGATGGGCCTCCTCC T . PASS SVTYPE=DEL;END=89901886;SVLEN=-472;SVANN=TANDEM GT:AD:DP 0/0:30,0:30 0/0:34,0:34 0/1:8,10:18 300 | chr17 40444922 pbsv.DEL.56640 CACACAGAGACAGAGAGAGAGAGAG C . PASS SVTYPE=DEL;END=40444946;SVLEN=-24;SVANN=TANDEM GT:AD:DP 0/1:14,19:33 0/0:24,0:24 1/1:1,32:33 301 | chr17 42436027 pbsv.INS.56715 G GGTATGGTATGGTATGGTATT . PASS IMPRECISE;SVTYPE=INS;END=42436027;SVLEN=20;SVANN=TANDEM GT:AD:DP 0/0:20,0:20 0/1:21,6:27 0/0:27,0:27 302 | chr17 45956548 pbsv.DEL.56810 TTATATATATATATATATATATATATATATA T . PASS SVTYPE=DEL;END=45956578;SVLEN=-30;SVANN=TANDEM GT:AD:DP 1/1:0,26:26 1/1:1,30:31 1/1:0,34:34 303 | chr17 76526892 pbsv.INS.57459 G GGGCGCCCTTGGTGTGGGAGAGGGGATGGGGCGCCCTTGGTGTGGGAGAGGGGATGGGGCGCCCTTGGTGTGGGACAGGGGATGGGGCGCCCTTGGTGTGGGACAGGGGATGGGGCGCCCTTGGTGTGGGACAGGGGATGGGGCGCCCTTGGTGTGGGACAGGGGATGGGGCGCCCTTGGTGTGGGACAGGGGAT . PASS SVTYPE=INS;END=76526892;SVLEN=194;SVANN=TANDEM GT:AD:DP 1/1:2,23:25 1/1:1,26:27 1/1:1,24:25 304 | chr18 41055532 pbsv.INS.58700 A ATTATTACATTCAAATGCATTTGTT . PASS SVTYPE=INS;END=41055532;SVLEN=24 GT:AD:DP 1/1:0,22:22 1/1:1,26:27 1/1:0,30:30 305 | chr18 42608374 pbsv.DEL.58717 CTCTTTCTTTCTTTCTCTTTCTTTCTTTCTT C . PASS SVTYPE=DEL;END=42608404;SVLEN=-30;SVANN=TANDEM GT:AD:DP 1/1:1,24:25 0/1:16,13:29 1/1:5,25:30 306 | chr18 69606273 pbsv.DEL.59312 AAAATAAATAAATAAATAAATAAATAAATAAAT A . PASS SVTYPE=DEL;END=69606305;SVLEN=-32;SVANN=TANDEM GT:AD:DP 0/1:16,10:26 0/1:17,15:32 1/1:1,33:34 307 | chr19 6270193 pbsv.DEL.60141 GCATTCTTTCTGCCCCAAATATTGGGAATTCTTCACATGGCACA G . PASS SVTYPE=DEL;END=6270236;SVLEN=-43 GT:AD:DP 0/0:31,0:31 0/0:29,0:29 0/1:17,13:30 308 | chr19 34948296 pbsv.DEL.61029 CAAAAAAAAAAAAAAAAAAAAAAAAAA C . PASS SVTYPE=DEL;END=34948322;SVLEN=-26 GT:AD:DP 0/0:25,0:25 0/1:15,15:30 0/0:31,0:31 309 | chr19 39905478 pbsv.INS.61201 G GAGAAAAGTCAATGCGGCCAGGTGCAGTGGCTCACGCCTGTAATCCTAGCGCTTTGGGAGGCCAAGGCGGGTGGATCACAAGGTCAGGCGTTCAAGACCAGCCTGGCCAACATGGTGAAACCCTGTCTCTACTAAATAGAAAAATTAGCCAGGTGTTGTGGTGGGCGCCTGTAATCCCAGCTACTTGGGAGGCTGAGGCAGGAGAATCCACTGAACCCAGGAAGCGGAGGTTGTAGTGAGCCCAGATCGCACCATTGCGTTGCAGCCTGGGCGACAGAGCCATACTCCATCTCAAAAAAAAAAAAAAAAGAAAA . PASS SVTYPE=INS;END=39905478;SVLEN=313 GT:AD:DP 1/1:3,26:29 1/1:4,29:33 1/1:3,29:32 310 | chr20 10963125 pbsv.DEL.62164 ATAGATGAACATGTGACCCACGCTCAC A . PASS SVTYPE=DEL;END=10963151;SVLEN=-26 GT:AD:DP 0/1:17,16:33 0/0:35,0:35 0/1:17,15:32 311 | chr20 22764133 pbsv.INS.62401 T TATATATATATATATATATAT . PASS SVTYPE=INS;END=22764133;SVLEN=20 GT:AD:DP 1/1:0,31:31 1/1:1,28:29 1/1:1,29:30 312 | chr20 57979495 pbsv.DEL.63378 TCCAGTGTGCCATCCGGTCTGTCTGCTCCCCACTGTGGTCTTAGTGGCCAGTGTGGCGTCTGGTCTGTCTGGTCCCCACTGTGGTCTTAGTGG T . PASS SVTYPE=DEL;END=57979587;SVLEN=-92;SVANN=TANDEM GT:AD:DP 0/0:28,0:28 0/0:30,0:30 0/1:16,18:34 313 | chr21 44993875 pbsv.INS.65035 T TCCTGTCCCCACCTGCCCCGTCCCCCACCTGCCCCATCCCCACCTGCCCCATCCCCCTGCCTGCCCCATCCCCTCACCTGCCCCATCCCCCACCTGCCCCATCCCCCACCTGCCCCATCCCCCCCACCTGCCCCATCCCCCACCTGCCCCATCCCCCCACCTACCCCATCACCTGTCCTGTCCACCACCTGTCCTGTCCTCCACCTGTCCACCACCTGTCCTGTCCTCACCTGCCCCATCCCAACCTAC . PASS SVTYPE=INS;END=44993875;SVLEN=248 GT:AD:DP 1/1:2,23:25 1/1:1,25:26 1/1:2,25:27 314 | chr21 45990464 pbsv.INS.65131 G GGACGGGGAGGGATGGGGTGGACAGCGTGAAGGTGACCGGGGAGGGATGGGGTGGACAGTGTGAAGGTGACCAGGGGAAGGACGGGGAGGGACGGGGAGG . PASS SVTYPE=INS;END=45990464;SVLEN=99;SVANN=TANDEM GT:AD:DP 0/1:13,12:25 0/1:12,14:26 0/1:11,15:26 315 | chr21 46091266 pbsv.INS.65153 T TGATGGTGATGATGGTGGTGATGATGGTGAAGGTGGTGATGGTGGTGGTAGTAGTGGTGATGGTGGC . PASS SVTYPE=INS;END=46091266;SVLEN=66;SVANN=TANDEM GT:AD:DP 1/1:4,21:25 1/1:6,26:32 1/1:3,25:28 316 | chr22 23462244 pbsv.INS.65546 A ATGGATGAAGCCCCCAGCCCG . PASS SVTYPE=INS;END=23462244;SVLEN=20;SVANN=TANDEM GT:AD:DP 0/1:20,10:30 0/1:14,12:26 0/0:34,0:34 317 | chr22 40170331 pbsv.DEL.65965 TTATATATAGTTTATATATATATTATGTATATAGTTTATATATATATTATATATATAGTTTATATATATATTATATATATAGTTTATATATATATTATATATATAGTTTATATATATATTA T . PASS SVTYPE=DEL;END=40170451;SVLEN=-120;SVANN=TANDEM GT:AD:DP 0/1:18,13:31 1/1:1,30:31 0/0:30,0:30 318 | chr22 48910763 pbsv.DEL.66387 CCCCAGATTCTGAAATCTTTCATTGTGGTTGAAGTCTCCCCTCCCGA C . PASS SVTYPE=DEL;END=48910809;SVLEN=-46;SVANN=TANDEM GT:AD:DP 0/1:17,11:28 0/1:16,13:29 0/0:25,0:25 319 | chr22 48933096 pbsv.INS.66485 T TCTTGGAAGCACAAAAGTTGTTAA . PASS SVTYPE=INS;END=48933096;SVLEN=23 GT:AD:DP 0/1:15,11:26 0/0:32,0:32 0/1:18,16:34 320 | chr22 50424683 pbsv.INS.66575 G GGCGTGTGGAAGGTCCGTCATTTT . PASS IMPRECISE;SVTYPE=INS;END=50424683;SVLEN=23;SVANN=TANDEM GT:AD:DP 0/1:19,9:28 0/1:15,18:33 0/1:21,12:33 321 | chrX 1735804 pbsv.DEL.66719 TTTCTCCCTCCTCCTCTTTCTCCCTCCTCCGCTTTCTCCCTCCTCCGCTTTCTCCCTCCTCCG T . PASS SVTYPE=DEL;END=1735866;SVLEN=-62;SVANN=TANDEM GT:AD:DP 0/0:10,0:10 0/0:12,0:12 0/1:8,3:11 322 | chrX 2243785 pbsv.DEL.66910 CCTGTTTTTTTTTTTTTTTTT C . PASS SVTYPE=DEL;END=2243805;SVLEN=-20;SVANN=TANDEM GT:AD:DP 0/1:14,16:30 1/1:0,32:32 0/0:27,0:27 323 | chrX 87326502 pbsv.DEL.68049 ACATATATATGTGTGTGTGTGTG A . PASS SVTYPE=DEL;END=87326524;SVLEN=-22;SVANN=TANDEM GT:AD:DP 0/1:27,1:28 0/0:25,0:25 0/1:14,19:33 324 | -------------------------------------------------------------------------------- /test-data/GIAB_PBSV_TRIO_CALLS_TEST2.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##fileDate=2020-03-04T19:05:39.98Z 3 | ##source=pbsv 2.3.0 (commit v2.3.0) 4 | ##PG="pbsv call -j 16 -t DEL,INS,INV -m 20 -A 3 -O 3 --call-min-read-perc-one-sample 20 /pbi/dept/secondary/siv/references/human_GRCh38_no_alt_analysis_set/sequence/human_GRCh38_no_alt_analysis_set.fasta /pbi/dept/bifx/awenger/prj/giab/20200303_PacBio_pbsv/svsig/AJTrio_GRCh38.fofn /pbi/dept/bifx/awenger/prj/giab/20200303_PacBio_pbsv/vcf/AJTrio_GRCh38.pbsv.vcf" 5 | ##INFO= 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##ALT= 15 | ##ALT= 16 | ##ALT= 17 | ##FILTER= 18 | ##FILTER== 50 Ns) in the reference assembly"> 19 | ##FILTER= 20 | ##FILTER= 21 | ##FILTER= 22 | ##FORMAT= 23 | ##FORMAT= 24 | ##FORMAT= 25 | ##FORMAT= 26 | ##FORMAT= 27 | ##reference=file:///pbi/dept/secondary/siv/references/human_GRCh38_no_alt_analysis_set/sequence/human_GRCh38_no_alt_analysis_set.fasta 28 | ##contig= 29 | ##contig= 30 | ##contig= 31 | ##contig= 32 | ##contig= 33 | ##contig= 34 | ##contig= 35 | ##contig= 36 | ##contig= 37 | ##contig= 38 | ##contig= 39 | ##contig= 40 | ##contig= 41 | ##contig= 42 | ##contig= 43 | ##contig= 44 | ##contig= 45 | ##contig= 46 | ##contig= 47 | ##contig= 48 | ##contig= 49 | ##contig= 50 | ##contig= 51 | ##contig= 52 | ##contig= 53 | ##contig= 54 | ##contig= 55 | ##contig= 56 | ##contig= 57 | ##contig= 58 | ##contig= 59 | ##contig= 60 | ##contig= 61 | ##contig= 62 | ##contig= 63 | ##contig= 64 | ##contig= 65 | ##contig= 66 | ##contig= 67 | ##contig= 68 | ##contig= 69 | ##contig= 70 | ##contig= 71 | ##contig= 72 | ##contig= 73 | ##contig= 74 | ##contig= 75 | ##contig= 76 | ##contig= 77 | ##contig= 78 | ##contig= 79 | ##contig= 80 | ##contig= 81 | ##contig= 82 | ##contig= 83 | ##contig= 84 | ##contig= 85 | ##contig= 86 | ##contig= 87 | ##contig= 88 | ##contig= 89 | ##contig= 90 | ##contig= 91 | ##contig= 92 | ##contig= 93 | ##contig= 94 | ##contig= 95 | ##contig= 96 | ##contig= 97 | ##contig= 98 | ##contig= 99 | ##contig= 100 | ##contig= 101 | ##contig= 102 | ##contig= 103 | ##contig= 104 | ##contig= 105 | ##contig= 106 | ##contig= 107 | ##contig= 108 | ##contig= 109 | ##contig= 110 | ##contig= 111 | ##contig= 112 | ##contig= 113 | ##contig= 114 | ##contig= 115 | ##contig= 116 | ##contig= 117 | ##contig= 118 | ##contig= 119 | ##contig= 120 | ##contig= 121 | ##contig= 122 | ##contig= 123 | ##contig= 124 | ##contig= 125 | ##contig= 126 | ##contig= 127 | ##contig= 128 | ##contig= 129 | ##contig= 130 | ##contig= 131 | ##contig= 132 | ##contig= 133 | ##contig= 134 | ##contig= 135 | ##contig= 136 | ##contig= 137 | ##contig= 138 | ##contig= 139 | ##contig= 140 | ##contig= 141 | ##contig= 142 | ##contig= 143 | ##contig= 144 | ##contig= 145 | ##contig= 146 | ##contig= 147 | ##contig= 148 | ##contig= 149 | ##contig= 150 | ##contig= 151 | ##contig= 152 | ##contig= 153 | ##contig= 154 | ##contig= 155 | ##contig= 156 | ##contig= 157 | ##contig= 158 | ##contig= 159 | ##contig= 160 | ##contig= 161 | ##contig= 162 | ##contig= 163 | ##contig= 164 | ##contig= 165 | ##contig= 166 | ##contig= 167 | ##contig= 168 | ##contig= 169 | ##contig= 170 | ##contig= 171 | ##contig= 172 | ##contig= 173 | ##contig= 174 | ##contig= 175 | ##contig= 176 | ##contig= 177 | ##contig= 178 | ##contig= 179 | ##contig= 180 | ##contig= 181 | ##contig= 182 | ##contig= 183 | ##contig= 184 | ##contig= 185 | ##contig= 186 | ##contig= 187 | ##contig= 188 | ##contig= 189 | ##contig= 190 | ##contig= 191 | ##contig= 192 | ##contig= 193 | ##contig= 194 | ##contig= 195 | ##contig= 196 | ##contig= 197 | ##contig= 198 | ##contig= 199 | ##contig= 200 | ##contig= 201 | ##contig= 202 | ##contig= 203 | ##contig= 204 | ##contig= 205 | ##contig= 206 | ##contig= 207 | ##contig= 208 | ##contig= 209 | ##contig= 210 | ##contig= 211 | ##contig= 212 | ##contig= 213 | ##contig= 214 | ##contig= 215 | ##contig= 216 | ##contig= 217 | ##contig= 218 | ##contig= 219 | ##contig= 220 | ##contig= 221 | ##contig= 222 | ##contig= 223 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG002 HG003 HG004 224 | chr1 909169 pbsv.DEL.39 CTTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGTGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGGGGTCGGGGTCAGGCCCCCGGGCGCACCGTTGCTGGTATATGCGGTGGTCGGGGTCAGGCCCCCGGGCGCATCTTTGCTGGTATATGCGGTGGTCGGGGTCAGGCCCCCCGGGCGCACT C . PASS SVTYPE=DEL;END=909480;SVLEN=-311;SVANN=TANDEM GT:AD:DP 1/1:1,23:24 0/1:17,13:30 0/1:11,16:27 225 | chr1 37128264 pbsv.INS.1199 A AGGCTACAGTTCCCACTCCCCGCCACTCCGCGGGCTACAGTTCCCACTCCCCGCCACTCCGCGGGCTACAGTTCCCACTCCCCGCCACTCCGCGGGCTACAGTTCCCACTCCCCGCCACTCCGCGGGCTACAGTTCCCACTCCCCGCCACTCCGCGGGCTACAGTTCCCACTCCCCGCCACTCCGCG . PASS SVTYPE=INS;END=37128264;SVLEN=186;SVANN=TANDEM GT:AD:DP 0/1:15,10:25 0/1:23,11:34 1/1:2,31:33 226 | chr22 48910763 pbsv.DEL.66387 CCCCAGATTCTGAAATCTTTCATTGTGGTTGAAGTCTCCCCTCCCGA C . PASS SVTYPE=DEL;END=48910809;SVLEN=-46;SVANN=TANDEM GT:AD:DP 0/1:17,11:28 0/1:16,13:29 0/0:25,0:25 227 | chr7 37848002 pbsv.INS.28007 G GAGGAAGAAGAGGAGGAAGAGGAGGAGGAGGAAGAGGAAGAGGAAGAAGAGGAAGAGGAAGAGGAAGAAGAAGAGAAAGAAGAAGAAGAAAAGAAGAAGAAAGAAGAAAAGAAGAGGAAAAGAAGAAGAAAAGAAGAAGAAAAGAAGAAGAAGAAGGAAGAAGAGGAAGAGGAAGAGGAAGAAGAAGAAGAAGAAAG . PASS SVTYPE=INS;END=37848002;SVLEN=196;SVANN=TANDEM GT:AD:DP 1/1:5,21:26 0/1:16,13:29 0/1:13,14:27 228 | -------------------------------------------------------------------------------- /test-data/GIAB_PBSV_TRIO_CALLS_TEST2_regions.bed: -------------------------------------------------------------------------------- 1 | chr1 908969 909369 2 | chr1 37128064 37128464 3 | chr22 48910563 48910963 4 | chr7 37847802 37848202 5 | -------------------------------------------------------------------------------- /test-data/README.md: -------------------------------------------------------------------------------- 1 | # Test data 2 | 3 | # GIAB_PBSV_TRIO_CALLS.vcf 4 | 5 | Genome in a bottle VCF of HG02/030/04. One hundred events were randomly picked. 6 | 7 | # GIAB_PBSV_TRIO_CALLS_TEST2.vcf 8 | 9 | Genome in a bottle VCF of HG02/030/04. Two insertion and deletions (hom/het for each). 10 | 11 | event_four.bam 12 | event_one.bam 13 | event_three.bam 14 | event_two.bam 15 | 16 | 17 | # GIAB_PBSV_TRIO_CALLS_TEST2_regions.bed 18 | 19 | Adding 200bp to VCF flanks -------------------------------------------------------------------------------- /test-data/event_four.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collaborativebioinformatics/nibSV/3fec3eb8fda8ee1c21879287f4c5660dd3961998/test-data/event_four.bam -------------------------------------------------------------------------------- /test-data/event_four.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collaborativebioinformatics/nibSV/3fec3eb8fda8ee1c21879287f4c5660dd3961998/test-data/event_four.bam.bai -------------------------------------------------------------------------------- /test-data/event_one.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collaborativebioinformatics/nibSV/3fec3eb8fda8ee1c21879287f4c5660dd3961998/test-data/event_one.bam -------------------------------------------------------------------------------- /test-data/event_one.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collaborativebioinformatics/nibSV/3fec3eb8fda8ee1c21879287f4c5660dd3961998/test-data/event_one.bam.bai -------------------------------------------------------------------------------- /test-data/event_three.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collaborativebioinformatics/nibSV/3fec3eb8fda8ee1c21879287f4c5660dd3961998/test-data/event_three.bam -------------------------------------------------------------------------------- /test-data/event_three.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collaborativebioinformatics/nibSV/3fec3eb8fda8ee1c21879287f4c5660dd3961998/test-data/event_three.bam.bai -------------------------------------------------------------------------------- /test-data/event_two.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collaborativebioinformatics/nibSV/3fec3eb8fda8ee1c21879287f4c5660dd3961998/test-data/event_two.bam -------------------------------------------------------------------------------- /test-data/event_two.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/collaborativebioinformatics/nibSV/3fec3eb8fda8ee1c21879287f4c5660dd3961998/test-data/event_two.bam.bai -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | /all 2 | /t_* 3 | !/t_*.nim 4 | -------------------------------------------------------------------------------- /tests/all.nim: -------------------------------------------------------------------------------- 1 | # vim: sts=2:ts=2:sw=2:et:tw=0 2 | 3 | include "t_welcome.nim" 4 | include "t_kmers.nim" 5 | include "t_util.nim" 6 | include "t_svidx.nim" 7 | include "t_read.nim" 8 | include "t_refmers.nim" 9 | include "t_composer.nim" 10 | -------------------------------------------------------------------------------- /tests/config.nims: -------------------------------------------------------------------------------- 1 | switch("path", "$projectDir/../src") -------------------------------------------------------------------------------- /tests/foo.fasta: -------------------------------------------------------------------------------- 1 | >foo 2 | GATTACA 3 | -------------------------------------------------------------------------------- /tests/foo.fasta.fai: -------------------------------------------------------------------------------- 1 | foo 7 5 7 8 2 | -------------------------------------------------------------------------------- /tests/makefile: -------------------------------------------------------------------------------- 1 | UNAME=$(shell uname) 2 | ifeq (${UNAME},Darwin) 3 | install=install_name_tool -add_rpath /opt/local/lib 4 | else 5 | install=echo 6 | endif 7 | 8 | tests: 9 | nim c all.nim 10 | ${install} all 11 | ./all 12 | clean: 13 | rm -f all 14 | -------------------------------------------------------------------------------- /tests/nim.cfg: -------------------------------------------------------------------------------- 1 | path = "$projectPath/../src" 2 | -------------------------------------------------------------------------------- /tests/t_composer.nim: -------------------------------------------------------------------------------- 1 | import unittest 2 | import nibpkg/compose 3 | import hts 4 | 5 | suite "compose suite": 6 | # TODO this test only partially covers the function. We need to check the full 7 | # sequences. For now some tests are better than none. 8 | 9 | test "check that the variant haplotypes are correctly constructed": 10 | var variants: VCF 11 | doAssert(open(variants, "../test-data/GIAB_PBSV_TRIO_CALLS_TEST2.vcf")) 12 | 13 | 14 | for v in variants: 15 | var variant_type: string 16 | doAssert v.info.get("SVTYPE", variant_type) == Status.OK 17 | var variant_seq = composePositioned(v, "AAAAA", "TTTTT") 18 | if variant_type == "DEL": 19 | check(variant_seq.sequences.alt_seq == "AAAAATTTTT") 20 | if variant_type == "INS": 21 | check(variant_seq.sequences.alt_seq != "AAAAATTTTT") 22 | 23 | test "TODO check positions are correct!": 24 | echo "Please fill me in." 25 | test "TODO check full alleles are ok!": 26 | echo "Please fill me in." 27 | -------------------------------------------------------------------------------- /tests/t_kmers.nim: -------------------------------------------------------------------------------- 1 | # vim: sw=4 ts=4 sts=4 tw=0 et: 2 | from nibpkg/kmers import hash # avoiding "*" imports 3 | import unittest 4 | import deques 5 | import sequtils 6 | import sets 7 | #from strformat import fmt 8 | 9 | suite "kmers": 10 | test "bin_to_dna": 11 | check kmers.bin_to_dna(0, 1, kmers.forward) == "A" 12 | check kmers.bin_to_dna(1, 1, kmers.forward) == "C" 13 | check kmers.bin_to_dna(2, 1, kmers.forward) == "G" 14 | check kmers.bin_to_dna(3, 1, kmers.forward) == "T" 15 | 16 | check kmers.bin_to_dna(0b10001111000100, 7, kmers.forward) == "GATTACA" 17 | check kmers.bin_to_dna(0b10001111000100, 7, kmers.reverse) == "TGTAATC" 18 | 19 | test "dna_to_kmers": 20 | check kmers.dna_to_kmers("AAAA", 2).seeds.len() == 6 21 | 22 | test "sorted_kmers": 23 | let 24 | sq = "ATCGGCTACTATT" 25 | expected = [ 26 | "TCGGCTACTATT", 27 | "ATCGGCTACTAT", 28 | "ATCGGCTACTAT", 29 | "TCGGCTACTATT", 30 | ] 31 | k = 12 32 | var 33 | kms = kmers.dna_to_kmers(sq, k) 34 | check kms != nil 35 | let spot = kmers.initSpot(kms) # sort 36 | check kms == nil 37 | let got = sequtils.mapIt(spot.seeds, kmers.bin_to_dna(it.kmer, 38 | k.uint8, 39 | it.strand)) 40 | check got == expected 41 | #check kmers.haskmer("AGCCGATGATAA", kms) 42 | 43 | test "search": 44 | let 45 | sq = "ATCGGCTACTATT" 46 | k = 12 47 | qms = kmers.dna_to_kmers(sq, k) 48 | var 49 | kms = kmers.dna_to_kmers(sq, k) 50 | let spot = kmers.initSpot(kms) 51 | let hits = kmers.search(spot, qms) 52 | check hits.len() == 4 53 | #check sets.toSet(seqUtils.toSeq(hits)).len() == 4 # 4 unique items 54 | check sets.len(sets.toHashSet(seqUtils.toSeq(deques.items(hits)))) == 55 | 4 # same as above 56 | 57 | suite "kmers difference": 58 | let 59 | sq = "ATCGGCTACTATT" 60 | k = 12 61 | 62 | test "difference_of_self_is_nothing": 63 | let kms = kmers.dna_to_kmers(sq, k) 64 | #let qms = deepCopy(kms) 65 | var qms: kmers.pot_t 66 | deepCopy(qms, kms) 67 | check qms[] == kms[] 68 | check kmers.nkmers(qms) == 4 69 | check kmers.nkmers(kms) == 4 70 | let qspot = kmers.initSpot(qms) 71 | check qms == nil 72 | 73 | let kms0 = kmers.difference(kms, qspot) 74 | 75 | check kmers.nkmers(kms0) == 0 76 | check kmers.nkmers(kms) == 4 77 | check kmers.nkmers(qspot) == 4 78 | 79 | let 80 | expected: array[0, string] = [] 81 | got = kmers.get_dnas(kms0) 82 | check got == expected 83 | 84 | test "difference_of_nothing_is_self": 85 | let kms = kmers.dna_to_kmers(sq, k) 86 | var qms = kmers.dna_to_kmers("", k) 87 | #let orig = deepCopy(kms) 88 | var orig: kmers.pot_t 89 | deepCopy(orig, kms) 90 | check kmers.nkmers(qms) == 0 91 | check kmers.nkmers(kms) == 4 92 | let qspot = kmers.initSpot(qms) 93 | check qms == nil 94 | 95 | let kms4 = kmers.difference(kms, qspot) 96 | 97 | check kmers.nkmers(kms4) == 4 98 | check kmers.nkmers(kms) == 4 99 | let got = kmers.get_dnas(kms4) 100 | let expected = kmers.get_dnas(orig) 101 | check got == expected 102 | 103 | suite "kmer order": 104 | let dna = "ATGCGGACAGAAATATATACATAGAGACATACTCCCNAAAAAAAACTCAGAAGACACACATGCGCCC" 105 | let kms = kmers.dna_to_kmers(dna, 11) 106 | 107 | test "paired position of neg/pos strand": 108 | for i in countup(0, kms.seeds.len - 2, 2): 109 | check kms.seeds[i + 0].pos == kms.seeds[i + 1].pos 110 | 111 | test "increasing position": 112 | var lv: uint32 = 0 113 | for i in countup(2, kms.seeds.len - 2, 2): 114 | check (kms.seeds[i].pos > lv) 115 | lv = kms.seeds[i].pos 116 | 117 | #proc test_FS*(pot: pot_t) = 118 | # for i in pot.seeds: 119 | # echo bin_to_dna(i.kmer,pot.word_size,i.strand) 120 | 121 | suite "sparse_seeds": 122 | 123 | let dna = "CCCGAAAGTTT" 124 | let kms = kmers.dna_to_kmers(dna, 4) 125 | 126 | #test_FS(kms) 127 | 128 | test "spaced-seeds": 129 | let test_seeds = kmers.spacing_kmer(kms, 3) 130 | 131 | #echo dna.len, " ", test_seeds.seeds.len 132 | 133 | # 11 (dna len) - 2*4 (2 kmer lens) == 3 134 | # So with a space of 3, we have exactly 1 forward and 1 reverse seed. 135 | check(test_seeds.seeds.len == 2) 136 | 137 | block: # forward seed 138 | let x = 0 139 | let i = test_seeds.seeds[x] 140 | var myDNA = kmers.bin_to_dna(i.kmer, test_seeds.word_size, i.strand) 141 | #echo fmt"{myDNA} {dna} {test_seeds.word_size} {i.strand}" 142 | check((myDNA.len mod 2) == 0) 143 | check(myDNA == "CCCGGTTT") 144 | check(i.strand == kmers.forward) 145 | block: # reverse seed 146 | let x = 1 147 | let i = test_seeds.seeds[x] 148 | var myDNA = kmers.bin_to_dna(i.kmer, test_seeds.word_size, i.strand) 149 | #echo fmt"{myDNA} {dna} {test_seeds.word_size} {i.strand}" 150 | check((myDNA.len mod 2) == 0) 151 | check(myDNA == "CCCGGTTT") 152 | check(i.strand == kmers.reverse) 153 | -------------------------------------------------------------------------------- /tests/t_read.nim: -------------------------------------------------------------------------------- 1 | import unittest 2 | import nibpkg/read 3 | 4 | suite "read suite": 5 | test "test that filter read works": 6 | 7 | var r = Read(compatible_SVs: initCountTable[uint32]()) 8 | r.compatible_SVs.inc(23, 5) 9 | r.compatible_SVs.inc(22, 1) 10 | 11 | r.filter_read_matches() 12 | 13 | check r.compatible_SVs.len == 1 14 | check 23 in r.compatible_SVs 15 | 16 | -------------------------------------------------------------------------------- /tests/t_refmers.nim: -------------------------------------------------------------------------------- 1 | # vim: sw=2 ts=2 sts=2 tw=0 et: 2 | from nibpkg/refmers import nil 3 | from nibpkg/svidx import nil 4 | import unittest 5 | import json 6 | import os 7 | import strutils 8 | const thisdir = system.currentSourcePath.rsplit(DirSep, 1)[0] 9 | 10 | let original = """ 11 | { 12 | "kmerSize": 3, 13 | "counts": 14 | { 15 | "3": { 16 | "refCount": 0, 17 | "altCount": 0, 18 | "svs": [ 19 | ] 20 | }, 21 | "4": { 22 | "refCount": 0, 23 | "altCount": 0, 24 | "svs": [ 25 | ] 26 | } 27 | } 28 | } 29 | """ 30 | # 4 -> ACA forward 31 | # 3 -> ATT reverse 32 | 33 | let expected = """ 34 | { 35 | "kmerSize": 3, 36 | "counts": 37 | { 38 | "3": { 39 | "refCount": 1, 40 | "altCount": 0, 41 | "svs": [ 42 | ] 43 | }, 44 | "4": { 45 | "refCount": 1, 46 | "altCount": 0, 47 | "svs": [ 48 | ] 49 | } 50 | } 51 | } 52 | """ 53 | 54 | let original_spaced = """ 55 | { 56 | "kmerSize": 3, 57 | "counts": 58 | { 59 | "2244": { 60 | "refCount": 0, 61 | "altCount": 0, 62 | "svs": [ 63 | ] 64 | }, 65 | "3789": { 66 | "refCount": 0, 67 | "altCount": 0, 68 | "svs": [ 69 | ] 70 | } 71 | } 72 | } 73 | """ 74 | let expected_spaced = """ 75 | { 76 | "kmerSize": 3, 77 | "counts": 78 | { 79 | "2244": { 80 | "refCount": 1, 81 | "altCount": 0, 82 | "svs": [ 83 | ] 84 | }, 85 | "3789": { 86 | "refCount": 1, 87 | "altCount": 0, 88 | "svs": [ 89 | ] 90 | } 91 | } 92 | } 93 | """ 94 | 95 | suite "refmers": 96 | let 97 | fn = thisDir & "/foo.fasta" 98 | test "updateSvIndex": 99 | var idx = svidx.loadIndexFromJson(original) 100 | #let path = os.absolutePath("tests/foo.fasta") 101 | # foo.fasta contains "GATTACA", which matches 2 3-mers from our index: 102 | # "ACA" (==4 forward) 103 | # "ATT" (==3 reversed) 104 | var 105 | kmer_size = 3 106 | space = 0 107 | refmers.updateSvIndex(fn, idx, kmer_size, 0, space=space) 108 | #echo "result:", svidx.dumpIdxtoJson(idx) 109 | var result = svidx.dumpIndextoJson(idx) 110 | check json.parseJson(result) == json.parseJson(expected) 111 | test "updateSvIndex_spaced": 112 | var idx = svidx.loadIndexFromJson(original_spaced) 113 | #let path = os.absolutePath("tests/foo.fasta") 114 | # foo.fasta contains "GATTACA", which matches 2 3-mers: 115 | # "GATACA" (==2244 forward) 116 | # "GATACA" (==3789 reversed) 117 | var 118 | kmer_size = 3 119 | space = 1 120 | refmers.updateSvIndex(fn, idx, kmer_size, 0, space=space) 121 | #echo "result:", svidx.dumpIdxtoJson(idx) 122 | var result = svidx.dumpIndextoJson(idx) 123 | check json.parseJson(result) == json.parseJson(expected_spaced) 124 | -------------------------------------------------------------------------------- /tests/t_svidx.nim: -------------------------------------------------------------------------------- 1 | # vim: sw=1 ts=1 sts=1 tw=0 et: 2 | import unittest 3 | import nibpkg/svidx 4 | import tables 5 | 6 | suite "SvIndex suite": 7 | test "that sv insertion works": 8 | 9 | var idx: SvIndex 10 | idx.insert("ATCGGCTACTATT", 11, 2) 11 | 12 | for kmer, t in idx.counts: 13 | check t.svs == @[2'u32] 14 | 15 | test "that no ref insertion occurs unless kmer matches": 16 | 17 | var idx: SvIndex 18 | idx.insert("ATCGGCTACTATT", 11, -1) 19 | check idx.len == 0 20 | 21 | idx.insert("ATCGGCTACTATT", 11, 2) 22 | idx.insert("ATCGGCTACTATT", 11, -1) 23 | 24 | for kmer, t in idx.counts: 25 | check t.svs == @[2'u32] 26 | check t.refCount == 1'u32 27 | 28 | test "that filter removes SV entries with refcount gt zero": 29 | var idx: SvIndex 30 | 31 | idx.insert("ATCGGCTACTATT", 11, 2) 32 | idx.insert("ATCGGCTACTATT", 11, -1) 33 | 34 | filterRefKmers(idx, 0) 35 | check idx.len == 0; 36 | 37 | test "that filter does not SV entries with refcount <= two": 38 | var idx: SvIndex 39 | 40 | idx.insert("ATCGGCTACTATT", 11, 2) 41 | idx.insert("ATCGGCTACTATT", 11, -1) 42 | idx.insert("ATCGGCTACTATT", 11, -1) 43 | 44 | filterRefKmers(idx, 2) 45 | check idx.len == 6; 46 | 47 | test "that filter does not SV entries with refcount <= 0": 48 | var idx: SvIndex 49 | 50 | idx.insert("ATCGGCTACTATT", 11, 2) 51 | 52 | 53 | filterRefKmers(idx, 2) 54 | check idx.len == 6; 55 | -------------------------------------------------------------------------------- /tests/t_util.nim: -------------------------------------------------------------------------------- 1 | # vim: sw=4 ts=4 sts=4 tw=0 et: 2 | import nibpkg/util 3 | import unittest 4 | from strformat import fmt 5 | 6 | from os import nil 7 | from sequtils import nil 8 | from strutils import nil 9 | 10 | suite "util": 11 | test "thousands": 12 | check thousands(0) == "0" 13 | check thousands(1) == "1" 14 | check thousands(10) == "10" 15 | check thousands(100) == "100" 16 | check thousands(1_000) == "1,000" 17 | check thousands(10_000) == "10,000" 18 | check thousands(100_000) == "100,000" 19 | check thousands(1_000_000) == "1,000,000" 20 | check thousands(-1_000_000) == "-1,000,000" 21 | check thousands(-10_000) == "-10,000" 22 | check thousands(-1_000) == "-1,000" 23 | check thousands(-1) == "-1" 24 | check thousands(-0) == "0" 25 | test "splitWeighted": 26 | check splitWeighted(0, @[]) == [] 27 | check splitWeighted(0, @[42]) == [] 28 | check splitWeighted(1, @[42]) == [1] 29 | check splitWeighted(2, @[42, 2]) == [1, 1] 30 | check splitWeighted(3, @[1, 1, 1]) == [1, 1, 1] 31 | check splitWeighted(3, @[1, 1, 1, 1]) == [2, 1, 1] 32 | check splitWeighted(3, @[1, 1]) == [1, 1] 33 | check splitWeighted(1, @[1, 2, 3, 4]) == [4] 34 | check splitWeighted(2, @[1, 2, 3, 4]) == [3, 1] 35 | check splitWeighted(3, @[1, 2, 3, 4]) == [3, 1] # greedy 36 | check splitWeighted(4, @[1, 2, 3, 4]) == [2, 1, 1] # greedy, so order matters 37 | check splitWeighted(4, @[4, 3, 2, 1]) == [1, 1, 1, 1] # see? 38 | check splitWeighted(3, @[4, 3, 2, 1]) == [1, 1, 2] 39 | check splitWeighted(2, @[4, 3, 2, 1]) == [2, 2] 40 | check splitWeighted(1, @[4, 3, 2, 1]) == [4] 41 | check splitWeighted(4, @[4191650, 4009608, 4154778, 4096102]) == [1, 2, 1] # not very good 42 | test "partitionWeighted": 43 | check partitionWeighted(4, @[4191650, 4009608, 4154778, 4096102]) == @[@[1], @[3], @[2], @[0]] 44 | check partitionWeighted(4, @[4191650, 4009608, 4154778, 4096102, 99]) == @[@[1, 4], @[3], @[2], @[0]] 45 | check partitionWeighted(4, @[4_191_650, 4_009_608, 4_154_778, 4_096_102, 500_000]) == @[@[3], @[2], @[0], @[1, 4]] 46 | check partitionWeighted(2, @[1, 3, 5, 2, 4, 6]) == @[@[2, 4, 0], @[5, 1, 3]] 47 | check partitionWeighted(3, @[1, 2, 2]) == @[@[0], @[2], @[1]] 48 | test "combineToTarget": 49 | proc icombineToTarget(t: int, weights: seq[int]): seq[seq[int]] = 50 | return combineToTarget(t, sequtils.mapIt(weights, int64(it))) 51 | check icombineToTarget(3, @[2, 2, 2, 2]) == @[@[0, 1], @[2, 3]] 52 | check icombineToTarget(3, @[2, 2, 2]) == @[@[0, 1], @[2]] 53 | check icombineToTarget(3, @[2, 2]) == @[@[0, 1]] 54 | check icombineToTarget(2, @[2, 2, 2, 2]) == @[@[0], @[1], @[2], @[3]] 55 | check icombineToTarget(1, @[2, 2, 2, 2]) == @[@[0], @[1], @[2], @[3]] 56 | check icombineToTarget(4, @[2, 2, 2, 2]) == @[@[0, 1], @[2, 3]] 57 | check icombineToTarget(3, @[1, 2, 3, 4]) == @[@[0, 1], @[2], @[3]] 58 | check icombineToTarget(3, @[1, 1, 2, 1]) == @[@[0, 1, 2], @[3]] 59 | check icombineToTarget(3, @[1, 2, 1, 1]) == @[@[0, 1], @[2, 3]] 60 | 61 | test "sscanf": 62 | let s_frmt = strutils.format("%ld %$#[^\n]", 63 | (util.MAX_HEADROOM - 1)) 64 | var 65 | bufAname: util.Headroom 66 | name: string 67 | val: int32 68 | line: string 69 | line = "123 abc def" 70 | let scanned = util.sscanf(line.cstring, s_frmt.cstring, 71 | addr val, addr bufAname) 72 | check val == 123 73 | check scanned == 2 74 | util.toString(bufAname, name, line) 75 | check "abc def" == name 76 | 77 | test "isEmptyFile": 78 | let fn = "empty.txt" 79 | check os.execShellCmd("rm -f {fn}".fmt) == 0 80 | check os.execShellCmd("touch {fn}".fmt) == 0 81 | check isEmptyFile(fn) 82 | check os.execShellCmd("echo fuller >> {fn}".fmt) == 0 83 | check not isEmptyFile(fn) 84 | check os.execShellCmd("rm -f {fn}".fmt) == 0 85 | 86 | test "isOlderFile": 87 | let afn = "a.txt" 88 | let bfn = "b.txt" 89 | check os.execShellCmd("touch {afn}".fmt) == 0 90 | check os.execShellCmd("touch {bfn}".fmt) == 0 91 | # We cannot reliably test for strictly older because of fsys probs. 92 | check not isOlderFile(bfn, afn) 93 | check os.execShellCmd("touch {afn}".fmt) == 0 94 | check not isOlderFile(afn, bfn) 95 | check os.execShellCmd("rm -f {afn} {bfn}".fmt) == 0 96 | 97 | test "getNthWord": 98 | check getNthWord("a", 0, ' ') == "a" 99 | check getNthWord("a b", 0, ' ') == "a" 100 | check getNthWord("a b", 1, ' ') == "b" 101 | check getNthWord("a b ", 1, ' ') == "b" 102 | check getNthWord("ax bx cx", 0, ' ') == "ax" 103 | check getNthWord("ax bx cx", 1, ' ') == "bx" 104 | check getNthWord("ax bx cx", 2, ' ') == "cx" 105 | expect PbError: 106 | discard getNthWord("ax bx cx", 3, ' ') 107 | -------------------------------------------------------------------------------- /tests/t_welcome.nim: -------------------------------------------------------------------------------- 1 | # vim: sw=4 ts=4 sts=4 tw=0 et: 2 | from nibpkg/welcome import nil 3 | import unittest 4 | 5 | suite "welcome": 6 | test "home": 7 | assert 1 == 1 8 | assert welcome.getWelcomeMessage() == "Hello, World!" 9 | -------------------------------------------------------------------------------- /vendor/README.md: -------------------------------------------------------------------------------- 1 | ## Fetching 2 | These are not in Nimble, so we need to install them manually. 3 | 4 | * https://github.com/quinlan-lab/STRling.git 5 | * https://github.com/yglukhov/threadpools.git 6 | 7 | For now, we can use git-submodules. 8 | 9 | git submodule update --init 10 | --------------------------------------------------------------------------------