├── .gitmodules ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── gcp.Dockerfile ├── pipeline ├── GoodPanGenomeGraph.snakefile ├── LeaveOneOut.snakefile ├── RefGraph.snakefile ├── goodPanGenomeGraph.json ├── k17.json ├── k25.json ├── leaveOneOut.json └── refGraph.json ├── script ├── PlotRegression.py ├── SelectRegions.py ├── __init__.py ├── binaryIO.py ├── bubblecalling.py ├── chrsize.sh ├── danbing.call.py ├── eqtl.noPerm.py ├── fixMaskedFasta.py ├── getCovByLocus.397.sh ├── individualExpansion.py ├── jointExpansion.py ├── kmc2length.LOO.py ├── kmc2length.py ├── kmers.linreg.py ├── kmerutils.py ├── liftbed.clean.py ├── mergeMBEbed.py ├── multiBoundaryExpansion.parallel.py ├── multiBoundaryExpansion.parallel.ref_guided_single_hap.py ├── multiBoundaryExpansion.parallel.single_hap.py ├── multiBoundaryExpansion.py ├── multiBoundaryExpansion.single_process.py ├── parseMergeSet.py ├── prepareIndividualDatasets.py ├── prepareJointDatasets.py ├── prepareQCDatasets.py ├── rmLinebyIndFile.py ├── rmNAforBothBeds.py ├── sim.confusionMatrix.py ├── utils.py ├── vntrutils.py ├── writeBEbed.py └── writeBoundaryExpandedBeds.py ├── src ├── Num2seq.cpp ├── RVseq.cpp ├── Seq2num.cpp ├── aQueryFasta_thread.cpp ├── aQueryFasta_thread.h ├── bait.cpp ├── bam2pe.cpp ├── binaryKmerIO.hpp ├── fa2kmers.cpp ├── genPanKmers.cpp ├── kmer.hpp ├── kmerIO.hpp ├── kmertools.cpp ├── mapkmers.cpp ├── pred.cpp ├── pred.h └── sim_reads.cpp └── test ├── QC ├── fn1a.sim.sh ├── fn1b.annot.sh ├── fn1c.extract.sh ├── fn2a1.raw.map.sh ├── fn2a2.build.FPSkmer.sh ├── fn2a2.build.profile.sh ├── fn2a3.bait.map.sh └── input │ ├── HG002.0.fa │ ├── HG002.0.fa.fai │ ├── HG002.1.fa │ ├── HG002.1.fa.fai │ ├── genomes.txt │ ├── hs1.0.fa │ ├── hs1.0.fa.fai │ ├── hs1.1.fa │ ├── hs1.1.fa.fai │ ├── pan.graph.kmers │ ├── pan.graph.umap │ ├── pan.kmerDBi.umap │ ├── pan.kmerDBi.vv │ ├── pan.ntr.kmers │ ├── pan.tr.kmers │ └── pan.tr.mbe.v2.bed ├── goodPanGenomeGraph.json └── input ├── HG00514.0.fa ├── HG00514.0.fa.fai ├── HG00514.1.fa ├── HG00514.1.fa.fai ├── HG00514.filtered.reads.bam ├── HG00514.filtered.reads.bam.bai ├── HG00733.0.fa ├── HG00733.0.fa.fai ├── HG00733.1.fa ├── HG00733.1.fa.fai ├── HG00733.filtered.reads.bam ├── HG00733.filtered.reads.bam.bai ├── archive ├── HG00514.filtered.fasta.gz ├── HG00733.filtered.fasta.gz ├── getfasta.514.bed └── getfasta.733.bed ├── ctrl.bed ├── genome.test.txt ├── hg38.chr12.2155791.2356090.fasta ├── hg38.chr12.2155791.2356090.fasta.fai └── tr.bed /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "Eigen"] 2 | path = Eigen 3 | url = https://gitlab.com/libeigen/eigen 4 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y --no-install-recommends libz-dev libncurses5-dev libbz2-dev liblzma-dev libssl-dev make gcc g++ autoconf python3-pip && \ 5 | pip install numpy==1.23.3 pandas==1.5.0 scikit-learn==1.1.2 && \ 6 | apt-get clean && rm -rf /var/lib/apt/lists/* 7 | 8 | WORKDIR /opt 9 | 10 | COPY samtools-1.17.tar.bz2 . 11 | 12 | RUN bunzip2 samtools-1.17.tar.bz2 && \ 13 | tar xvf samtools-1.17.tar && \ 14 | cd samtools-1.17 && \ 15 | ./configure && make -j 8 && make install && \ 16 | cd .. 
&& rm -r samtools-1.17* 17 | 18 | COPY danbing-tk/script/danbing.call.py danbing-tk/script/bubblecalling.py danbing-tk/script/kmerutils.py /usr/local/bin/ 19 | 20 | COPY danbing-tk ./danbing-tk 21 | 22 | RUN cd danbing-tk && mkdir -p bin && \ 23 | g++ -std=c++11 -pthread -I ./cereal/include -I ./Eigen -O2 -o bin/danbing-tk src/aQueryFasta_thread.cpp && \ 24 | cp bin/* /usr/local/bin/ && \ 25 | cd .. && rm -rf danbing-tk 26 | 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, ChaissonLab 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PREFIX ?=. 2 | TARGETS = bin/danbing-tk bin/fa2kmers bin/genPanKmers bin/ktools bin/danbing-tk-pred bin/baitBuilder bin/sim_reads 3 | TARGETSg = bin/danbing-tk_g bin/fa2kmers_g bin/genPanKmers_g bin/ktools_g bin/danbing-tk-pred_g bin/baitBuilder_g 4 | #TARGETS = seq2num num2seq rvseq bam2pe 5 | 6 | CXX ?= g++ 7 | LDLIBS = -pthread 8 | dir_guard = @mkdir -p $(@D) 9 | INC=$(PREFIX)/include 10 | CPPFLAGS = -std=c++11 -I $(INC) -I $(INC)/eigen3 11 | INCFILES = $(INC)/eigen3 12 | 13 | 14 | all: $(INCFILES) $(TARGETS) 15 | allg: $(INCFILES) $(TARGETS) $(TARGETSg) 16 | 17 | 18 | # copy INCLUDE files to ./include 19 | $(INCFILES): 20 | mkdir -p $(INC)/eigen3 21 | if [ ! 
-f $(INC)/eigen3 ]; then cp -r Eigen/Eigen $(INC)/eigen3; fi 22 | 23 | # dependencies between programs and .o files 24 | bin/danbing-tk: src/aQueryFasta_thread.cpp src/aQueryFasta_thread.h src/kmerIO.hpp src/binaryKmerIO.hpp 25 | $(dir_guard) 26 | $(CXX) $(LDLIBS) $(CPPFLAGS) -O3 -o bin/danbing-tk src/aQueryFasta_thread.cpp 27 | 28 | bin/danbing-tk_g: src/aQueryFasta_thread.cpp src/aQueryFasta_thread.h src/kmerIO.hpp src/binaryKmerIO.hpp 29 | $(dir_guard) 30 | $(CXX) $(LDLIBS) $(CPPFLAGS) -g -o bin/danbing-tk_g src/aQueryFasta_thread.cpp 31 | 32 | bin/danbing-tk-pred: src/pred.cpp 33 | $(dir_guard) 34 | $(CXX) $(CPPFLAGS) -O3 -o bin/danbing-tk-pred src/pred.cpp 35 | 36 | bin/danbing-tk-pred_g: src/pred.cpp 37 | $(dir_guard) 38 | $(CXX) $(CPPFLAGS) -g -o bin/danbing-tk-pred_g src/pred.cpp 39 | 40 | bin/ktools: src/kmertools.cpp src/kmerIO.hpp src/binaryKmerIO.hpp 41 | $(dir_guard) 42 | $(CXX) $(CPPFLAGS) -O3 -o bin/ktools src/kmertools.cpp 43 | 44 | bin/ktools_g: src/kmertools.cpp src/kmerIO.hpp src/binaryKmerIO.hpp 45 | $(dir_guard) 46 | $(CXX) $(CPPFLAGS) -g -o bin/ktools_g src/kmertools.cpp 47 | 48 | bin/fa2kmers: src/fa2kmers.cpp 49 | $(dir_guard) 50 | $(CXX) $(CPPFLAGS) -O3 -o bin/fa2kmers src/fa2kmers.cpp 51 | 52 | bin/fa2kmers_g: src/fa2kmers.cpp 53 | $(dir_guard) 54 | $(CXX) $(CPPFLAGS) -g -o bin/fa2kmers_g src/fa2kmers.cpp 55 | 56 | bin/genPanKmers: src/genPanKmers.cpp 57 | $(dir_guard) 58 | $(CXX) $(CPPFLAGS) -O3 -o bin/genPanKmers src/genPanKmers.cpp 59 | 60 | bin/genPanKmers_g: src/genPanKmers.cpp 61 | $(dir_guard) 62 | $(CXX) $(CPPFLAGS) -g -o bin/genPanKmers_g src/genPanKmers.cpp 63 | 64 | bin/baitBuilder: src/bait.cpp 65 | $(dir_guard) 66 | $(CXX) $(CPPFLAGS) -O3 -o bin/baitBuilder src/bait.cpp 67 | 68 | bin/baitBuilder_g: src/bait.cpp 69 | $(dir_guard) 70 | $(CXX) $(CPPFLAGS) -g -o bin/baitBuilder_g src/bait.cpp 71 | 72 | bin/sim_reads: src/sim_reads.cpp 73 | $(dir_guard) 74 | $(CXX) $(CPPFLAGS) -O3 -o bin/sim_reads src/sim_reads.cpp 75 | 76 | bin/seq2num: src/seq2num.cpp 77 | $(dir_guard) 78 | $(CXX) -O3 -o bin/seq2num src/seq2num.cpp 79 | 80 | bin/num2seq: src/num2seq.cpp 81 | $(dir_guard) 82 | $(CXX) -O3 -o bin/num2seq src/num2seq.cpp 83 | 84 | bin/rvseq: src/rvseq.cpp 85 | $(dir_guard) 86 | $(CXX) -O3 -o bin/rvseq src/rvseq.cpp 87 | 88 | bin/bam2pe: src/bam2pe.cpp 89 | $(dir_guard) 90 | $(CXX) -O3 -o bin/bam2pe src/bam2pe.cpp 91 | 92 | # 93 | # generic build rules 94 | # 95 | 96 | #$(TARGETS): 97 | # $(CXX) $^ -o $@ 98 | 99 | #.PHONY: install 100 | install: $(TARGETS) 101 | mkdir -p $(PREFIX)/bin 102 | cp $^ $(PREFIX)/bin/. 
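# Example usage (illustrative sketch; the -j level and install prefix below are arbitrary choices, not requirements):
#   make -j 4                              # copy Eigen headers from the submodule into ./include/eigen3 and build optimized binaries into ./bin
#   make install PREFIX=/path/to/install   # copy the built binaries into /path/to/install/bin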
103 | 104 | #uninstall: $(TARGETS) 105 | # for TARGET in $(TARGETS); do \ 106 | # rm $(PREFIX)/bin/$$TARGET; \ 107 | # done 108 | 109 | clean: 110 | rm -f *.o *~ $(TARGETS) 111 | 112 | -------------------------------------------------------------------------------- /gcp.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y --no-install-recommends apt-transport-https ca-certificates gnupg curl sudo && \ 5 | echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ 6 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ 7 | apt-get update && \ 8 | apt-get install -y --no-install-recommends google-cloud-cli libz-dev libncurses5-dev libbz2-dev liblzma-dev libssl-dev make gcc g++ autoconf && \ 9 | apt-get clean && rm -rf /var/lib/apt/lists/* 10 | 11 | WORKDIR /opt 12 | 13 | COPY danbing-tk ./danbing-tk 14 | 15 | COPY samtools-1.17.tar.bz2 . 16 | 17 | RUN bunzip2 samtools-1.17.tar.bz2 && \ 18 | tar xvf samtools-1.17.tar && \ 19 | cd samtools-1.17 && \ 20 | ./configure && make -j 8 && make install && \ 21 | cd .. && rm -r samtools-1.17* 22 | 23 | RUN cd danbing-tk && \ 24 | make -j 5 && \ 25 | cp bin/* /usr/local/bin/ && \ 26 | cd .. && rm -rf danbing-tk 27 | -------------------------------------------------------------------------------- /pipeline/RefGraph.snakefile: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | configfile: "refGraph.json" 5 | 6 | srcdir = config["srcDir"] 7 | indir = config["inputDir"] 8 | outdir = config["outputDir"] 9 | pdir = config["pangenomeDir"] 10 | 11 | genomefile = config["genomefile"] 12 | genomes = np.loadtxt(genomefile, dtype=object).reshape(-1).tolist() 13 | ref = config["ref"] 14 | refTR = config["refTR"] 15 | refsize = config["refsize"] 16 | kmerTypes = ["tr", "ntr", "graph"] 17 | 18 | ksize = config["ksize"] 19 | FS = config["flankSize"] 20 | cth = config["countThreashold"] 21 | rth = config["ratioThreashold"] 22 | rstring = f'{rth*100:.0f}' 23 | thcth = config["threadingCountThreshold"] 24 | LB = config["sizeLowerBound"] 25 | TRwindow = config["TRwindow"] 26 | mbe_th1 = float(config["MBE_th1"]) 27 | mbe_th2 = float(config["MBE_th2"]) 28 | copts = config["clusterOpts"] 29 | 30 | 31 | localrules: all, GenMap_v0_v2 32 | 33 | rule all: 34 | input: 35 | MBEfoo = outdir + "MBE.foo", 36 | refkmers = expand(outdir + "hg38.{kmerType}.kmers", kmerType=kmerTypes), 37 | #TRfa = outdir + "ref.tr.fasta", 38 | #TRbed = outdir + "ref.bed", 39 | #panbed = outdir + "pan.tr.bed", 40 | #mapping = outdir + "refMap.tbl", 41 | #pankmers = expand(outdir + "pan.{kmerType}.kmers", kmerType=kmerTypes), 42 | #panILkmers = expand(outdir + "pan.{genome}.IL.tr.kmers", genome=genomes), 43 | #pred = expand(outdir + "{genome}.LR.pred", genome=genomes), 44 | 45 | 46 | rule JointTRAnnotation: 47 | input: 48 | fa = ref, 49 | output: 50 | foo = outdir + "MBE.foo", 51 | TRfa = outdir + "hg38.tr.fasta", 52 | #mapping = outdir + "OrthoMap.v2.tsv", 53 | #TRfa = outdir + "ref.tr.fasta", 54 | #TRbed = outdir + "ref.bed", 55 | resources: 56 | cores = 6, 57 | mem = lambda wildcards, attempt: 40 + 20*(attempt-1) 58 | priority: 96 59 | params: 60 | copts = copts, 61 | sd = srcdir, 62 | od = outdir, 63 | refTR = refTR, 
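        # refTR is the reference VNTR BED; the shell block below copies it into the per-haplotype
        # tmp1.{0,1}.bed files and into pan.tr.mbe.v0.bed before multi-boundary expansion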
64 | ksize = ksize, 65 | FS = FS, 66 | LB = LB, 67 | TRwindow = TRwindow, 68 | th1 = mbe_th1, 69 | th2 = mbe_th2, 70 | #gf = genomefile, 71 | #genomes = genomes 72 | shell:""" 73 | set -eu 74 | ulimit -c 20000 75 | cd {params.od} 76 | 77 | g=hg38 78 | ln -sf {input.fa} $g.0.fa 79 | ln -sf {input.fa}.fai $g.0.fa.fai 80 | ln -sf $g.0.fa $g.1.fa 81 | ln -sf $g.0.fa.fai $g.1.fa.fai 82 | mkdir -p $g 83 | awk 'BEGIN {{OFS="\t"}} {{print $0, ".", ".", ".", "+"}}' {params.refTR} >$g/tmp1.0.bed 84 | cp $g/tmp1.0.bed $g/tmp1.1.bed 85 | 86 | echo "Generating panbed" 87 | awk 'BEGIN {{OFS="\t"}} {{print $1, $2, $3, 1}}' {params.refTR} >pan.tr.mbe.v0.bed 88 | {params.sd}/script/preMBE.py <(echo "hg38") pan.tr.mbe.v0.bed 89 | {params.sd}/script/multiBoundaryExpansion.py 90 | {params.sd}/script/writeMBEbed.py {params.th1} {params.th2} 91 | hi=0 92 | for h in 0 1; do 93 | echo ">""$g"".""$h" 94 | cut -f $((4+4*hi))-$((6+4*hi)) pan.tr.mbe.v1.bed | 95 | awk 'BEGIN {{OFS="\t"}} {{print $0, NR-1}}' | 96 | grep -v "None" | 97 | sort -k1,1 -k2,2n -k3,3n >tmp.bed 98 | if [[ "$(cat tmp.bed | wc -l)" != "0" ]]; then 99 | bedtools merge -d 700 -c 4 -o collapse -i tmp.bed | 100 | cut -f 4 | grep "," 101 | fi 102 | ((++hi)) 103 | done >mbe.m0.loci 104 | rm tmp.bed 105 | {params.sd}/script/mergeMBEbed.py 106 | 107 | ### write fasta 108 | echo "Fetching TR+flank" 109 | h=0 110 | cut -f 4-6 pan.tr.mbe.v2.bed | 111 | grep -v "None" | 112 | awk 'BEGIN {{OFS="\t"}} {{ 113 | $2=$2-700 114 | $3=$3+700 115 | print $0 116 | }}' | 117 | {params.sd}/script/SelectRegions.py /dev/stdin "$g"."$h".fa /dev/stdout | 118 | awk '{{if ($1 ~ />/) {{print}} else {{print toupper($0)}} }}' >"$g".tr.fasta 119 | rm hg38.?.fa* 120 | touch MBE.foo 121 | """ 122 | 123 | 124 | rule GenMap_v0_v2: 125 | input: 126 | foo = outdir + "MBE.foo", 127 | output: 128 | mapping = outdir + "locusMap.v0.to.v2.txt", 129 | resources: 130 | cores = 1, 131 | mem = lambda wildcards, attempt: 4, 132 | params: 133 | copts = copts, 134 | sd = srcdir, 135 | od = outdir, 136 | ksize = ksize, 137 | FS = FS, 138 | graph = "hg38" 139 | run: 140 | import numpy as np 141 | nloci = np.loadtxt(f"{params.od}/pan.tr.mbe.v0.bed", usecols=1).size 142 | m21 = np.loadtxt(f"{params.od}/locusMap.v2.to.v1.txt", dtype=int) 143 | m10 = np.loadtxt(f"{params.od}/locusMap.v1.to.v0.txt", dtype=int) 144 | m02 = np.full(nloci, ".", dtype=object) 145 | m02[m10[m21]] = np.arange(m21.size) 146 | np.savetxt(f"{params.od}/locusMap.v0.to.v2.txt", m02, fmt='%s') 147 | 148 | 149 | rule GenRefGraph: 150 | input: 151 | TRfa = outdir + "hg38.tr.fasta", 152 | mapping = outdir + "locusMap.v0.to.v2.txt", 153 | output: 154 | refkmers = expand(outdir + "hg38.{kmerType}.kmers", kmerType=kmerTypes), 155 | resources: 156 | cores = 1, 157 | mem = lambda wildcards, attempt: 8*attempt, 158 | params: 159 | copts = copts, 160 | sd = srcdir, 161 | od = outdir, 162 | ksize = ksize, 163 | FS = FS, 164 | graph = "hg38" 165 | shell:""" 166 | set -eu 167 | ulimit -c 20000 168 | cd {params.od} 169 | 170 | 171 | {params.sd}/bin/vntr2kmers_thread -g -m {input.mapping} -k {params.ksize} -fs {params.FS} -ntr {params.FS} -o {params.graph} -fa 1 {input.TRfa} 172 | """ 173 | 174 | 175 | rule GenPanGraph: 176 | input: 177 | #TRbed = outdir + "hg38.bed", 178 | refkmers = expand(outdir + "hg38.{kmerType}.kmers", kmerType=kmerTypes), 179 | output: 180 | panbed = outdir + "pan.tr.bed", 181 | mapping = outdir + "refMap.tbl", 182 | pankmers = expand(outdir + "pan.{kmerType}.kmers", kmerType=kmerTypes), 183 | resources: 184 | cores = 1, 
185 | mem = lambda wildcards, attempt: 8*attempt, 186 | params: 187 | copts = copts, 188 | sd = srcdir, 189 | od = outdir, 190 | refTR = config["refTR"], 191 | kmerpref = f'{outdir}/ref', 192 | shell:""" 193 | cd {params.od} 194 | ulimit -c 20000 195 | 196 | awk 'BEGIN {{OFS="\t"}} {{$4=$4"\t"(NR-1); print $0}}' {params.refTR} > {output.panbed} 197 | bedtools map -c 4 -o collapse -a {output.panbed} -b <(awk 'BEGIN {{OFS="\t"}} {{print $4, $5, $6, NR-1}}' {input.TRbed}) > {output.panbed}.tmp 198 | mv {output.panbed}.tmp {output.panbed} 199 | 200 | cut -f 6- {output.panbed} > {output.mapping} 201 | 202 | {params.sd}/bin/genPanKmers -o pan -m {output.mapping} -k {params.kmerpref} 203 | """ 204 | 205 | 206 | rule GenotypeSamples: 207 | input: 208 | pankmers = expand(outdir + "pan.{kmerType}.kmers", kmerType=kmerTypes), 209 | ILbam = indir + "{genome}.IL.srt.bam", 210 | ILbai = indir + "{genome}.IL.srt.bam.bai", 211 | output: 212 | ILkmers = outdir + "pan.{genome}.IL.tr.kmers", 213 | resources: 214 | cores = 24, 215 | mem = 40, 216 | params: 217 | copts = copts, 218 | sd = srcdir, 219 | od = outdir, 220 | ksize = ksize, 221 | FS = FS, 222 | cth = cth, 223 | rth = rth, 224 | rstring = rstring, 225 | thcth = thcth, 226 | graph = "pan", 227 | shell:""" 228 | set -eu 229 | ulimit -c 20000 230 | cd {params.od} 231 | 232 | samtools fasta -@2 -n {input.ILbam} | 233 | awk '{{if (substr($1,1,1) == ">") {{ 234 | if (substr($1,length($1)-1,1) == "/") {{ print substr($1, 1, length($1)-2) }} else {{ print $1 }} }} 235 | else {{ print $1 }} 236 | }}' | 237 | {params.sd}/bin/bam2pe -k {params.ksize} -fai /dev/stdin | 238 | {params.sd}/bin/aQueryFasta_thread -g {params.thcth} -k {params.ksize} -qs {params.od}/{params.graph} -fai /dev/stdin -o {params.graph}.{wildcards.genome}.IL -p {resources.cores} -cth {params.cth} -rth {params.rth} 239 | """ 240 | 241 | 242 | rule EvalRefGraph: 243 | input: 244 | mapping = outdir + "locusMap.tbl", 245 | PBkmers = pdir + "{genome}.PB.tr.kmers", 246 | panILkmers = outdir + "pan.{genome}.IL.tr.kmers" 247 | output: 248 | PBkmers = indir + "{genome}.PB.tr.kmers", 249 | mappedILkmers = outdir + "{genome}.mappedIL.tr.kmers", 250 | pred = outdir + "{genome}.LR.pred", 251 | resources: 252 | cores = 12, 253 | mem = 8, 254 | params: 255 | copts = copts, 256 | sd = srcdir, 257 | od = outdir, 258 | indir = indir, 259 | gi = lambda wildcards: genomes.index(wildcards.genome), 260 | shell:""" 261 | set -eu 262 | ulimit -c 20000 263 | 264 | cd {params.indir} 265 | ln -s {input.PBkmers} . 
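        # link the PB (assembly) kmer file from the pangenome directory into inputDir,
        # so it exists at the path declared as output.PBkmers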
266 | 267 | cd {params.od} 268 | {params.sd}/bin/mapkmers {input.mapping} {params.gi} {input.panILkmers} {input.PBkmers} {wildcards.genome}.mappedIL.tr 269 | {params.sd}/script/kmers.linreg.py --mode invalid --R2threshold -2 {input.PBkmers} {output.mappedILkmers} {wildcards.genome}.LR 270 | """ 271 | -------------------------------------------------------------------------------- /pipeline/goodPanGenomeGraph.json: -------------------------------------------------------------------------------- 1 | { 2 | "srcDir" : "/project/mchaisso_100/cmb-16/tsungyul/work/vntr/danbing-tk/", 3 | "inputDir" : "/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_trf_len100/hprc/input/", 4 | "outputDir" : "/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_trf_len100/hprc/output/", 5 | "pairs" : "/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc_ref/input/genome.bam.tsv", 6 | "AsmAligner" : "minimap2", 7 | "ksize" : 21, 8 | "flankSize" : 700, 9 | "dist_merge" : 700, 10 | "dist_scan" : 700, 11 | "countThreashold" : 45, 12 | "ratioThreashold" : 0.5, 13 | "threadingCountThreshold" : 50, 14 | "sizeLowerBound" : 50, 15 | "TRwindow" : 100000, 16 | "MBE_th1" : 0.3, 17 | "MBE_th2" : 0.6, 18 | "pruning" : "False", 19 | "ref" : "/project/mchaisso_100/cmb-16/tsungyul/work/vntr/datasets/reference/hg38_noalts/hg38.no_alts.fasta", 20 | "refTR" : "/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_trf_len100/hprc/input/trf.vntr.len_100.merge.primary.non_cen.bed", 21 | "refctrl" : "/project/mchaisso_100/cmb-16/tsungyul/work/vntr/hapdb/a1_regions/ctrl/pan.fn2.bed", 22 | "clusterOpts" : "sbatch --time=48:00:00 --partition=qcb --account=mchaisso_100 -N 1" 23 | } 24 | -------------------------------------------------------------------------------- /pipeline/k17.json: -------------------------------------------------------------------------------- 1 | { 2 | "inputDir" : /home/cmb-17/mjc/vntr_genotyping/k17/input/, 3 | "outputDir" : /staging/mjc/tsungyul/k17/, 4 | "genomes" : /home/cmb-17/mjc/vntr_genotyping/goodPanGenomeGraph/input/genomes.txt, 5 | "ksize" : 17, 6 | "flankSize" : 700, 7 | "countThreashold" : 45, 8 | "ratioThreashold" : 0.5, 9 | "threadingCountThreshold" : 50, 10 | "sizeLowerBound" : 50, 11 | "sizeUpperBound" : 5000, 12 | "TRwindow" : 10000, 13 | "ref" : /home/cmb-16/mjc/tsungyul/work/vntr/datasets/reference/hg38_noalts/hg38.no_alts.fasta, 14 | "refTR" : /home/cmb-17/mjc/vntr_genotyping/k17/input/tr.good.bed, 15 | "refctrl" : /home/cmb-17/mjc/vntr_genotyping/cmb-16/work/vntr/hapdb/a1_regions/ctrl/pan.fn2.bed, 16 | "clusterOpts" : "sbatch --time=48:00:00 --partition=cmb" 17 | } 18 | -------------------------------------------------------------------------------- /pipeline/k25.json: -------------------------------------------------------------------------------- 1 | { 2 | "inputDir" : /home/cmb-17/mjc/vntr_genotyping/k25/input/, 3 | "outputDir" : /staging/mjc/tsungyul/k25/, 4 | "genomes" : /home/cmb-17/mjc/vntr_genotyping/goodPanGenomeGraph/input/genomes.txt, 5 | "ksize" : 25, 6 | "flankSize" : 700, 7 | "countThreashold" : 45, 8 | "ratioThreashold" : 0.5, 9 | "threadingCountThreshold" : 50, 10 | "sizeLowerBound" : 50, 11 | "sizeUpperBound" : 5000, 12 | "TRwindow" : 10000, 13 | "ref" : /home/cmb-16/mjc/tsungyul/work/vntr/datasets/reference/hg38_noalts/hg38.no_alts.fasta, 14 | "refTR" : /home/cmb-17/mjc/vntr_genotyping/k25/input/tr.good.bed, 15 | "refctrl" : /home/cmb-17/mjc/vntr_genotyping/cmb-16/work/vntr/hapdb/a1_regions/ctrl/pan.fn2.bed, 16 | "clusterOpts" : "sbatch --time=48:00:00 
--partition=cmb" 17 | } 18 | -------------------------------------------------------------------------------- /pipeline/leaveOneOut.json: -------------------------------------------------------------------------------- 1 | { 2 | "outputDir" : /scratch2/tsungyul/rpgg_k21_29k/LOO_v1/output/, 3 | "pangenomeDir" : /home/cmb-17/mjc/vntr_genotyping/rpgg_k21_29k/input_v1/, 4 | "genomefile" : /home/cmb-17/mjc/vntr_genotyping/rpgg_k21_29k/input/genomes.txt, 5 | "LOOgenomefile" : /home/cmb-17/mjc/vntr_genotyping/rpgg_k21_84k/LOO_v1/input/LOO.genomes.txt, 6 | "pairs" : /home/cmb-17/mjc/vntr_genotyping/rpgg_k21_29k/input/genome.bam.tsv, 7 | "ksize" : 21, 8 | "countThreashold" : 45, 9 | "ratioThreashold" : 0.5, 10 | "threadingCountThreshold" : 50, 11 | "clusterOpts" : "sbatch --time=120:00:00 --partition=cmb", 12 | "covbed" : /home/cmb-16/mjc/tsungyul/work/vntr/hapdb/a3_r2ok/pan_prune/v2_1/ctrlbam/input/pan.fn2.bed, 13 | "cov" : /home/cmb-17/mjc/vntr_genotyping/analysis/read_depth/ctrl/19g.cov.tsv, 14 | "LOOconf" : "1111011011111111011", 15 | "sampleConf" : "3100000022222100013", 16 | "badgenome" : "HG04217" 17 | } 18 | -------------------------------------------------------------------------------- /pipeline/refGraph.json: -------------------------------------------------------------------------------- 1 | { 2 | "srcDir" : /home/cmb-16/mjc/tsungyul/work/vntr/danbing-tk/, 3 | "inputDir" : /home/cmb-17/mjc/vntr_genotyping/rpgg_k21_29k/refGraph_v1/input/, 4 | "outputDir" : /home/cmb-17/mjc/vntr_genotyping/rpgg_k21_29k/refGraph_v1/output/, 5 | "pangenomeDir" : /home/cmb-17/mjc/vntr_genotyping/rpgg_k21_29k/output_v1/, 6 | "pairs" : /home/cmb-17/mjc/vntr_genotyping/rpgg_k21_29k/input/genome.bam.tsv, 7 | "genomefile" : /home/cmb-17/mjc/vntr_genotyping/rpgg_k21_29k/input/genomes.txt, 8 | "ksize" : 21, 9 | "flankSize" : 700, 10 | "countThreashold" : 45, 11 | "ratioThreashold" : 0.5, 12 | "threadingCountThreshold" : 50, 13 | "sizeLowerBound" : 50, 14 | "TRwindow" : 10000, 15 | "MBE_th1" : 0.2, 16 | "MBE_th2" : 0.8, 17 | "ref" : /home/cmb-16/mjc/tsungyul/work/vntr/datasets/reference/hg38_noalts/hg38.no_alts.fasta, 18 | "refsize" : /panfs/qcb-panasas/tsungyul/hg38_noalts/hg38.no_alts.chrSize, 19 | "refTR" : /home/cmb-17/mjc/vntr_genotyping/rpgg_k21_29k/input_v1/good.tr.bed, 20 | "clusterOpts" : "sbatch --time=240:00:00 --partition=cmb" 21 | } 22 | -------------------------------------------------------------------------------- /script/PlotRegression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import vntrutils as vu 5 | import numpy as np 6 | 7 | #from scipy.optimize import minimize 8 | #from scipy.misc import factorial 9 | 10 | ap = argparse.ArgumentParser(description="plot the correlation given two files") 11 | ap.add_argument("xFile", help="x as a single column of data") 12 | ap.add_argument("yFile", help="y as a single column of data") 13 | args = ap.parse_args() 14 | 15 | x = np.loadtxt(args.xFile).reshape(-1,1) 16 | y = np.loadtxt(args.yFile).reshape(-1,1) 17 | 18 | vu.PlotRegression(x, y, args.xFile, args.yFile, fname=args.xFile+"."+args.yFile) 19 | -------------------------------------------------------------------------------- /script/SelectRegions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import vntrutils as vu 4 | 5 | if len(sys.argv) != 4: 6 | print("usage: program region.bed fasta region.fasta") 7 | sys.exit(1) 8 | 9 | 
bedFile=open(sys.argv[1])
10 | fa = vu.Fasta(sys.argv[2])
11 | outFile=open(sys.argv[3],'w')
12 | 
13 | 
14 | for line in bedFile:
15 |     vals = line.split()
16 |     outFile.write(">"+":".join(vals)+"\n")
17 |     if vals[0] == "NA": # NF does not have to be 6
18 |         continue
19 |     elif int(vals[1]) > int(vals[2]) or int(vals[1]) < 0 or int(vals[2]) < 0: # XXX use chrsize info, instead of only checking start pos
20 |         # XXX print empty seq or not
21 |         print("valError:\t", vals, file=sys.stderr)
22 |         continue
23 |     else:
24 |         seq = fa.get_seq(vals[0], int(vals[1]), int(vals[2])).upper()
25 |         outFile.write(seq + "\n")
26 | fa.close()
27 | 
28 | 
29 | 
30 | 
--------------------------------------------------------------------------------
/script/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChaissonLab/danbing-tk/d29ade3963cedd82d54ed2fe983d3227cbd45e39/script/__init__.py
--------------------------------------------------------------------------------
/script/binaryIO.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def load_kdb(fn):
4 |     """
5 |     Output: TR kmer/index array
6 |     Input file format:
7 |         bytes    content
8 |         8        index array size: NTR
9 |         NTR*8    index array: # of kmers in each locus
10 |         8        kmer array size: NK
11 |         NK*8     kmer array: kmers in database
12 |     """
13 |     with open(fn, 'rb') as f:
14 |         NTR = int.from_bytes(f.read(8), byteorder='little')
15 |         index = np.frombuffer(f.read(8*NTR), dtype=int)
16 |         NK = int.from_bytes(f.read(8), byteorder='little')
17 |         ks = np.frombuffer(f.read(8*NK), dtype=int)
18 |         print(f"index, ks size = {index.size}, {ks.size}")
19 |         return index, ks
20 | 
21 | def load_kmdb(fn):
22 |     """
23 |     Output: kmer map database
24 |     Input file format:
25 |         bytes    content
26 |         8        index array size: NTR
27 |         NTR*8    index array: # of kmers in each locus
28 |         8        kmer array size: NK
29 |         NK*8     kmer array: kmers in database
30 |         NK*8     val array: associated value per kmer
31 |     """
32 |     with open(fn, 'rb') as f:
33 |         NTR = int.from_bytes(f.read(8), byteorder='little')
34 |         index = np.frombuffer(f.read(8*NTR), dtype=int)
35 |         NK = int.from_bytes(f.read(8), byteorder='little')
36 |         ks = np.frombuffer(f.read(8*NK), dtype=int)
37 |         vs = np.frombuffer(f.read(8*NK), dtype=int)
38 |         print(f"index, ks, vs size = {index.size}, {ks.size}, {vs.size}")
39 |         return index, ks, vs
40 | 
41 | def load_Karray(fn):
42 |     """
43 |     Output: kmer array
44 |     Input file format:
45 |         bytes    content
46 |         8        array size: NK
47 |         NK*8     array
48 |     """
49 |     with open(fn, 'rb') as f:
50 |         NK = int.from_bytes(f.read(8), byteorder='little')
51 |         ks = np.frombuffer(f.read(8*NK), dtype=int)
52 |         print(f"ks size = {ks.size}")
53 |         return ks
54 | 
55 | def reconstruct_kmdb(index, ks, vs, imax=None):
56 |     out = {}
57 |     si, ei = 0, 0
58 |     NTR = index.size
59 |     if imax is None: imax = NTR
60 |     for tri in range(imax):
61 |         k2v = {}
62 |         nk = index[tri]
63 |         ei += nk
64 |         for k, v in zip(ks[si:ei], vs[si:ei]):
65 |             k2v[k] = v
66 |         out[tri] = k2v
67 |         si = ei
68 |     return out
69 | 
70 | def reconstruct_ksdb(index, ks, imax=None):
71 |     NTR = index.size
72 |     out = np.empty(NTR, dtype=object)
73 |     si, ei = 0, 0
74 |     if imax is None: imax = NTR
75 |     for tri in range(imax):
76 |         ei += index[tri]
77 |         out[tri] = set()
78 |         for k in ks[si:ei]:
79 |             out[tri].add(k)
80 |         si = ei
81 |     return out
82 | 
--------------------------------------------------------------------------------
/script/bubblecalling.py:
-------------------------------------------------------------------------------- 1 | from kmerutils import getRCkmer, read2kmers, decodeNumericString 2 | 3 | def e2ce(e): 4 | er = getRCkmer(e,22) 5 | return min(e, er) 6 | 7 | def k2ck(k): 8 | kr = getRCkmer(k,21) 9 | return min(k, kr) 10 | 11 | class Edge: 12 | def __init__(self, edge, parent, child): 13 | self.e = edge 14 | self.p = parent 15 | self.c = child 16 | self.a = False # isalive 17 | self.ue = None # upstream edge 18 | self.de = [] # downstream edge(s) 19 | def __str__(self): 20 | return f"{self.e} {self.p} {self.c} {self.a} up: {self.ue.e if self.ue else None} down: {[e.e for e in self.de]}" 21 | 22 | class Cyclic_DFS: 23 | def __init__(self): 24 | self.q = [] # queue 25 | self.g = set() # growing nodes 26 | self.sni2nx = [] # [(nodex0, edgex0), ...] 27 | self.sni2n = [] # [set([node0, ...]), ...] 28 | self.sni2e = [] # [[e0, ...], ...] 29 | self.n2sni = {} # {node0:supernode_id, ...} 30 | 31 | def add(self, e0, e1s): 32 | for e1 in e1s: 33 | e0.de.append(e1) 34 | e1.ue = e0 35 | 36 | def prune(self, dead, e): 37 | # backtrack until last branching node 38 | pruned = set() 39 | while len(e.de) < 2 and e.e is not None: 40 | pruned.add(e.c) 41 | e_ = e 42 | e = e.ue 43 | if e.e is not None: # not the root edge 44 | e.de.remove(e_) 45 | e_.ue = None 46 | dead |= pruned 47 | self.g -= pruned 48 | return e 49 | 50 | def remove_supernode(self, sni): 51 | for n in self.sni2n[sni]: 52 | self.n2sni.pop(n) 53 | self.sni2nx.pop(sni) 54 | self.sni2n.pop(sni) 55 | self.sni2e.pop(sni) 56 | 57 | def make_alive(self, alive, alive_e, e): 58 | # bacaktrack until an alive edge 59 | survived = set() 60 | while True: 61 | if e.e is None: break # root edge 62 | if e.a: break 63 | if e.p in self.n2sni: # pa is in a supernode 64 | sni = self.n2sni[e.p] 65 | nodex, edgex = self.sni2nx[sni] 66 | survived |= self.sni2n[sni] 67 | for e_ in self.sni2e[sni]: 68 | alive_e.add(e_.e) 69 | e.a = True 70 | self.remove_supernode(sni) 71 | e = edgex 72 | else: 73 | survived.add(e.p) 74 | alive_e.add(e.e) 75 | e.a = True 76 | e = e.ue 77 | alive |= survived 78 | self.g -= survived 79 | return self.q[-1].ue if self.q else None 80 | 81 | def merge(self, e): 82 | if e.c in self.n2sni: 83 | sni = self.n2sni[e.c] 84 | nodex, _ = self.sni2nx[sni] 85 | else: 86 | nodex = e.c 87 | 88 | # backtrack until nodex 89 | sn = set([e.p, e.c]) 90 | se = [e] 91 | usni = set([self.n2sni[e.p]]) if e.p in self.n2sni else set() 92 | npa = self.q[-1].p if self.q else None # next pa to start dfs 93 | found = e if e.c == npa else False 94 | while e.p != nodex: 95 | e = e.ue 96 | if e.e is None: assert False 97 | if e.c == npa: 98 | found = e 99 | if e.p in self.n2sni: 100 | sni = self.n2sni[e.p] 101 | usni.add(sni) 102 | else: 103 | sn.add(e.p) 104 | se.append(e) 105 | 106 | if usni: 107 | for sni in usni: 108 | sn |= self.sni2n[sni] 109 | se += self.sni2e[sni] 110 | self.sni2nx[sni] = None 111 | self.sni2n[sni] = None 112 | self.sni2e[sni] = None 113 | self.sni2nx.append((nodex, e.ue)) 114 | self.sni2n.append(sn) 115 | self.sni2e.append(se) 116 | sni = len(self.sni2nx) - 1 117 | for n in sn: 118 | self.n2sni[n] = sni 119 | 120 | return found if found else e 121 | 122 | def check_survival(self, dead, e0): 123 | ch = e0.c 124 | if ch not in self.n2sni: return None 125 | 126 | sni = self.n2sni[ch] 127 | nodex, _ = self.sni2nx[sni] 128 | if ch != nodex: return None 129 | 130 | e1s = e0.de 131 | isalive = any([e1.a for e1 in e1s]) 132 | e0.de = [] 133 | for e1 in e1s: 134 | e1.ue = None 135 | 
ns = self.sni2n[sni] 136 | dead |= ns 137 | self.g -= ns 138 | self.remove_supernode(sni) 139 | return self.prune(dead, e0) 140 | 141 | def check_edge_v1(gf, trks, ntrks, e, dfs, alive, alive_e, dead, verbose=False): 142 | """ 143 | return: isalive, bte 144 | is_alive: 145 | 0: dead 146 | 1: growing, non-terminal 147 | 2: growing, terminal, merged with existing growing branch 148 | 3: alive 149 | bte 150 | - backtrack edge, used to traverse upstream in search for dfs.q[-1].ue 151 | - if bte is None: dfs.q is empty 152 | - if bte == 0: growing path, no need to backtrack 153 | """ 154 | if e.p == e.c: # when it forms a self-loop 155 | if verbose: print("[X.homo]",end=" ") 156 | bte = dfs.prune(dead, e) 157 | return 0, bte 158 | 159 | if e.c in alive: # when it merges with an alive branch 160 | if verbose: print("[O.merge]", end=" ") 161 | bte = dfs.make_alive(alive, alive_e, e) 162 | return 3, bte 163 | if e.c in trks: # complete bubble 164 | if verbose: print("[O.tr]", end=" ") 165 | bte = dfs.make_alive(alive, alive_e, e) 166 | return 3, bte 167 | 168 | if e.c not in gf: # when it's a tip 169 | if verbose: print(f"[X.tip]",end=" ") 170 | dead.add(e.c) 171 | bte = dfs.prune(dead, e) 172 | return 0, bte 173 | if e.c in dead: # when it merges with a dead branch 174 | if verbose: print("[X.dead]",end=" ") 175 | bte = dfs.prune(dead, e) 176 | return 0, bte 177 | if e.c in ntrks: # when it reaches NTR 178 | if verbose: print("[X.NTR]",end=" ") 179 | bte = dfs.prune(dead, e) 180 | return 0, bte 181 | 182 | if e.c in dfs.g: # when it merges with a growing branch 183 | if verbose: print("[m.grow]",end=" ") 184 | bte = dfs.merge(e) 185 | return 2, bte 186 | else: # growing branch w/ unknown survival 187 | dfs.g.add(e.c) 188 | return 1, 0 189 | 190 | def decode_edges(gf, pa): 191 | out = gf[pa] 192 | es = [] 193 | mask = (1<<(2*21)) - 1 194 | pa_km1 = ((pa << 2) & mask) 195 | for i in range(4): 196 | if out % 2: 197 | ch = pa_km1 + i 198 | e = (pa << 2) + i 199 | es.append(Edge(e, pa, ch)) 200 | out >>= 1 201 | ne = len(es) 202 | return ne, es 203 | 204 | def es2bigf(es, k=22, bi=True): 205 | gf = {} 206 | for e in es: 207 | pa = e >> 2 208 | nt = e % 4 209 | if pa not in gf: 210 | gf[pa] = 2**nt 211 | else: 212 | gf[pa] |= 2**nt 213 | # make it bidirectional 214 | if bi: 215 | er = getRCkmer(e, k) 216 | par = er >> 2 217 | ntr = er % 4 218 | if par not in gf: 219 | gf[par] = 2**ntr 220 | else: 221 | gf[par] |= 2**ntr 222 | return gf 223 | 224 | def check_bubble_root_edge(rt, edge, gf, trks, ntrks, alive, dead): 225 | alive_e = set() 226 | dfs = Cyclic_DFS() 227 | dfs.q = [edge] 228 | dfs.add(rt, [edge]) 229 | while True: 230 | e0 = dfs.q.pop() 231 | isalive, bte = check_edge_v1(gf, trks, ntrks, e0, dfs, alive, alive_e, dead) 232 | while bte == 0: # growing path, no need to backtrack 233 | ne, e1s = decode_edges(gf, e0.c) 234 | dfs.add(e0, e1s) 235 | if ne > 1: 236 | for i in range(len(e1s)-1): 237 | dfs.q.append(e1s[i]) 238 | e0 = e1s[-1] 239 | isalive, bte = check_edge_v1(gf, trks, ntrks, e0, dfs, alive, alive_e, dead) 240 | 241 | # backtrack till dfs.q[-1].ue 242 | if not dfs.q: break 243 | npa = dfs.q[-1].p # next pa to start dfs 244 | while bte.c != npa: # done traversing the subtree of bte 245 | out = dfs.check_survival(dead, bte) # check nodex and survival 246 | if out is None: 247 | bte = bte.ue 248 | else: 249 | bte = out 250 | return alive_e 251 | 252 | -------------------------------------------------------------------------------- /script/chrsize.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | awk 'BEGIN {OFS="\t"} 4 | { 5 | if ($1 ~ />/) { 6 | if (size) {print name, size} 7 | name = substr($1,2,length($1)-1); size=0 8 | } 9 | else { size += length($1) } 10 | } 11 | END { 12 | print name, size 13 | }' $1 | sort -k1,1 14 | -------------------------------------------------------------------------------- /script/fixMaskedFasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # the original file has missing empty loci and unexpeted short loci (< 4200 bp) 4 | 5 | import argparse 6 | import csv 7 | ap = argparse.ArgumentParser(description="fix missing titles and splitted lines in a fasta file") 8 | ap.add_argument("th", help="length threshold for each locus", type=int) 9 | ap.add_argument("fi", help="suffix of corrupted file e.g. combined-hap.fasta.masked for HG00514.h0.combined-hap.fasta.masked") 10 | ap.add_argument("--fixTitle", help="fix missing titles as well, otherwise will fix splitted lines only", action='store_true') 11 | ap.add_argument("--useCSV", help="use CSV file to configure haps", action='store_true') 12 | ap.add_argument("--haps", help="specify hap to process") 13 | args = ap.parse_args() 14 | print(args) 15 | th = args.th 16 | 17 | fixInv = False # flag for fixing *.inv.fasta file 18 | if args.fi.split('.')[-2:] == ["inv", "fasta"]: 19 | fixInv = True 20 | 21 | # return next read title ending with '\n' 22 | def mergeSplittedLines(fi, fout): 23 | line = fi.readline() 24 | seq = "" 25 | while line[0] != ">": 26 | seq += line.rstrip() 27 | line = fi.readline() 28 | 29 | if not line: # end of file 30 | if len(seq) >= th: 31 | fout.write(seq + "\n") 32 | return "" 33 | 34 | if len(seq) >= th: 35 | fout.write(seq + "\n") 36 | 37 | return line 38 | 39 | 40 | 41 | # get haps 42 | haps = [] 43 | if args.useCSV: 44 | with open("haplotypes.csv", newline="") as f: 45 | for row in csv.reader(f): 46 | if row[0] != "Name": haps.append(row[0]) 47 | else: 48 | haps = args.haps.split() 49 | print(haps) 50 | 51 | 52 | # fix file format 53 | for hap in haps: 54 | print("processing ", hap) 55 | 56 | with open(hap+"."+args.fi+".fix" , 'w') as outf: 57 | with open(hap+"."+args.fi, 'r') as f2: # corrupted file 58 | ind = -1 59 | if not args.fixTitle: 60 | line2 = f2.readline() 61 | while line2: 62 | ind += 1 63 | outf.write(line2) 64 | line2 = mergeSplittedLines(f2, outf) 65 | 66 | else: 67 | with open(hap+".combined-hap.fasta", 'r') as f1: # reference 68 | line2 = f2.readline() 69 | 70 | for line1 in f1: 71 | if not line1: break 72 | if line1[0] == ">": 73 | ind += 1 74 | 75 | if fixInv: 76 | #print('/'.join(line2.split('_')[:3])) 77 | # if read title is found in reference, merge splitted lines of a read to a single read 78 | if line1.rstrip() == '/'.join(line2.split('_')[:3]): 79 | while line1.rstrip() == '/'.join(line2.split('_')[:3]): 80 | outf.write(line1) 81 | line2 = mergeSplittedLines(f2, outf) 82 | else: 83 | outf.write(line1) 84 | 85 | else: 86 | outf.write(line1) 87 | # if read title is found in reference, merge splitted lines of a read to a single read 88 | if line1.rstrip() == line2.split(':')[0]: 89 | line2 = mergeSplittedLines(f2, outf) 90 | 91 | print("finished at locus", ind) 92 | 93 | -------------------------------------------------------------------------------- /script/getCovByLocus.397.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
bash 2 | 3 | #refctrl=/home/cmb-17/mjc/vntr_genotyping/cmb-16/work/vntr/hapdb/a1_regions/ctrl/pan.fn0.bed.bak 4 | refctrl=/home/cmb-17/mjc/vntr_genotyping/cmb-16/work/vntr/hapdb/a1_regions/ctrl/pan.fn2.bed 5 | out=ctrl.397.cov 6 | datadir=/home/cmb-16/nobackups/mjc/data_download/phase3 7 | 8 | gi=0 9 | for cram in $datadir/*.cram; do 10 | g=$(basename $cram | cut -b 1-7) 11 | samtools bedcov $refctrl $cram | awk '{print $4/($3-$2)}' | tr '\n' '\t' | 12 | awk -v g=$g -v gi=$gi 'BEGIN {OFS="\t"} {$1=$1; print gi, g, $0}' 13 | ((++gi)) 14 | done > $out 15 | -------------------------------------------------------------------------------- /script/individualExpansion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import vntrutils as vu 5 | import numpy as np 6 | import pickle 7 | from multiprocessing import Pool, Manager 8 | 9 | 10 | def individualTRexpansion(seqs, poss, hap, locus, UB, ksize=21, zoomout=False, verbose=False): 11 | """plot loci w/ primary contamination only""" 12 | 13 | if locus not in poss[hap]: return False, False 14 | seq = seqs[hap][locus] 15 | pos = poss[hap][locus] 16 | 17 | start, end = pos 18 | TRsize = end - start 19 | kmers = vu.read2kmers(seq, ksize) 20 | kmersi = vu.getkmersindex(kmers) 21 | 22 | xs, ys, badfrom, badto = [], [], [], [] 23 | badkmc = np.zeros(2, dtype=int) 24 | newregion = pos 25 | for kmer, indices in kmersi.items(): 26 | if len(indices) > 1: 27 | 28 | # analyze contamination 29 | badkmc_, badindices = vu.getbadkmc_bothhaps(indices, indices, pos, pos, getindices=True) 30 | badkmc_ = badkmc_[:2] 31 | badindices = badindices[0] 32 | badkmc += badkmc_ 33 | 34 | # compute new region to remove contamination 35 | if badindices: 36 | newregion = min(*(badindices), newregion[0]), max(max(badindices)+1, newregion[1]) # half open at the end 37 | 38 | while np.sum(badkmc) and start - newregion[0] < UB and newregion[1] - end < UB: 39 | if verbose: print(newregion, badkmc) 40 | prevregion = newregion 41 | badfrom, badto = [], [] 42 | badkmc = np.zeros(2, dtype=int) 43 | for kmer, indices in kmersi.items(): 44 | if len(indices) > 1: 45 | # analyze contamination 46 | badkmc_, badindices = vu.getbadkmc_bothhaps(indices, indices, prevregion, prevregion, getindices=True) 47 | badkmc_ = badkmc_[:2] 48 | badindices = badindices[0] 49 | badkmc += badkmc_ 50 | 51 | # compute new region to remove contamination 52 | if badindices: 53 | newregion = min(*(badindices), newregion[0]), max(max(badindices)+1, newregion[1]) # half open at the end 54 | 55 | if pos==newregion: return False, False, newregion 56 | else: 57 | if not np.any(badkmc): return True, True, newregion 58 | else: return True, False, newregion 59 | 60 | 61 | def minibatch_both_hap_expansion(minibatch): 62 | [miniseqs, miniposs] = pickle.load(open("seqpos{}.pickle".format(minibatch), 'rb')) 63 | newposs = [{}, {}] 64 | stats = np.zeros((2,2), dtype=int) 65 | 66 | for hap in [0,1]: 67 | nexpanded, nresolved = 0, 0 68 | 69 | for locus in range(nloci): 70 | if locus % nprocess != minibatch: continue 71 | if locus % (nloci//100) == 0: 72 | print(".", end="", flush=True) 73 | if locus not in miniposs[0]: continue 74 | 75 | expanded, resolved, newposs[hap][locus] = individualTRexpansion(miniseqs, miniposs, hap, locus, TRWINDOW-FS, ksize=KSIZE) 76 | 77 | if expanded: 78 | nexpanded += 1 79 | nresolved += resolved 80 | 81 | stats[hap] = nexpanded, nresolved 82 | return stats, newposs 83 | 84 | 85 | if __name__ == "__main__": 86 | 
nprocess, KSIZE, FS, UB, TRWINDOW, nloci = [int(v) for v in sys.argv[1:7]] 87 | newposs = [{}, {}] 88 | stats = np.zeros((2,2), dtype=int) 89 | 90 | print("Running individualExpansion", end="") 91 | p = Pool(nprocess) 92 | results = p.map(minibatch_both_hap_expansion, list(range(nprocess))) 93 | p.close(); p.join() 94 | print() 95 | 96 | for i in range(nprocess): 97 | for hap in [0,1]: 98 | for k, v in results[i][1][hap].items(): 99 | newposs[hap][k] = v 100 | stats += results[i][0] 101 | print(stats) 102 | 103 | pickle.dump([stats, newposs], open("newposs_stat.pickle", 'wb')) 104 | -------------------------------------------------------------------------------- /script/jointExpansion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import vntrutils as vu 5 | import numpy as np 6 | import pickle 7 | from multiprocessing import Pool, Manager 8 | 9 | 10 | def jointTRexpansion(seqs, poss, oldposs, locus, UB, ksize=21, ax=None, verbose=0): 11 | """plot loci w/ primary contamination only""" 12 | 13 | oldpos0 = oldposs[0][locus] 14 | oldpos1 = oldposs[1][locus] 15 | ostart0, oend0 = oldpos0[0], oldpos0[1]-ksize+1 16 | ostart1, oend1 = oldpos1[0], oldpos1[1]-ksize+1 17 | pos0 = poss[0][locus] 18 | pos1 = poss[1][locus] 19 | start0, end0 = pos0[0], pos0[1]-ksize+1 20 | start1, end1 = pos1[0], pos1[1]-ksize+1 21 | 22 | ctg0 = seqs[0][locus] 23 | ctg1 = seqs[1][locus] 24 | kmers0 = vu.read2kmers(ctg0, ksize, rightflank=ksize-1) 25 | kmers1 = vu.read2kmers(ctg1, ksize, rightflank=ksize-1) 26 | cokmers = (set(kmers0) & set(kmers1)) - set([0xffffffffffffffff]) 27 | cokmersi = vu.getcokmersindex(cokmers, kmers0, kmers1) 28 | 29 | badkmc = np.zeros(4, dtype=int) # 0L, 0R, 1L, 1R 30 | newregion0, newregion1 = (start0, end0), (start1, end1) 31 | for kmer in cokmers: 32 | indices0 = cokmersi[0][kmer] 33 | indices1 = cokmersi[1][kmer] 34 | 35 | # analyze contamination 36 | badkmc_, badindices = vu.getbadkmc_bothhaps(indices0, indices1, newregion0, newregion1, getindices=True) 37 | badkmc += badkmc_ 38 | 39 | # compute new region to remove contamination 40 | if badindices[0]: 41 | newregion0 = min(*(badindices[0]), newregion0[0]), max(max(badindices[0])+1, newregion0[1]) # half open at the end 42 | newregion1 = min(*(badindices[1]), newregion1[0]), max(max(badindices[1])+1, newregion1[1]) # half open at the end 43 | if verbose >= 2: 44 | print(badkmc_, badindices) 45 | 46 | es0 = (ostart0 - newregion0[0], newregion0[1] - oend0) # expansionsize h0 47 | es1 = (ostart1 - newregion1[0], newregion1[1] - oend1) # expansionsize h1 48 | if verbose and np.any(badkmc): print(es0, es1, badkmc) 49 | while np.any(badkmc) and es0[0] < UB and es0[1] < UB and es1[0] < UB and es1[1] < UB: 50 | prevregion0, prevregion1 = newregion0, newregion1 51 | badkmc = np.zeros(4, dtype=int) # 0L, 0R, 1L, 1R 52 | for kmer in cokmers: 53 | indices0 = cokmersi[0][kmer] 54 | indices1 = cokmersi[1][kmer] 55 | 56 | # analyze contamination 57 | badkmc_, badindices = vu.getbadkmc_bothhaps(indices0, indices1, prevregion0, prevregion1, getindices=True) 58 | badkmc += badkmc_ 59 | 60 | # compute new region to remove contamination 61 | if badindices[0]: 62 | newregion0 = min(*(badindices[0]), newregion0[0]), max(max(badindices[0])+1, newregion0[1]) # half open at the end 63 | newregion1 = min(*(badindices[1]), newregion1[0]), max(max(badindices[1])+1, newregion1[1]) # half open at the end 64 | if verbose >= 2: 65 | print(badindices) 66 | 67 | es0 = (ostart0 - 
newregion0[0], newregion0[1] - oend0) 68 | es1 = (ostart1 - newregion1[0], newregion1[1] - oend1) 69 | if verbose: print(es0, es1, badkmc) 70 | 71 | outregion0, outregion1 = (newregion0[0], newregion0[1]+ksize-1), (newregion1[0], newregion1[1]+ksize-1) 72 | if newregion0 == (start0, end0) and newregion1 == (start1, end1): 73 | return False, None, pos0, pos1 # good locus after individualExpansion 74 | if not np.any(badkmc): 75 | return True, True, outregion0, outregion1 76 | else: 77 | return True, False, outregion0, outregion1 78 | 79 | def minibatch_jointExpansion(minibatch): 80 | [miniseqs, mininewposs, miniposs] = pickle.load(open("seqnewpos{}.pickle".format(minibatch), 'rb')) 81 | jointnewposs = [{}, {}] 82 | jointstats = np.zeros(2, dtype=int) 83 | nexpanded, nresolved = 0, 0 84 | 85 | for locus in range(nloci): 86 | if locus % nprocess != minibatch: continue 87 | if locus % (nloci//100) == 0: 88 | print(".", end="", flush=True) 89 | if locus not in miniposs[0]: continue 90 | 91 | expanded, resolved, newregion0, newregion1 = jointTRexpansion(miniseqs, mininewposs, miniposs, locus, TRWINDOW-FS, ksize=KSIZE) 92 | 93 | jointnewposs[0][locus] = newregion0 94 | jointnewposs[1][locus] = newregion1 95 | 96 | if expanded: 97 | nexpanded += 1 98 | nresolved += resolved 99 | 100 | jointstats = nexpanded, nresolved 101 | return jointstats, jointnewposs 102 | 103 | 104 | if __name__ == "__main__": 105 | nprocess, KSIZE, FS, UB, TRWINDOW, nloci = [int(v) for v in sys.argv[1:7]] 106 | jointnewposs = [{}, {}] 107 | jointstats = np.zeros(2, dtype=int) 108 | 109 | print("Runnning JointExpansion", end="") 110 | p = Pool(nprocess) 111 | results = p.map(minibatch_jointExpansion, list(range(nprocess))) 112 | p.close(); p.join() 113 | print() 114 | 115 | for i in range(nprocess): 116 | for hap in [0,1]: 117 | for locus, newregion in results[i][1][hap].items(): 118 | jointnewposs[hap][locus] = newregion 119 | jointstats += results[i][0] 120 | print(jointstats) 121 | 122 | pickle.dump([jointstats, jointnewposs], open("jointnewposs_stat.pickle", 'wb')) 123 | -------------------------------------------------------------------------------- /script/kmc2length.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import argparse 5 | import numpy as np 6 | import pandas as pd 7 | import warnings 8 | from vntrutils import readKms 9 | #import pickle 10 | 11 | 12 | def get1DIQRmask(data, whis=1.5): 13 | m = np.isfinite(data) 14 | q1s = np.quantile(data[m], 0.25) 15 | q3s = np.quantile(data[m], 0.75) 16 | kIQRs = (q3s - q1s) * whis 17 | return ~m | (data < (q1s - kIQRs)) | (data > (q3s + kIQRs)) 18 | 19 | def loadvntrmat(fnames): 20 | vntrmat = np.zeros([fnames.size, nloci], dtype=int) 21 | for fi, f in enumerate(fnames): 22 | readKms(f, vntrmat[fi]) 23 | return vntrmat 24 | 25 | def processCtrlBamCov(covmat, whis=1.5): 26 | cov = covmat@ctrlsize / np.sum(ctrlsize) 27 | badmask = np.zeros_like(ctrlsize, dtype=bool) 28 | 29 | ### compute coverage for each locus; normalize wrt sample global coverage 30 | normcovmat = covmat / (covmat@ctrlsize / np.sum(ctrlsize))[:,None] 31 | 32 | ### check variance 33 | stds = np.std(normcovmat, axis=0) 34 | badmask = np.logical_or(badmask, get1DIQRmask(stds)) 35 | 36 | ### check if mean is biased 37 | means = np.mean(normcovmat, axis=0) 38 | badmask = np.logical_or(badmask, get1DIQRmask(means)) 39 | 40 | print(f'\t{np.sum(badmask)} out of {badmask.size} unique regions removed') 41 | 42 | ### reject outliers 
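    # keep only control regions that passed the IQR screens on per-locus mean and variance above,
    # then recompute each sample's genome-wide coverage from the remaining regions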
43 | pctrlsize = ctrlsize[~badmask] 44 | pcovmat = covmat[:,~badmask] 45 | pcov = pcovmat@pctrlsize / np.sum(pctrlsize) 46 | 47 | ### covmat for nearest neighbor search 48 | normcovmat = covmat / cov[:,None] 49 | pnormcovmat = pcovmat / pcov[:,None] 50 | return pcov, pnormcovmat, normcovmat 51 | 52 | def loadLSB(f): 53 | df = pd.read_csv(f, sep="\t", index_col=0) 54 | nloci0 = df.shape[0] - nloci 55 | ntrbiasmat_db = df.iloc[:nloci0].to_numpy().T 56 | trbiasmat_db = df.iloc[nloci0:].to_numpy().T 57 | dbgenomes = np.array(df.columns) 58 | cbed = np.array([v[4:].split("_") for v in df.index[:nloci0]], dtype=object) 59 | ctrlsize = cbed[:,2].astype(int) - cbed[:,1].astype(int) 60 | return trbiasmat_db, ntrbiasmat_db, dbgenomes, ctrlsize 61 | 62 | def rowDistance(mat1, mat2, reject=True): 63 | # input: N1xL, N2xL matrix. output: N1xN2 distance matrix 64 | warnings.filterwarnings('ignore') 65 | 66 | n1, n2 = mat1.shape[0], mat2.shape[0] 67 | stats = np.zeros([n1, n2]) 68 | for i in range(n1): 69 | for j in range(n2): 70 | if reject: 71 | bm = get1DIQRmask(mat1[i]) | get1DIQRmask(mat2[j]) | (mat1[i] == 0) | (mat2[j] == 0) 72 | else: 73 | bm = ~np.isfinite(mat1[i]) | ~np.isfinite(mat2[j]) | (mat1[i] == 0) | (mat2[j] == 0) 74 | gt, est = mat1[i][~bm], mat2[j][~bm] 75 | stats[i,j] = np.nanmean(np.abs(1 - gt/est)) 76 | 77 | warnings.filterwarnings('default') 78 | return stats 79 | 80 | def lenPred(ilkms, bias, cov): 81 | est = np.full(nloci, np.nan) 82 | m = (bias > 0) & np.isfinite(bias) 83 | est[m] = ilkms[m] / (cov * bias[m]) 84 | est[est>=1] += (args.ksize - 1) 85 | est[est<1] *= args.ksize 86 | return est 87 | 88 | def BiasCorrectedLenPred(outdir="./"): 89 | N = trmat.shape[0] 90 | ests = np.full([N, nloci], np.nan) 91 | dis = rowDistance(ntrbiasmat, ntrbiasmat_db) 92 | bestids = np.argsort(dis, axis=1)[:,0] 93 | for idx, bidx in enumerate(bestids): # idx: sample index. 
bidx: index of best estimator in db 94 | ests[idx] = lenPred(trmat[idx], trbiasmat_db[bidx], pbamcov[idx]) 95 | return ests 96 | 97 | def SaveEstErr(ests, outdir="./"): 98 | trid = ["_".join(r) for r in trbed] 99 | df = pd.DataFrame(ests.T, index=trid) 100 | df.to_csv(f'{outdir}/estimated_TR_len.tsv', sep="\t", na_rep="nan") 101 | 102 | 103 | if __name__ == "__main__": 104 | ap = argparse.ArgumentParser(description=\ 105 | "Predict VNTR lengths from kmer genotype using precomputed locus-specific biases (LSB)\n"+\ 106 | "** Please download precomputed LSB from the GitHub release page") 107 | 108 | ap.add_argument("--outdir", help="output directory for estimation/error table", required=True) 109 | ap.add_argument("--ksize", help="kmer size of RPGG", type=int, required=True) 110 | ap.add_argument("--kmers", help="file of sorted kmer file names", required=True) 111 | ap.add_argument("--trbed", help="VNTR bed file", required=True) 112 | ap.add_argument("--LSB", help="Precomputed non-TR and TR locus-specific sampling biases", required=True) 113 | ap.add_argument("--cov", help="bam coverage file", required=True) 114 | ap.add_argument("--covbed", help="unique region bed file", required=True) 115 | args = ap.parse_args() 116 | wd = os.getcwd() 117 | outdir = args.outdir 118 | 119 | print("Loading metadata and precomputed TR/NTR LSB", flush=True) 120 | trbed = np.loadtxt(args.trbed, dtype=object, ndmin=2) 121 | nloci = trbed.shape[0] 122 | trbiasmat_db, ntrbiasmat_db, dbgenomes, ctrlsize = loadLSB(args.LSB) 123 | 124 | print("Computing NTR LSB in current dataset", flush=True) 125 | rawcovmat = np.loadtxt(args.cov, dtype=object, ndmin=2) 126 | pbamcov, _, ntrbiasmat = processCtrlBamCov(rawcovmat[:,2:].astype(float)) 127 | 128 | print("Loading genotype data", flush=True) 129 | trmat = loadvntrmat(np.loadtxt(args.kmers, dtype=object, ndmin=1)) 130 | #with open("analysis/trmat.pickle", 'wb') as f: 131 | # pickle.dump(trmat, f) 132 | #with open("analysis/trmat.pickle", 'rb') as f: 133 | # trmat = pickle.load(f) 134 | 135 | print("Estimating VNTR Length", flush=True) 136 | lenEstimates = BiasCorrectedLenPred(outdir=outdir) 137 | 138 | print("Writing outputs", flush=True) 139 | SaveEstErr(lenEstimates, outdir=outdir) # ncovmat is empirically better than pncovmat 140 | -------------------------------------------------------------------------------- /script/kmers.linreg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import vntrutils as vu 5 | import numpy as np 6 | import statsmodels.api as sm 7 | #from sklearn.linear_model import LinearRegression 8 | 9 | ap = argparse.ArgumentParser(description="read *.kmers and output regression plots and prediction results") 10 | ap.add_argument("pacbio", help="PB.tr.kmers with locus, kmer name, and kmer count information") 11 | ap.add_argument("illumina", help="IL.tr.kmers", nargs='+') 12 | ap.add_argument("out", help="output file prefix") 13 | ap.add_argument("--index", help="tr.kmers with locus and kmer name info used for IL.tr.kmers when only count info is present", nargs='?', const="", default="") 14 | ap.add_argument("--mapkmer", help="map pangenome kmers to genome kmers", action="store_true") 15 | ap.add_argument("--mode", help="Outlier rejection mode in regression. 
Choose from 'invalid', 'invalid|zero', 'invalid|bad' or 'invalid|bad|zero' Default: invalid", nargs='?', const="invalid", default="invalid") 16 | ap.add_argument("--combine", help="combine multiple IL.kmers when multiple IL.kmers are provided; will not perform regression. Default: False", action='store_true') 17 | ap.add_argument("--plot", help="plot regression results of the loci specified.", nargs='?', const="", default="") 18 | ap.add_argument("--threshold", help="rejecting outliers locating threshold*std away from the mean. Default: 10", type=int, nargs='?', const=10, default=10) 19 | ap.add_argument("--R2threshold", help="plot summary report for loci with R^2 > threshold. Default: -1", type=float, nargs='?', const=-1, default=-1) 20 | args = ap.parse_args() 21 | print(args) 22 | mapkmer = args.mapkmer 23 | threshold = args.threshold 24 | R2threshold = args.R2threshold 25 | combine = args.combine and len(args.illumina) != 1 26 | plotloci = set([int(v) for v in args.plot.split(",")]) if args.plot else set([]) 27 | 28 | print("reading illumina kmers") 29 | y = {} 30 | for fname in args.illumina: 31 | print("\treading", fname) 32 | if combine: 33 | vu.readKmerDict(fname, y) 34 | else: 35 | if args.index: 36 | vu.readKmersWithIndex(fname, args.index, y, sort=True, kmerName=mapkmer) 37 | else: 38 | vu.readKmers(fname, y, sort=True, kmerName=mapkmer) 39 | if combine: 40 | vu.writeKmerDict(args.out, y) 41 | exit(0) 42 | 43 | nloci = len(y) 44 | print("#loci:", nloci) 45 | 46 | print("reading pacbio kmers") 47 | x = {} 48 | vu.readKmers(args.pacbio, x, sort=True, kmerName=mapkmer) 49 | 50 | data = {} 51 | for k, v in y.items(): 52 | if v.size and x[k].size: 53 | if mapkmer: 54 | m = np.isin(y[k][:,0], x[k][:,0]) 55 | data[k] = np.column_stack((np.insert(x[k][:,1],0,0), np.insert(y[k][m,1],0,0))) 56 | else: 57 | data[k] = np.column_stack((np.insert(x[k],0,0), np.insert(y[k],0,0))) 58 | 59 | results = np.zeros((nloci, 4)) 60 | for k, v in x.items(): 61 | if v.size: 62 | if mapkmer: 63 | truth = np.sum(v[:,1]) 64 | else: 65 | truth = np.sum(v) 66 | results[k,0] = truth 67 | 68 | for k, v in data.items(): 69 | slope, _, r2, pred = vu.PlotRegression(v[:,0:1], v[:,1:2], "Assembly kmer counts", "Read kmer counts", 70 | f"locus {k}, n={v.shape[0]}", f"{args.out}.{k}", outlier=args.mode, pred=True) 71 | if k % 1000 == 0: 72 | print(".", end="", flush=True) 73 | results[k, 1:] = [pred, slope, r2] 74 | print() 75 | 76 | print("writing outputs") 77 | np.savetxt(f'{args.out}.pred', results, fmt=['%i','%.1f','%.2f','%.4f'], delimiter="\t", header="TrueDosage\tPredDosage\tSlope\tr^2") 78 | -------------------------------------------------------------------------------- /script/kmerutils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | base2num = {'A':0, 'C':1, 'G':2, 'T':3} 4 | num2base = ['A', 'C', 'G', 'T'] 5 | baseNumConversion = [ 6 | 'A', 'C', 'G', 'T', 127, 127, 127, 127, 7 | 127, 127, 127, 127, 127, 127, 127, 127, 8 | 127, 127, 127, 127, 127, 127, 127, 127, 9 | 127, 127, 127, 127, 127, 127, 127, 127, 10 | 127, 127, 127, 127, 127, 127, 127, 127, 11 | 127, 127, 127, 127, 127, 127, 127, 127, 12 | 127, 127, 127, 127, 127, 127, 127, 127, 13 | 127, 127, 127, 127, 127, 127, 127, 127, 14 | 127, 0, 127, 1, 127, 127, 127, 2, 15 | 127, 127, 127, 127, 127, 127, 127, 127, 16 | 127, 127, 127, 127, 3, 127, 127, 127, 17 | 127, 127, 127, 127, 127, 127, 127, 127, 18 | 127, 0, 127, 1, 127, 127, 127, 2, 19 | 127, 127, 127, 127, 127, 127, 127, 127, 20 | 
127, 127, 127, 127, 3, 127, 127, 127] 21 | baseComplement = [ 22 | 3, 2, 1, 0, 127, 127, 127, 127, 23 | 127, 127, 127, 127, 127, 127, 127, 127, 24 | 127, 127, 127, 127, 127, 127, 127, 127, 25 | 127, 127, 127, 127, 127, 127, 127, 127, 26 | 127, 127, 127, 127, 127, 127, 127, 127, 27 | 127, 127, 127, 127, 127, 127, 127, 127, 28 | 127, 127, 127, 127, 127, 127, 127, 127, 29 | 127, 127, 127, 127, 127, 127, 127, 127, 30 | 127, 'T', 127, 'G', 127, 127, 127, 'C', 31 | 127, 127, 127, 127, 127, 127, 'N', 127, 32 | 127, 127, 127, 127, 'A', 127, 127, 127, 33 | 127, 127, 127, 127, 127, 127, 127, 127, 34 | 127, 't', 127, 'g', 127, 127, 127, 'c', 35 | 127, 127, 127, 127, 127, 127, 'n', 127, 36 | 127, 127, 127, 127, 'a', 127, 127, 127] 37 | byteRC = [ 38 | 255, 191, 127, 63, 239, 175, 111, 47, 223, 159, 39 | 95, 31, 207, 143, 79, 15, 251, 187, 123, 59, 40 | 235, 171, 107, 43, 219, 155, 91, 27, 203, 139, 41 | 75, 11, 247, 183, 119, 55, 231, 167, 103, 39, 42 | 215, 151, 87, 23, 199, 135, 71, 7, 243, 179, 43 | 115, 51, 227, 163, 99, 35, 211, 147, 83, 19, 44 | 195, 131, 67, 3, 254, 190, 126, 62, 238, 174, 45 | 110, 46, 222, 158, 94, 30, 206, 142, 78, 14, 46 | 250, 186, 122, 58, 234, 170, 106, 42, 218, 154, 47 | 90, 26, 202, 138, 74, 10, 246, 182, 118, 54, 48 | 230, 166, 102, 38, 214, 150, 86, 22, 198, 134, 49 | 70, 6, 242, 178, 114, 50, 226, 162, 98, 34, 50 | 210, 146, 82, 18, 194, 130, 66, 2, 253, 189, 51 | 125, 61, 237, 173, 109, 45, 221, 157, 93, 29, 52 | 205, 141, 77, 13, 249, 185, 121, 57, 233, 169, 53 | 105, 41, 217, 153, 89, 25, 201, 137, 73, 9, 54 | 245, 181, 117, 53, 229, 165, 101, 37, 213, 149, 55 | 85, 21, 197, 133, 69, 5, 241, 177, 113, 49, 56 | 225, 161, 97, 33, 209, 145, 81, 17, 193, 129, 57 | 65, 1, 252, 188, 124, 60, 236, 172, 108, 44, 58 | 220, 156, 92, 28, 204, 140, 76, 12, 248, 184, 59 | 120, 56, 232, 168, 104, 40, 216, 152, 88, 24, 60 | 200, 136, 72, 8, 244, 180, 116, 52, 228, 164, 61 | 100, 36, 212, 148, 84, 20, 196, 132, 68, 4, 62 | 240, 176, 112, 48, 224, 160, 96, 32, 208, 144, 63 | 80, 16, 192, 128, 64, 0] 64 | 65 | def getRCstring(string): 66 | RCstring = np.empty(len(string), dtype='U') 67 | rlen = len(string) 68 | for i in range(rlen): 69 | RCstring[rlen-i-1] = baseComplement[ord(string[i])] 70 | 71 | return ''.join(RCstring) 72 | 73 | 74 | def decodeNumericString(num, k): 75 | string = '' 76 | for i in range(k): 77 | string = num2base[(num % 4)] + string 78 | num >>= 2 79 | 80 | return string 81 | 82 | 83 | def getRCkmer(kmer, k): 84 | rckmer = 0 85 | while k >= 4: 86 | rckmer <<= 8 87 | rckmer += byteRC[kmer & 0xff] 88 | kmer >>= 8 89 | k -= 4 90 | if k > 0: 91 | rckmer <<= (k<<1) 92 | rckmer += (byteRC[kmer] >> ((4-k)<<1)) 93 | return rckmer 94 | 95 | 96 | def encodeString(string): 97 | """Direct encoding. 
Use string2CaKmer() for canonical encoding""" 98 | numericString = 0 99 | for i in range(len(string)): 100 | numericString = (numericString << 2) + base2num[string[i]] 101 | 102 | return numericString 103 | 104 | 105 | def string2CaKmer(string): 106 | kmer = encodeString(string) 107 | rckmer = getRCkmer(kmer, len(string)) 108 | return kmer if kmer <= rckmer else rckmer 109 | 110 | 111 | def getNextKmer(beg, seq, k): 112 | if beg + k >= len(seq): 113 | return (len(seq), 0) 114 | 115 | validlen = 0 116 | while validlen != k: 117 | if beg + k >= len(seq): 118 | return (len(seq), 0) 119 | if seq[(beg + validlen)] not in base2num: 120 | beg = beg + validlen + 1 121 | validlen = 0 122 | else: 123 | validlen += 1 124 | 125 | return (beg, encodeString(seq[beg:beg + k])) 126 | 127 | 128 | def buildNuKmers(read, k, leftflank=0, rightflank=0, count=True): 129 | rlen = len(read) 130 | mask = (1 << 2*(k-1)) - 1 131 | kmers = {} 132 | 133 | beg, kmer = getNextKmer(leftflank, read, k) 134 | if beg == rlen: return kmers 135 | rckmer = getRCkmer(kmer, k) 136 | 137 | it = iter(range(beg, rlen-k-rightflank+1)) 138 | for i in it: 139 | canonicalkmer = rckmer if kmer > rckmer else kmer 140 | if canonicalkmer not in kmers: kmers[canonicalkmer] = 0 141 | kmers[canonicalkmer] += count 142 | 143 | if i + k >= rlen: return kmers 144 | if read[i + k] not in num2base: 145 | nbeg, kmer = getNextKmer(i+k+1, read, k) 146 | rckmer = getRCkmer(kmer, k) 147 | for j in range(nbeg-i-1): 148 | next(it, None) 149 | else: 150 | kmer = ((kmer & mask) << 2) + base2num[read[i + k]] 151 | rckmer = (rckmer >> 2) + (3-base2num[read[i+k]] << 2*(k-1)) 152 | 153 | return kmers 154 | 155 | def read2kmers(read, k, leftflank=0, rightflank=0, dtype='uint64', canonical=True): 156 | """ 157 | CAUTION: 158 | - this function should be avoided and only used for certain visualization 159 | - the output kmers vector shifts index if a tract of kmers at the beginning contains N 160 | will return max_val_of_uint64 (-1) if there's 'N' within kmer 161 | """ 162 | rlen = len(read) 163 | if rlen - k - leftflank - rightflank + 1 <= 0: return np.array([]) 164 | 165 | mask = (1 << 2*(k-1)) - 1 166 | kmers = np.zeros(rlen-k-leftflank-rightflank+1, dtype=dtype) - 1 167 | 168 | beg, kmer = getNextKmer(leftflank, read, k) 169 | if beg == rlen: return kmers 170 | rckmer = getRCkmer(kmer, k) 171 | 172 | it = iter(range(beg, rlen-k-rightflank+1)) 173 | for i in it: 174 | canonicalkmer = rckmer if kmer > rckmer else kmer 175 | kmers[i-beg] = canonicalkmer if canonical else kmer 176 | 177 | if i + k >= rlen: return kmers 178 | if read[i + k] not in num2base: 179 | nbeg, kmer = getNextKmer(i+k+1, read, k) 180 | rckmer = getRCkmer(kmer, k) 181 | for j in range(nbeg-i-1): 182 | next(it, None) 183 | else: 184 | kmer = ((kmer & mask) << 2) + base2num[read[i + k]] 185 | rckmer = (rckmer >> 2) + (3-base2num[read[i+k]] << 2*(k-1)) 186 | 187 | return kmers 188 | 189 | def read2kmers_noshift(read, k, leftflank=0, rightflank=0, dtype='uint64', canonical=True): 190 | """ 191 | will return max_val_of_uint64 (-1) if there's 'N' within kmer 192 | """ 193 | rlen = len(read) 194 | if rlen - k - leftflank - rightflank + 1 <= 0: return np.array([]) 195 | 196 | mask = (1 << 2*(k-1)) - 1 197 | kmers = np.zeros(rlen-k-leftflank-rightflank+1, dtype=dtype) - 1 198 | 199 | beg, kmer = getNextKmer(leftflank, read, k) 200 | if beg == rlen: return kmers 201 | rckmer = getRCkmer(kmer, k) 202 | 203 | it = iter(range(beg, rlen-k-rightflank+1)) 204 | for i in it: 205 | canonicalkmer = rckmer if 
kmer > rckmer else kmer 206 | kmers[i-leftflank] = canonicalkmer if canonical else kmer 207 | 208 | if i + k >= rlen: return kmers 209 | if read[i + k] not in num2base: 210 | nbeg, kmer = getNextKmer(i+k+1, read, k) 211 | rckmer = getRCkmer(kmer, k) 212 | for j in range(nbeg-i-1): 213 | next(it, None) 214 | else: 215 | kmer = ((kmer & mask) << 2) + base2num[read[i + k]] 216 | rckmer = (rckmer >> 2) + (3-base2num[read[i+k]] << 2*(k-1)) 217 | 218 | return kmers 219 | -------------------------------------------------------------------------------- /script/liftbed.clean.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import numpy as np 5 | from collections import defaultdict 6 | 7 | # Usage: program 8 | if len(sys.argv) != 2: 9 | sys.exit("Usage: program \nLiftbed should be sorted. Filtered regions are printed to stdout") 10 | 11 | 12 | # assign chrom mapped by contig based on majority vote 13 | # fix edge cases of t5_t3 14 | # fix edge case where a region is split into >2 segments 15 | # Retain strand orientation tag 16 | # split read may have different orientation 17 | class DupInfo: 18 | def __init__(self): 19 | self.dup = False 20 | self.valid = True 21 | self.asm = "" 22 | self.regions = [] 23 | self.start = -1 24 | self.end = -1 25 | self.strand = [] 26 | 27 | def cleanbed(): 28 | r2a = defaultdict(DupInfo) 29 | for f1, f2, f3, f4, _, f6 in lb: 30 | r = "_".join(f4.split("_")[:3]) 31 | f2, f3 = int(f2), int(f3) 32 | if r not in r2a: 33 | r2a[r].asm = f1 34 | r2a[r].regions.append((f2,f3)) 35 | r2a[r].start = f2 36 | r2a[r].end = f3 37 | r2a[r].strand.append(f6) 38 | else: 39 | if r2a[r].valid: 40 | if r2a[r].asm == f1: 41 | r2a[r].dup = True 42 | d1 = f2 - r2a[r].end 43 | d2 = f3 - r2a[r].start 44 | d3 = r2a[r].start - f3 45 | if d1 <= 0 and d2 >= 0: # if overlap then merge 46 | r2a[r].start = min(r2a[r].start, f2) 47 | r2a[r].end = max(r2a[r].end, f3) 48 | r2a[r].regions.append((f2,f3)) 49 | r2a[r].strand.append(f6) 50 | elif d1 < 1e4 and d1 > 0: # downstream & gap < 1e4 51 | r2a[r].end = f3 52 | r2a[r].regions.append((f2,f3)) 53 | r2a[r].strand.append(f6) 54 | elif d3 < 1e4 and d3 > 0: # upstream & gap < 1e4 55 | r2a[r].start = f2 56 | r2a[r].regions.append((f2,f3)) 57 | r2a[r].strand.append(f6) 58 | else: 59 | r2a[r].valid = False 60 | else: # either share high similarity w/ another locus OR at the breakpoints of two contigs 61 | r2a[r].valid = False 62 | 63 | a2ch = defaultdict(lambda: defaultdict(int)) 64 | for f1, f2, f3, f4, _, f6 in lb: 65 | ch = f4.split("_")[0][3:] 66 | a2ch[f1][ch] += 1 67 | 68 | a2mc = {} # asm to major chrom 69 | for k0, v0 in a2ch.items(): 70 | tc, mc = 0, 0 71 | for k1, v1 in v0.items(): 72 | tc += v1 73 | if v1 > mc: 74 | mch = k1 75 | mc = v1 76 | if mc/tc >= 0.6: # check major mapped chrom freq 77 | a2mc[k0] = mch 78 | 79 | # write clean bed 80 | s2i = {"+": 1, "-": -1} 81 | for k, v in r2a.items(): 82 | rr = "\t".join(k.split("_")) 83 | if v.valid and v.asm in a2mc: 84 | ch = k.split("_")[0][3:] 85 | if ch == a2mc[v.asm]: # check if mapped chrom is the same as major mapped chrom 86 | strand = np.all(np.array(v.strand) == v.strand[0]) * s2i[v.strand[0]] # 1: plus strand, -1: minus strand, 0: mixed 87 | print(f'{v.asm}\t{v.start}\t{v.end}\t{rr}\t{strand}') 88 | 89 | 90 | if __name__ == "__main__": 91 | lb = np.loadtxt(sys.argv[1], dtype=object, ndmin=2, comments=None) 92 | cleanbed() 93 | 94 | 95 | 
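A minimal usage sketch of the canonical k-mer helpers defined in script/kmerutils.py above (illustrative values only; assumes script/ is on the Python import path):

    from kmerutils import encodeString, getRCkmer, string2CaKmer, decodeNumericString

    kmer = "ACGTACGTACGTACGTACGTA"       # an arbitrary 21-mer
    fwd = encodeString(kmer)             # direct 2-bit encoding, (A,C,G,T) -> (0,1,2,3)
    rev = getRCkmer(fwd, len(kmer))      # reverse-complement encoding via the byteRC lookup table
    can = string2CaKmer(kmer)            # canonical encoding = min(fwd, rev)
    assert can == min(fwd, rev)
    print(decodeNumericString(can, len(kmer)))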
-------------------------------------------------------------------------------- /script/mergeMBEbed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import numpy as np 5 | 6 | def parseMergeSet(): 7 | ms = [] 8 | bs = set() 9 | v2si = {} 10 | si = 0 11 | with open("mbe.m0.loci") as f: 12 | hap = "" 13 | for line in f: 14 | if line[0] == ">": 15 | hap = line.rstrip()[1:] 16 | continue 17 | seq = sorted([int(v) for v in line.rstrip().split(",")]) 18 | skip = seq[0] in bs # check if good v reported by this hap is bad in another hap 19 | for i in range(1,len(seq)): 20 | skip |= seq[i] in bs 21 | if seq[i] != seq[i-1] + 1: 22 | skip = True 23 | for v in seq: 24 | if v in v2si: # check if bad v reported by this hap is good in another hap 25 | si_ = v2si[v] 26 | if ms[si_] is not None: 27 | for v_ in ms[si_]: 28 | bs.add(v_) 29 | ms[si_] = None 30 | v2si.pop(v) 31 | bs.add(v) 32 | print(f"Bad seq {seq} in {hap}") 33 | break 34 | else: 35 | if skip: 36 | for v in seq: 37 | bs.add(v) 38 | if v in v2si: # check if v was once reported good 39 | si_ = v2si[v] 40 | ms[si_] = None 41 | v2si.pop(v) 42 | continue 43 | sis = set() # set index of existing v 44 | for v in seq: 45 | if v in v2si: 46 | sis.add(v2si[v]) 47 | if len(sis) == 0: # make a new set 48 | ms.append(set()) 49 | for v in seq: 50 | ms[-1].add(v) 51 | v2si[v] = si 52 | si += 1 53 | else: 54 | si_s = None 55 | if len(sis) > 1: 56 | vstr = f"{seq}" 57 | for si_i in sis: 58 | vstr += f"{ms[si_i]}" 59 | print(f"[Note] {hap} induced merging across {vstr}", flush=True) # enable merging multiple sets 60 | for si_i in sis: 61 | if si_s is None: 62 | si_s = si_i 63 | else: 64 | ms[si_s] |= ms[si_i] 65 | for v in ms[si_i]: 66 | v2si[v] = si_s 67 | ms[si_i] = None 68 | ms[si_s] |= set(seq) 69 | for v in seq: 70 | v2si[v] = si_s 71 | ms = np.array(ms, dtype=object) 72 | ms = ms[ms!=None].tolist() 73 | for i1s_ in ms: 74 | assert len(i1s_ & bs) == 0 75 | return ms, bs 76 | 77 | def getdist(bed): 78 | out = [] 79 | if int(bed[0,2]) == 1: # no inversion 80 | for i in range(bed.shape[0]-1): 81 | out.append(int(bed[i+1,0]) - int(bed[i,1])) 82 | else: 83 | for i in range(bed.shape[0]-1): 84 | out.append(int(bed[i,0]) - int(bed[i+1,1])) 85 | return out 86 | 87 | def writeBed_MergeMBE(MAXSVLEN=10000): 88 | ms, bs = parseMergeSet() 89 | 90 | # QC on merging set 91 | panbed = np.loadtxt(f"pan.tr.mbe.v1.bed", dtype=object, ndmin=2, comments=None) 92 | i1togood = {} 93 | qcb = [] # QC bad 94 | for i1s_ in ms: 95 | i1s = sorted(list(i1s_)) 96 | nm = len(i1s)-1 97 | dist = np.full([nm, 2*ng], np.nan) 98 | for hi in range(2*ng): 99 | if np.all(panbed[i1s,3+hi*4] != "None"): 100 | if np.any(panbed[i1s,3+hi*4] != panbed[i1s[0],3+hi*4]): 101 | print(f"[Haplotype removed] merging across contigs: {hi}\t{i1s}\n {panbed[i1s,3+hi*4]}") 102 | else: 103 | if np.any(panbed[i1s,6+hi*4] != panbed[i1s[0],6+hi*4]): 104 | print(f"[Note] {i1s} mixed orientation") 105 | dist[:,hi] = getdist(panbed[i1s,4+hi*4:7+hi*4]) 106 | good = np.all(np.isfinite(dist), axis=0) 107 | if np.nanmax(dist) > MAXSVLEN: 108 | qcb.append(i1s_) 109 | print(f"[Loci removed] huge SV, {i1s}") 110 | elif np.sum(good)/(2*ng) < THRESH: 111 | qcb.append(i1s_) 112 | print(f"[Loci removed] QC failed {i1s}") 113 | else: 114 | i1togood[i1s[0]] = good # record hap to remove 115 | for i1s_ in qcb: 116 | assert i1s_ in ms 117 | ms.remove(i1s_) 118 | for i1 in i1s_: 119 | bs.add(i1) 120 | nmi = 0 121 | mis = set() 122 | for i1s_ in ms: 
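        # nmi counts the v1 loci absorbed into merged sets; mis records their indices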
123 | nmi += len(i1s_) 124 | for i1 in i1s_: 125 | mis.add(i1) 126 | 127 | # fill v2 bed 128 | nloci1, _ = panbed.shape 129 | for i1s_ in ms: 130 | assert len(i1s_ & bs) == 0 131 | pv2bed = np.full([nloci1-nmi+len(ms)-len(bs), 3+2*ng*4], None, dtype=object) 132 | nloci2, _ = pv2bed.shape 133 | i2toi1 = set(list(range(nloci1))) - mis - bs | set([sorted(list(i1s_))[0] for i1s_ in ms]) 134 | i2toi1 = sorted(list(i2toi1)) # map v2 index to v1 135 | assert nloci2 == len(i2toi1) 136 | i1toi2 = np.full(nloci1, None, dtype=object) 137 | i1toi2[i2toi1] = np.arange(nloci2) # map v1 index to v2 138 | pv2bed = panbed[i2toi1] 139 | for i1s_ in ms: 140 | i1s = sorted(list(i1s_)) 141 | # fill ref 142 | i2 = i1toi2[i1s[0]] 143 | ids = i1s[0] 144 | ide = i1s[-1]+1 145 | refs = min([int(s) for s in panbed[ids:ide,1]]) 146 | refe = max([int(e) for e in panbed[ids:ide,2]]) 147 | pv2bed[i2,[1,2]] = [refs, refe] 148 | # fill asm 149 | for hi in range(2*ng): 150 | if not i1togood[i1s[0]][hi]: # bad hap to remove 151 | pv2bed[i2,3+hi*4:7+hi*4] = ["None"]*4 152 | continue 153 | asms = min([int(s) for s in panbed[ids:ide,4+hi*4]]) 154 | asme = max([int(e) for e in panbed[ids:ide,5+hi*4]]) 155 | pv2bed[i2,4+hi*4:6+hi*4] = [asms, asme] 156 | np.savetxt("pan.tr.mbe.v2.bed", pv2bed, delimiter="\t", fmt='%s') 157 | 158 | # orthology map 159 | lmap = np.full([nloci2, 2*ng], ".", dtype=object) 160 | for hi in range(2*ng): 161 | m = pv2bed[:,3+4*hi] != "None" 162 | lmap[m,hi] = np.arange(np.sum(m)) 163 | np.savetxt("OrthoMap.v2.tsv", lmap, delimiter="\t", fmt='%s') 164 | np.savetxt("locusMap.v2.to.v1.txt", i2toi1, fmt='%s') 165 | 166 | 167 | if __name__ == "__main__": 168 | gs = np.loadtxt(sys.argv[1], usecols=0, dtype=object, ndmin=1) 169 | ng = gs.size 170 | #panmap = np.loadtxt(sys.argv[2], dtype=object, ndmin=2)[:,3:].astype(int) 171 | THRESH = float(sys.argv[2]) 172 | writeBed_MergeMBE() 173 | -------------------------------------------------------------------------------- /script/multiBoundaryExpansion.parallel.single_hap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import sys 5 | import numpy as np 6 | from vntrutils import read2kmers_noshift 7 | import pickle 8 | from multiprocessing import Pool, Lock, Manager 9 | from time import sleep 10 | 11 | INVALID_KMER = 0xffffffffffffffff 12 | 13 | class expStat: 14 | def __init__(self, exp, fail, es, opos, npos): 15 | self.exp = exp 16 | self.fail = fail 17 | self.es = es 18 | self.opos = opos 19 | self.npos = npos 20 | 21 | class faCache: 22 | def __init__(self): 23 | self.ctg = "" 24 | self.hd = "" 25 | 26 | def make_process_pickle(): 27 | for i in range(NPROCESS): 28 | ibeg, iend = i*bsize, min((i+1)*bsize, nloci) 29 | indices = np.arange(ibeg, iend) 30 | with open(f"BE/{SN}.{i}.pickle", 'wb') as f: 31 | pickle.dump([BED[ibeg:iend,:], indices, ibeg], f) 32 | 33 | def load_process_pickle(i): 34 | with open(f"BE/{SN}.{i}.pickle", 'rb') as f: 35 | obj = pickle.load(f) 36 | return obj 37 | 38 | def get_ctg(fa, hd): 39 | assert hd in hd2i, f"{hd}" 40 | i = hd2i[hd] 41 | L, s, w = fai[i] 42 | e = s + (L-1)//w + L 43 | fa.seek(s, 0) 44 | return fa.read(e-s).decode("utf-8").replace("\n",""), L, s, e 45 | 46 | def get_seq_pos(fa, bed, fac, idx, ibeg): 47 | seq, pos = None, None 48 | hd = bed[idx-ibeg,0] 49 | if hd != fac.hd: 50 | fac.hd = hd 51 | with lock: 52 | fac.ctg, L, s, e = get_ctg(fa, hd) 53 | if len(fac.ctg) != L: 54 | raise ValueError(f"worker {ibeg//bsize}: {idx} ctg actually len 
{len(fac.ctg)} != theoretical len {L}; {fa.tell()}, {s}, {e}") 55 | s, e = [int(v) for v in bed[idx-ibeg,[1,2]]] 56 | assert s < e 57 | ns = s - TRWINDOW if s > TRWINDOW else 0 58 | ne = e + TRWINDOW if e + TRWINDOW < len(fac.ctg) else len(fac.ctg) 59 | seq = fac.ctg[ns:ne] 60 | pos = (s-ns, e-ns) # start pos of TR relative to the left end of NTR 61 | return seq, pos 62 | 63 | def boundaryExpansion(seq, pos, idx, ibeg, UB, ksize=21): 64 | trs = set() 65 | npos = pos 66 | s, e = npos 67 | tmp = read2kmers_noshift(seq, ksize, leftflank=s, rightflank=len(seq)-e) 68 | for kmer in tmp: # TR 69 | if kmer != INVALID_KMER: 70 | trs.add(kmer) 71 | 72 | # seq,dt can have distinct orientations 73 | # force kmers/noise to have the same orientation 74 | exp = False 75 | dt = np.zeros(2, dtype=int) + FS # delta start/end; initialized with flanksize 76 | kmers = np.full([2, FS], -1, dtype='uint64') 77 | fail = False 78 | while True: 79 | noise = np.zeros([2,FS], dtype=int) # left/right flank, flank pos 80 | sl = len(seq) 81 | s, e = npos 82 | lf = [s-FS , e+FS-dt[1]-ksize+1] # left flank of [left,right] delta_flank 83 | rf = [sl-s+FS-dt[0]-ksize+1, sl-e-FS ] # right flank of [left,right] delta_flank 84 | if lf[0] >= 0 and rf[1] >= 0: # indicate TR not near breakpoint 85 | assert lf[0] >= 0 and lf[1] >= 0 and rf[0] >= 0 and rf[1] >= 0, print(idx, s, e, sl) 86 | for sfl in [0,1]: # seq left, right flank 87 | if dt[sfl]: 88 | if sfl == 0: # (no-inv, left flank) or (inv, right flank) 89 | ns = dt[sfl] 90 | ne = FS 91 | os = 0 92 | oe = FS-dt[sfl] 93 | else: 94 | ns = 0 95 | ne = FS-dt[sfl] 96 | os = dt[sfl] 97 | oe = FS 98 | kmers[sfl,ns:ne] = kmers[sfl,os:oe] 99 | if sfl == 0: 100 | ns = 0 101 | ne = dt[sfl] 102 | else: 103 | ns = FS-dt[sfl] 104 | ne = FS 105 | kmers[sfl,ns:ne] = read2kmers_noshift(seq, ksize, leftflank=lf[sfl], rightflank=rf[sfl]) 106 | for ki, kmer in enumerate(kmers[sfl]): # recheck even if not expanded 107 | if kmer in trs: 108 | noise[sfl,ki] = 1 109 | else: 110 | fail = True 111 | print(f"{idx}.X ", flush=True) 112 | break 113 | 114 | if not np.any(noise): 115 | break 116 | else: 117 | exp = True 118 | dt = np.zeros(2, dtype=int) 119 | if np.any(noise[0]): 120 | dt[0] = FS - np.nonzero(noise[0])[0][0] 121 | for kmer in kmers[0,-dt[0]:]: 122 | if kmer != INVALID_KMER: 123 | trs.add(kmer) 124 | if np.any(noise[1]): 125 | dt[1] = np.nonzero(noise[1])[0][-1] + 1 126 | for kmer in kmers[1,:dt[1]]: 127 | if kmer != INVALID_KMER: 128 | trs.add(kmer) 129 | if np.any(noise): 130 | npos = (npos[0]-dt[0], npos[1]+dt[1]) 131 | if pos[0] - npos[0] > UB or npos[1] - pos[1] > UB: 132 | fail = True 133 | print(f"{idx}.X ", flush=True) 134 | break 135 | return exp, fail, npos 136 | 137 | def load_fai(fa): 138 | lock = Lock() 139 | fai = np.loadtxt(f"{fa}.fai", dtype=object, ndmin=2, comments=None) 140 | hd2i = {} 141 | for i, hd in enumerate(fai[:,0]): 142 | hd2i[hd] = i 143 | fai = fai[:,[1,2,3]].astype(int) 144 | return fai, hd2i, lock 145 | 146 | def gwBE(batch, stat): 147 | fa = open(f"{FASTA}", 'rb') 148 | bed, indices, ibeg = load_process_pickle(batch) # partial table; full_table_index = partial_table_index + ibeg 149 | out = [] 150 | fac = faCache() 151 | for idx in indices: 152 | stat[0] += 1 153 | seq, pos = get_seq_pos(fa, bed, fac, idx, ibeg) 154 | expanded, failed, npos = boundaryExpansion(seq, pos, idx, ibeg, TRWINDOW-FS, ksize=KSIZE) 155 | es = [] 156 | if expanded: 157 | stat[1] += 1 158 | if not failed: 159 | es = pos[0]-npos[0]+npos[1]-pos[1] 160 | else: 161 | stat[2] += 1 162 | es = 
-1 163 | else: 164 | es = 0 165 | out.append([idx, expStat(expanded, failed, es, pos, npos)]) 166 | if stat[0] % max(nloci//200, 1) == 0: 167 | print(f"n={stat[0]}({stat[0]/nloci:.1%}), n_expanded={stat[1]}, n_fail={stat[2]}", flush=True) 168 | fa.close() 169 | return out 170 | 171 | def writeBed_BE(): 172 | bs = set() # bad set 173 | for idx, expstat in idx2exp.items(): 174 | if expstat.exp: 175 | if expstat.fail: 176 | bs.add(idx) 177 | vi = sorted(list(set(range(nloci))-bs)) # valid indices 178 | 179 | nbed = np.full([nloci, 4], ".", dtype=object) 180 | BED = np.loadtxt(sys.argv[1], dtype=object, ndmin=2, comments=None) 181 | for tri in vi: 182 | os, oe = idx2exp[tri].opos 183 | ns, ne = idx2exp[tri].npos 184 | dt = [os-ns, ne-oe] 185 | s, e = int(BED[tri,1]), int(BED[tri,2]) 186 | s -= dt[0] 187 | e += dt[1] 188 | nbed[tri] = [BED[tri,0], s, e, sum(dt)] 189 | np.savetxt(f"{SN}.be.bed", np.hstack((BED,nbed)), delimiter="\t", fmt='%s') 190 | 191 | if __name__ == "__main__": 192 | if len(sys.argv) != 8: 193 | print("Usage: PROGRAM bed fasta sample_name kmer_size flank_size TR_window n_process") 194 | print("Output will be written to [sample_name].be.bed") 195 | exit() 196 | print("Loading data", flush=True) 197 | BED = np.loadtxt(sys.argv[1], dtype=object, ndmin=2, comments=None) 198 | FASTA = sys.argv[2] 199 | SN = sys.argv[3] 200 | KSIZE, FS, TRWINDOW = [int(sys.argv[i]) for i in range(4,7)] 201 | NPROCESS = int(sys.argv[7]) 202 | 203 | nloci = BED.shape[0] 204 | bsize = (nloci-1) // NPROCESS + 1 205 | fai, hd2i, lock = load_fai(FASTA) 206 | print(f"Making pickle for each process, {NPROCESS} in total") 207 | make_process_pickle() 208 | print("Running multi-boundary expansion", flush=True) 209 | del BED 210 | 211 | os.system(f"taskset -p 0xffffffff {os.getpid()}") 212 | stat = Manager().list([0,0,0]) # ncase, nexp, nfail 213 | idx2exp = {} 214 | with Pool(NPROCESS) as pool: 215 | results = [] 216 | for i in range(NPROCESS): 217 | results.append(pool.apply_async(gwBE, args=(i,stat))) 218 | ps = set(list(range(NPROCESS))) 219 | while ps: 220 | done = [] 221 | for i in ps: 222 | if results[i].ready(): 223 | print(f"[multiprocessing.Pool] worker {i} ended", flush=True) 224 | for k, v in results[i].get(): 225 | idx2exp[k] = v 226 | done.append(i) 227 | for i in done: 228 | ps.remove(i) 229 | sleep(5) 230 | print("Parallel computing results merged", flush=True) 231 | 232 | print("Dumping results", flush=True) 233 | with open(f"BE/{SN}.idx2exp.pickle", 'wb') as f: 234 | pickle.dump(idx2exp, f, protocol=pickle.HIGHEST_PROTOCOL) 235 | print("Writing new bed regions", flush=True) 236 | writeBed_BE() 237 | print("Done", flush=True) 238 | 239 | 240 | -------------------------------------------------------------------------------- /script/parseMergeSet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import numpy as np 4 | import pandas as pd 5 | import glob 6 | 7 | 8 | # utility functions 9 | class UnionFind: 10 | def __init__(self, n): 11 | self.parent = list(range(n)) 12 | self.freq = {} 13 | 14 | def find(self, i): 15 | if self.parent[i] == i: 16 | return i 17 | self.parent[i] = self.find(self.parent[i]) # path compression 18 | return self.parent[i] 19 | 20 | def union(self, x, y): 21 | k = f"{x},{y}" 22 | if k not in self.freq: 23 | self.freq[k] = 0 24 | if self.freq[k] >= 4: # only consider events with freq > N 25 | root_x = self.find(x) 26 | root_y = self.find(y) 27 | if root_x != root_y: 28 | if root_x < root_y: 
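                    # attach the larger root under the smaller one, so each merged set is represented by its smallest locus index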
29 | self.parent[root_y] = root_x 30 | else: 31 | self.parent[root_x] = root_y 32 | self.freq[k] += 1 33 | 34 | def mergeRefTR(ufp): 35 | rb0 = refTRv0.copy() 36 | rb1 = rb0.copy() 37 | i_ = None 38 | i1 = -1 39 | for i0, i in enumerate(ufp): 40 | ch, s, e = rb0.iloc[i0] # np.int64 41 | if i == i_: 42 | assert ch == rb1.iloc[i1,0] and s > rb1.iloc[i1,1] and e > rb1.iloc[i1,2], f"{i0} {i1} {i} {rb0.iloc[i0]}\t{rb1.iloc[i1]}" 43 | rb1.iloc[i1,2] = e 44 | else: 45 | i1 += 1 46 | rb1.iloc[i1] = rb0.iloc[i0] 47 | i_ = i 48 | if i0 != i1: 49 | rb1 = rb1.iloc[:i1+1] 50 | return rb1 51 | 52 | # workflow functions 53 | def parseMeta(fn): 54 | meta = np.loadtxt(fn, dtype=object) 55 | NH = np.sum(meta[:,1:].flatten() != "None") 56 | return meta, NH 57 | 58 | def get_refi(): 59 | hi = 0 60 | for gn, f0, f1 in meta: 61 | if gn == REFNAME: 62 | return hi 63 | if f1 != "None": 64 | hi += 2 65 | else: 66 | hi += 1 67 | assert False 68 | 69 | def getInputBeds(): 70 | fs = [] 71 | for gn, f0, f1 in meta: 72 | fs.append(f"{OUTDIR}/{gn}/tmp1.0.bed") 73 | if f1 != "None": 74 | fs.append(f"{OUTDIR}/{gn}/tmp1.1.bed") 75 | return fs 76 | 77 | def parseMergeSet(): 78 | fs = sorted(glob.glob(f"{OUTDIR}/*/*merge*")) 79 | uf = UnionFind(NTR0) 80 | print(f"{len(fs)} files") 81 | for fi, fn in enumerate(fs): 82 | if fi % 100 == 0: 83 | print(f"at file {fi}, {np.unique(uf.parent).size} loci remaining") 84 | with open(fn) as f: 85 | for line in f: 86 | tris = [int(v) for v in line.split()[0].split(",")] 87 | for i0, i1 in zip(tris[:-1], tris[1:]): 88 | assert i0 < i1 89 | uf.union(i0, i1) 90 | for tri in range(NTR0): 91 | uf.find(tri) # make sure every element only points to root 92 | 93 | print(f"{np.unique(uf.parent).size} loci remains after native merge") 94 | return uf, uf.parent 95 | 96 | def mergeQC(): 97 | ufp0 = pan_tri_v02v1 98 | rb0 = refTRv0.copy() 99 | 100 | # naive merge 101 | rb1 = mergeRefTR(ufp0) 102 | 103 | # TRlen QC 104 | l0s = (rb0.iloc[:,2] - rb0.iloc[:,1]).to_numpy() 105 | l1s = (rb1.iloc[:,2] - rb1.iloc[:,1]).to_numpy() 106 | i12i0s = {} 107 | i_ = None 108 | i1 = -1 109 | for i0, i in enumerate(ufp0): 110 | if i == i_: 111 | k = f"{i0-1},{i0}" 112 | if k in merge_events.freq: 113 | f = merge_events.freq[k] 114 | else: 115 | f = np.nan 116 | i12i0s[i1].append((i0, f)) 117 | else: 118 | i1 += 1 119 | i12i0s[i1] = [(i0, np.nan)] 120 | i_ = i 121 | inspect = {"good":[], "check":[], "bad":[]} 122 | reset = set() 123 | for i1, i0s in i12i0s.items(): 124 | if len(i0s) == 1: continue 125 | 126 | l1 = l1s[i1] 127 | l0s_ = [l0s[i0] for i0, _ in i0s] 128 | f = np.nanmean([v for _, v in i0s]) 129 | l0 = np.sum(l0s_) 130 | r = (l1-l0)/l0 131 | if r > 5: 132 | inspect["bad"].append([i1,i0s]) 133 | for i0, _ in i0s: 134 | reset.add(i0) 135 | elif r < 0.5: 136 | inspect["good"].append([i1,i0s]) 137 | else: # 0.5x ~ 5x 138 | inspect["check"].append([i1,i0s]) 139 | 140 | # apply QC and generate refTR.v1 141 | ufp1 = np.copy(ufp0) 142 | for i0 in reset: 143 | ufp1[i0] = i0 144 | print(f"# of loci after merge check is {np.unique(ufp1).size}") 145 | 146 | return inspect, ufp1 147 | 148 | def genRawPanbed(): 149 | fs = inBeds 150 | data0 = np.full([NH,NTR0,4], None, dtype=object) 151 | print("loading beds") 152 | for hi, fn in enumerate(fs): 153 | if hi % 50 == 0: print(".", end="", flush=True) 154 | bed = pd.read_csv(fn, sep="\t", header=None, comment="!") 155 | m = bed.iloc[:,0] != "." 
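        # drop placeholder rows (contig == "."); column 7 holds the v0 pan-locus index used to scatter rows into data0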
156 | bed = bed[m] 157 | ptris = bed.iloc[:,7].astype(int) 158 | data0[hi, ptris, :3] = bed.iloc[:,:3] 159 | data0[hi, ptris, 3] = bed.iloc[:,6] # orientation 160 | print() 161 | return data0 162 | 163 | def genNewBeds(): 164 | ufp = pan_tri_v02v1_QC 165 | fs = inBeds 166 | data0 = panbed_v0_unmerge 167 | 168 | # generate refTR.v1 169 | rb1 = mergeRefTR(ufp) 170 | 171 | # generate panbed 172 | NTR1 = np.unique(ufp).size 173 | data1 = np.full([NH,NTR1,4], None, dtype=object) 174 | print("merging loci") 175 | nm = 0 # num of merging events 176 | n_s0 = 0 # num of cases, src None caused skip 177 | n_d0 = 0 # num of cases, dst None caused skip 178 | nb_ch = 0 # num of bad cases, inconsistnet ch 179 | nb_o = 0 # num of bad cases, inconsistnet o 180 | inspect = [] 181 | for hi in range(NH): 182 | if hi % 100 == 0: print(".", end="", flush=True) 183 | d0 = data0[hi] 184 | d1 = data1[hi] 185 | i_ = None 186 | i1 = -1 187 | for i0, i in enumerate(ufp): 188 | ch, s, e, o = d0[i0] # dtype=object 189 | if ch is not None: 190 | s, e, o = int(s), int(e), int(o) 191 | if i == i_: 192 | nm += 1 193 | if d1[i1,0] is None: 194 | if ch is not None: 195 | n_s0 += 1 196 | elif ch is None: # also, d1[i1,0] is not None 197 | n_d0 += 1 198 | d1[i1] = [None]*4 199 | else: 200 | if ch != d1[i1,0]: 201 | nb_ch += 1 202 | d1[i1] = [None]*4 203 | else: 204 | if o != int(d1[i1,3]): 205 | nb_o += 1 206 | d1[i1] = [None]*4 207 | inspect.append([rb1.iloc[i1], d0[i0], d1[i1]]) # reference not copy 208 | else: 209 | d1[i1,1] = min(s, int(d1[i1,1])) 210 | d1[i1,2] = max(e, int(d1[i1,2])) 211 | else: 212 | i1 += 1 213 | d1[i1] = np.copy(d0[i0]) 214 | i_ = i 215 | print() 216 | print(f"# of merging events: {nm}") 217 | print(f"# of merging skips s,d: {n_s0},{n_d0}") 218 | print(f"# of inconsistent ch cases: {nb_ch}") 219 | print(f"# of inconsistent o cases: {nb_o}") 220 | 221 | if np.any(np.all(data1[:,:,0]==None, axis=0)): 222 | assert False, f"some loci are all dropped: {np.nonzero(np.all(data1[:,:,0]==None, axis=0))}" 223 | 224 | return rb1, data1 225 | 226 | def writeNewBeds(): 227 | refTR_v1.to_csv(f"{OUTDIR}/refTR.v1.bed", sep="\t", index=None, header=None) 228 | tmp = np.hstack(panbed_v0) 229 | si = 4*refi 230 | ei = si + 3 231 | out = np.hstack((tmp[:, si:ei], tmp)) # first 3 cols are ref coord 232 | print(f"panbed.shape {out.shape}") 233 | np.savetxt(f"{OUTDIR}/pan.tr.mbe.v0.bed", out, delimiter="\t", fmt="%s", comments="!") 234 | 235 | 236 | if __name__ == "__main__": 237 | if len(sys.argv) == 1: 238 | print("Usage: program FN_refTRv0 FN_meta refName OUTDIR") 239 | sys.exit(0) 240 | 241 | refTRv0 = pd.read_csv(sys.argv[1], sep="\t", header=None, usecols=[0,1,2]) # /project/mchaisso_100/cmb-17/vntr_genotyping/rpgg/ng369/output/refTR.v0.bed 242 | NTR0 = refTRv0.shape[0] 243 | meta, NH = parseMeta(sys.argv[2]) # /project/mchaisso_100/cmb-17/vntr_genotyping/rpgg/ng369/input/allgenomes.meta.v4.txt 244 | REFNAME = sys.argv[3] 245 | refi = get_refi() 246 | OUTDIR = sys.argv[4] # /project/mchaisso_100/cmb-17/vntr_genotyping/rpgg/ng369/output 247 | inBeds = getInputBeds() 248 | merge_events, pan_tri_v02v1 = parseMergeSet() 249 | large_gap_case_to_inspect, pan_tri_v02v1_QC = mergeQC() 250 | panbed_v0_unmerge = genRawPanbed() 251 | refTR_v1, panbed_v0 = genNewBeds() 252 | writeNewBeds() # OUTDIR/refTR.v1.bed, OUTDIR/pan.tr.mbe.v0.bed 253 | -------------------------------------------------------------------------------- /script/prepareIndividualDatasets.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import numpy as np 5 | import pickle 6 | 7 | nprocess, KSIZE, FS, UB, TRWINDOW, nloci = [int(v) for v in sys.argv[1:7]] 8 | fastafiles = sys.argv[7:9] 9 | beds = [np.loadtxt(bed, dtype=object, ndmin=2) for bed in sys.argv[9:11]] 10 | 11 | def getName2loci(beds): 12 | name2loci = [{}, {}] 13 | for hap in range(2): 14 | for locus in range(nloci): 15 | name = beds[hap][locus,0] 16 | if name != "NA": 17 | if name not in name2loci[hap]: 18 | name2loci[hap][name] = [] 19 | name2loci[hap][name].append(locus) 20 | return name2loci 21 | 22 | def getLociSeqFromCtg(hap, ctg, locus, beds, window=TRWINDOW): 23 | start = int(beds[hap][locus,1]) 24 | end = int(beds[hap][locus,2]) 25 | assert start < end 26 | start = start-window if start > window else 0 27 | end = end+window if end+window <= len(ctg) else len(ctg) 28 | return ctg[start:end] 29 | 30 | def getLociPosFromCtg(hap, ctg, locus, beds, window=TRWINDOW): 31 | # TODO: this includes 1k flanking seq, although will be rescued later 32 | start = int(beds[hap][locus,1]) 33 | end = int(beds[hap][locus,2]) 34 | assert start < end 35 | NTRstart = start-window if start > window else 0 36 | return (start-NTRstart, end-NTRstart) # start pos of TR relative to the left end of NTR 37 | 38 | def getLociSeqPos(fastafiles, beds, name2loci): 39 | lociSeqs = [{}, {}] # {locus : seq} 40 | lociPoss = [{}, {}] # {locus : (start,end)} 41 | for hap, fastafile in enumerate(fastafiles): 42 | print("hap ",hap) 43 | with open(fastafile) as f: 44 | ctg = "" 45 | getctg = False 46 | 47 | for line in f: 48 | if line[0] == '>': 49 | if ctg: 50 | for locus in name2loci[hap][name]: 51 | lociSeqs[hap][locus] = getLociSeqFromCtg(hap, ctg, locus, beds) 52 | lociPoss[hap][locus] = getLociPosFromCtg(hap, ctg, locus, beds) 53 | ctg = "" 54 | getctg = False 55 | 56 | name = line.split()[0][1:] 57 | if name in name2loci[hap]: 58 | getctg = True 59 | else: 60 | if getctg: ctg += line.rstrip() 61 | if ctg: 62 | for locus in name2loci[hap][name]: 63 | lociSeqs[hap][locus] = getLociSeqFromCtg(hap, ctg, locus, beds) 64 | lociPoss[hap][locus] = getLociPosFromCtg(hap, ctg, locus, beds) 65 | return lociSeqs, lociPoss 66 | 67 | def prepareMinibatchData(seqs, poss): 68 | print("preparing minibatch datasets") 69 | for batch in range(nprocess): 70 | miniseqs, miniposs = [{}, {}], [{}, {}] 71 | for hap in [0,1]: 72 | for locus in range(nloci): 73 | if locus % nprocess == batch: 74 | if locus in poss[hap]: 75 | miniseqs[hap][locus] = seqs[hap][locus] 76 | miniposs[hap][locus] = poss[hap][locus] 77 | pickle.dump([miniseqs, miniposs], open("seqpos{}.pickle".format(batch), 'wb')) 78 | 79 | name2loci = getName2loci(beds) 80 | seqs, poss = getLociSeqPos(fastafiles, beds, name2loci) 81 | pickle.dump([seqs, poss], open("seqpos.pickle",'wb')) 82 | prepareMinibatchData(seqs, poss) 83 | 84 | -------------------------------------------------------------------------------- /script/prepareJointDatasets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | import sys 5 | import numpy as np 6 | import pickle 7 | 8 | nprocess, KSIZE, FS, UB, TRWINDOW, nloci = [int(v) for v in sys.argv[1:7]] 9 | [seqs, poss] = pickle.load(open("seqpos.pickle", 'rb')) 10 | [_, newposs] = pickle.load(open("newposs_stat.pickle", 'rb')) 11 | 12 | def prepareMinibatchData(seqs, poss, newposs): 13 | print("preparing minibatch datasets") 14 | 
for batch in range(nprocess): 15 | miniseqs, mininewposs, miniposs= [{}, {}], [{}, {}], [{}, {}] 16 | for hap in [0,1]: 17 | for locus in range(nloci): 18 | if locus % nprocess == batch: 19 | if locus in poss[hap]: 20 | miniseqs[hap][locus] = seqs[hap][locus] 21 | mininewposs[hap][locus] = newposs[hap][locus] 22 | miniposs[hap][locus] = poss[hap][locus] 23 | pickle.dump([miniseqs, mininewposs, miniposs], open("seqnewpos{}.pickle".format(batch), 'wb')) 24 | 25 | prepareMinibatchData(seqs, poss, newposs) 26 | -------------------------------------------------------------------------------- /script/prepareQCDatasets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | import sys 5 | import numpy as np 6 | import pickle 7 | 8 | nprocess, KSIZE, FS, UB, TRWINDOW, nloci = [int(v) for v in sys.argv[1:7]] 9 | [seqs, poss] = pickle.load(open("seqpos.pickle", 'rb')) 10 | [_, jointnewposs] = pickle.load(open("jointnewposs_stat.pickle", 'rb')) 11 | 12 | def prepareMinibatchData(seqs, poss, jointnewposs): 13 | print("preparing minibatch datasets") 14 | for batch in range(nprocess): 15 | miniseqs, minijointnewposs, miniposs = [{}, {}], [{}, {}], [{}, {}] 16 | for hap in [0,1]: 17 | for locus in range(nloci): 18 | if locus % nprocess == batch: 19 | if locus in poss[hap]: 20 | miniseqs[hap][locus] = seqs[hap][locus] 21 | minijointnewposs[hap][locus] = jointnewposs[hap][locus] 22 | miniposs[hap][locus] = poss[hap][locus] 23 | pickle.dump([miniseqs, minijointnewposs, miniposs], open("seqjointnewpos{}.pickle".format(batch), 'wb')) 24 | 25 | prepareMinibatchData(seqs, poss, jointnewposs) 26 | 27 | -------------------------------------------------------------------------------- /script/rmLinebyIndFile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import numpy as np 5 | 6 | if len(sys.argv) == 1 or sys.argv[1] == "-h" or sys.argv[1] == "--help": 7 | print( 8 | """ 9 | Remove line indices (0-based) specified in 'index.txt' 10 | usage: program [-k] index.txt inFile 11 | -k Keep line indices in 'index.txt' instead of removing them. 
12 | """) 13 | sys.exit() 14 | 15 | rm = True 16 | idxf = "" 17 | infile = "" 18 | for i, v in enumerate(sys.argv): 19 | if i == 0: 20 | continue 21 | elif v == "-k": 22 | rm = False 23 | elif not idxf: 24 | idxf = v 25 | elif not infile: 26 | infile = v 27 | else: 28 | assert False, f"too many arguments {v}" 29 | if not idxf: 30 | assert False, "index.txt not specified" 31 | if not infile: 32 | assert False, "inFile not specified" 33 | 34 | ids = set(np.loadtxt(idxf, dtype=int, ndmin=1).tolist()) 35 | with open(infile) as f: 36 | ind = 0 37 | for line in f: 38 | if (ind not in ids) == rm: 39 | print(line, end='') 40 | ind += 1 41 | -------------------------------------------------------------------------------- /script/rmNAforBothBeds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | 5 | if len(sys.argv) != 5: 6 | print("usage: program in.bed0 in.bed1 out.bed0 out.bed1") 7 | exit 8 | 9 | in0, in1, out0, out1 = sys.argv[1:] 10 | badids = set() 11 | with open(in0) as f: 12 | for ids, line in enumerate(f): 13 | if line[:2] == "NA": 14 | badids.add(ids) 15 | with open(in1) as f: 16 | for ids, line in enumerate(f): 17 | if line[:2] == "NA": 18 | badids.add(ids) 19 | 20 | with open(in0) as f: 21 | with open(out0, 'w') as fo: 22 | for ids, line in enumerate(f): 23 | if ids not in badids: 24 | fo.write(line) 25 | with open(in1) as f: 26 | with open(out1, 'w') as fo: 27 | for ids, line in enumerate(f): 28 | if ids not in badids: 29 | fo.write(line) 30 | -------------------------------------------------------------------------------- /script/sim.confusionMatrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | srcdir = "/project/mchaisso_100/cmb-16/tsungyul/work/vntr/danbing-tk/script/" 5 | sys.path.insert(0, srcdir) 6 | import numpy as np 7 | import gzip 8 | import vntrutils as vu 9 | 10 | 11 | 12 | def load_trdb(fn): 13 | tri = -1 14 | trdb = np.empty(NTR, dtype=object) 15 | with open(fn) as f: 16 | for line in f: 17 | if line[0] == ">": 18 | tri += 1 19 | trdb[tri] = set() 20 | else: 21 | trdb[tri].add(int(line.split()[0])) 22 | return trdb 23 | 24 | def write_f1_filter(trdb, f_pref): 25 | TP = np.zeros(NTR, dtype=int) 26 | FP = np.zeros(NTR, dtype=int) 27 | FN = np.zeros(NTR, dtype=int) 28 | with gzip.open(f"{f_pref}.aln.gz", 'rt') as f: 29 | for line in f: 30 | vs = line.split() 31 | src, dst, dst_, tr1, tr2 = int(vs[0]), int(vs[1]), int(vs[2]), vs[8], vs[9] 32 | if src == dst: 33 | if "=" in tr1: 34 | TP[src] += 1 35 | if "=" in tr2: 36 | TP[src] += 1 37 | elif src == NTR: 38 | if "=" in tr1: 39 | FP[dst] += 1 40 | if "=" in tr2: 41 | FP[dst] += 1 42 | elif dst == NTR: 43 | if src == dst_: # take TR annot string directly 44 | if "=" in tr1: 45 | FN[src] += 1 46 | if "=" in tr2: 47 | FN[src] += 1 48 | else: # TR annot is not for src locus, rescan 49 | r1, r2 = vs[11], vs[13] 50 | trs = trdb[src] 51 | for km in vu.read2kmers_noshift(r1, 21): 52 | if km in trs: 53 | FN[src] += 1 54 | break 55 | for km in vu.read2kmers_noshift(r2, 21): 56 | if km in trs: 57 | FN[src] += 1 58 | break 59 | else: # FP-endo 60 | r1, r2 = vs[11], vs[13] 61 | trs = trdb[src] 62 | for km in vu.read2kmers_noshift(r1, 21): 63 | if km in trs: 64 | FN[src] += 1 65 | break 66 | for km in vu.read2kmers_noshift(r2, 21): 67 | if km in trs: 68 | FN[src] += 1 69 | break 70 | if "=" in tr1: 71 | FP[dst] += 1 72 | if "=" in tr2: 73 | FP[dst] += 1 74 | with 
open(f"{f_pref}.aln.f1.tsv", 'w') as f: 75 | f.write(f"TP\tFP\tFN\tPrecision\tRecall\tF1\n") 76 | for i in range(NTR): 77 | tp = TP[i] 78 | fp = FP[i] 79 | fn = FN[i] 80 | pc = tp / (tp+fp) if tp+fp else 0 81 | rc = tp / (tp+fn) if tp+fn else 0 82 | f1 = 2*tp / (2*tp + fp + fn) 83 | f.write(f"{TP[i]}\t{FP[i]}\t{FN[i]}\t{pc:.6f}\t{rc:.6f}\t{f1:.6f}\n") 84 | 85 | assert len(sys.argv) == 4 86 | PAN_TR_KMERS = sys.argv[1] # /project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/output8/pan.tr.kmers 87 | FILE_PREF = sys.argv[2] # /project/mchaisso_100/cmb-17/vntr_genotyping/analysis/rare_var/sim/full/output2/hs1 88 | NTR = int(sys.argv[3]) 89 | 90 | trdb_full = load_trdb(PAN_TR_KMERS) 91 | 92 | write_f1_filter(trdb_full, FILE_PREF) 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /script/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import statsmodels.api as sm 3 | import matplotlib 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | matplotlib.rc('font', size=7) 8 | matplotlib.rc('axes', titlesize=7) 9 | matplotlib.rc('xtick', labelsize=5) 10 | matplotlib.rc('ytick', labelsize=5) 11 | 12 | def zscore(x): 13 | return (x - np.mean(x)) / np.std(x) 14 | 15 | def nanzscore(x): 16 | return (x - np.nanmean(x)) / np.nanstd(x) 17 | 18 | def box(df, xlab=None, ylab=None, hue=None, title=None, dpi=150, palette=["m","g"], lw=0.3, fs=0.5, figsize=(4,3), **kwargs): 19 | plt.figure(figsize=figsize, dpi=dpi) 20 | if xlab is None: 21 | xlab = df.columns[0] 22 | if ylab is None: 23 | ylab = df.columns[1] 24 | sns.boxplot(x=xlab, y=ylab, palette=["m", "g"], data=df, linewidth=lw, fliersize=fs, width=0.3) 25 | if title: 26 | plt.title(title) 27 | plt.show(); plt.close() 28 | 29 | def hist(x, xlab=None, ylab=None, title=None, dpi=150, alpha=1, color='gray', bins=100, figsize=(2,1.8), get_ax=False, **kwargs): 30 | fig, ax = plt.subplots(1, 1, dpi=dpi, figsize=figsize) 31 | ax.hist(x, bins=bins, color=color, alpha=alpha, **kwargs) 32 | ax.set_xlabel(xlab) 33 | ax.set_ylabel(ylab) 34 | ax.set_title(title) 35 | if get_ax: 36 | return fig, ax 37 | else: 38 | plt.show(); plt.close() 39 | 40 | def hist_int(x, xlab=None, ylab=None, title=None, rwidth=0.8, dpi=150, figsize=(2,1.8), get_ax=False): 41 | fig, ax = plt.subplots(1, 1, dpi=dpi, figsize=figsize) 42 | ax.hist(x, range=(min(x)-0.5, max(x)+0.5), bins=max(x)-min(x)+1, rwidth=rwidth, color='gray') 43 | ax.set_xlabel(xlab) 44 | ax.set_ylabel(ylab) 45 | ax.set_title(title) 46 | if get_ax: 47 | return fig, ax 48 | else: 49 | plt.show(); plt.close() 50 | 51 | def plot(x, y, xlab=None, ylab=None, title=None, alpha=1, dpi=150, figsize=(2,1.8), m='.', c='k', s=2, mew=0, get_ax=False, **kwargs): 52 | fig, ax = plt.subplots(1, 1, dpi=dpi, figsize=figsize) 53 | ax.plot(x, y, f'{m}{c}', mew=mew, markersize=s, alpha=alpha, **kwargs) 54 | ax.set_xlabel(xlab) 55 | ax.set_ylabel(ylab) 56 | ax.set_title(title) 57 | if get_ax: 58 | return fig, ax 59 | else: 60 | plt.show(); plt.close() 61 | 62 | def regplot(x, y, xlab=None, ylab=None, title=None, s=2, alpha=1, dpi=150, figsize=(2,1.8), get_ax=False): 63 | if np.unique(x).size == 1: 64 | plot(x, y, xlab=xlab, ylab=ylab, title=title, alpha=alpha) 65 | return 66 | reg = sm.OLS(y, sm.add_constant(x)).fit() 67 | b0, b1 = reg.params 68 | b_ = sm.OLS(zscore(y), sm.add_constant(zscore(x))).fit().params[1] 69 | p = reg.pvalues[1] 70 | xl = np.array([min(x), max(x)]) 71 | yl = b0 + 
b1*xl 72 | fig, ax = plt.subplots(1,1, dpi=dpi, figsize=figsize) 73 | ax.plot(x, y, '.k', mew=0, markersize=s, alpha=alpha) 74 | ax.plot(xl, yl, '--r', lw=1, label=f"y={b1:.2f}x+{b0:.2f},p={p:.1e},b*={b_:.2f}") 75 | ax.set_xlabel(xlab) 76 | ax.set_ylabel(ylab) 77 | ax.legend(fontsize=5) 78 | ax.set_title(title) 79 | if get_ax: 80 | return fig, ax 81 | else: 82 | plt.show(); plt.close() 83 | -------------------------------------------------------------------------------- /script/writeBEbed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import numpy as np 4 | import pickle 5 | 6 | def loadBE(): 7 | BE = np.full(ncore, None, dtype=object) 8 | for i in range(ncore): 9 | with open(f"seqjointnewpos{i}.pickle", 'rb') as f: 10 | BE[i] = pickle.load(f) # seqs, jointnewposs, poss 11 | return BE 12 | 13 | def loadQC(): 14 | with open("QCresults.pickle", 'rb') as f: 15 | QCbadloci, QCunresolvedloci = pickle.load(f) 16 | return QCbadloci, QCunresolvedloci 17 | 18 | def writeBEbeds(): 19 | nloci = np.loadtxt(sys.argv[2], usecols=1).shape[0] 20 | beds = [np.loadtxt(sys.argv[h+2], dtype=object) for h in [0,1]] 21 | nbeds = [np.full([nloci,7], None, dtype=object) for h in [0,1]] 22 | for i in range(nloci): 23 | if i in QCbad: 24 | continue 25 | b = i%ncore 26 | rs = BEresults[b] 27 | for h in [0,1]: 28 | s, e = int(beds[h][i,1]), int(int(beds[h][i,2])) 29 | os, oe = rs[2][h][i] 30 | ns, ne = rs[1][h][i] 31 | e5, e3 = os - ns, ne - oe 32 | nbeds[h][i,0] = beds[h][i,0] 33 | nbeds[h][i,1] = s - e5 34 | nbeds[h][i,2] = e + e3 35 | nbeds[h][i,3] = beds[h][i,3] 36 | nbeds[h][i,4] = beds[h][i,4] 37 | nbeds[h][i,5] = beds[h][i,5] 38 | nbeds[h][i,6] = beds[h][i,6] 39 | for h in [0,1]: 40 | m = ~np.all(nbeds[h] == None, axis=1) 41 | np.savetxt(sys.argv[h+4], nbeds[h][m], fmt="%s", delimiter="\t") 42 | 43 | ncore = int(sys.argv[1]) 44 | BEresults = loadBE() 45 | QCbadloci, QCunresolvedloci = loadQC() 46 | QCbad = set(list(QCbadloci.keys())) | QCunresolvedloci 47 | writeBEbeds() 48 | -------------------------------------------------------------------------------- /src/Num2seq.cpp: -------------------------------------------------------------------------------- 1 | #include "nuQueryFasta.h" 2 | 3 | int main (int argc, const char * argv[]) { 4 | 5 | if (argc < 2) { 6 | cerr << "usage: num2seq \n"; 7 | cerr << " e.g. num2seq 5 0\n"; 8 | cerr << " output: AAAAA\n"; 9 | cerr << " or pipe data through previous command\n"; 10 | cerr << " e.g. 
echo \"0\" | num2seq 5\n"; 11 | cerr << " output: AAAAA\n"; 12 | exit(0); 13 | } 14 | 15 | vector args(argv, argv+argc); 16 | size_t num; 17 | size_t k = stoi(args[1]); 18 | 19 | if (argc == 2) { 20 | while (cin >> num) { 21 | cout << decodeNumericSeq(num, k) << endl; 22 | } 23 | return 0; 24 | } 25 | 26 | if (args[2] == "/dev/stdin") { 27 | ifstream inf(args[2]); 28 | string line; 29 | while (true) { 30 | if (inf.peek() == EOF) { break; } 31 | getline(inf, line); 32 | num = stoul(line); 33 | cout << decodeNumericSeq(num, k) << endl; 34 | } 35 | } 36 | else { 37 | num = stoul(args[2]); 38 | cout << decodeNumericSeq(num, k) << endl; 39 | } 40 | 41 | 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /src/RVseq.cpp: -------------------------------------------------------------------------------- 1 | #include "nuQueryFasta.h" 2 | 3 | #include 4 | 5 | int main (int argc, const char * argv[]) { 6 | 7 | //if (argc < 2) { 8 | // cerr << "usage: rvseq \n"; 9 | // cerr << " e.g. rvseq AAATTTGGGCCC\n"; 10 | // cerr << " output: GGGCCCAAATTT\n\n"; 11 | //} 12 | 13 | if (argc == 1) { 14 | string seq; 15 | while (cin >> seq) { 16 | cout << getRC(seq) << endl; 17 | } 18 | return 0; 19 | } 20 | 21 | string args1 = string(argv[1]); 22 | 23 | if (args1 == "/dev/stdin") { 24 | ifstream inf(args1); 25 | string line; 26 | while (true) { 27 | if (inf.peek() == EOF) { break; } 28 | getline(inf, line); 29 | cout << getRC(line) << endl; 30 | } 31 | } 32 | else { 33 | cout << getRC(args1) << endl; 34 | } 35 | 36 | 37 | return 0; 38 | } 39 | -------------------------------------------------------------------------------- /src/Seq2num.cpp: -------------------------------------------------------------------------------- 1 | #include "nuQueryFasta.h" 2 | 3 | #include 4 | #include 5 | 6 | int main (int argc, const char * argv[]) { 7 | 8 | //cout << "usage: input seqeunce e.g. 
ATCGGTG to get its numeric represnetation" << endl; 9 | //cout << " where (A,C,G,T) = (0,1,2,3)" << endl; 10 | //cout << " enter q to quit program" << endl << endl; 11 | 12 | string args1 = string(argv[1]); 13 | if (args1 == "/dev/stdin") { 14 | ifstream inf(args1); 15 | string line; 16 | while (true) { 17 | if (inf.peek() == EOF) { break; } 18 | getline(inf, line); 19 | cout << encodeSeq(line) << endl; 20 | } 21 | } 22 | else { 23 | cout << encodeSeq(args1) << endl; 24 | } 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /src/bam2pe.cpp: -------------------------------------------------------------------------------- 1 | #include "aQueryFasta_thread.h" 2 | 3 | #include 4 | 5 | 6 | struct Read { 7 | string seq; 8 | uint8_t start, len; 9 | }; 10 | 11 | void prunePEinfo(string& title) { 12 | size_t endi = title.size(); 13 | if (title[endi-2] == '/') { 14 | if (title[endi-1] == '1' or title[endi-1] == '2') { 15 | title = title.substr(0,endi-2); 16 | } 17 | } 18 | } 19 | 20 | int main(int argc, char* argv[]) { 21 | if (argc < 2) { 22 | cerr << endl 23 | << "Usage: bam2pe -fai " << endl << endl 24 | 25 | << "Options:" << endl 26 | << " -fai Input file from samtools fasta -n" << endl << endl; 27 | return 1; 28 | } 29 | vector args(argv, argv+argc); 30 | 31 | size_t ind_fai = distance(args.begin(), find(args.begin(), args.begin()+argc, "-fai")); 32 | assert(ind_fai != argc); 33 | size_t ind_fname = ind_fai + 1; 34 | string fname = args[ind_fname]; 35 | 36 | cerr << "fname: " << fname << endl; 37 | 38 | ifstream fin(fname); 39 | assert(fin); 40 | 41 | unordered_map readDB; 42 | size_t nread = 0, nPEread = 0; 43 | size_t batch = 10000000, PEbatch = 1000000; 44 | time_t time1 = time(nullptr); 45 | 46 | while (fin.good()) { 47 | string title, seq, qualtitle, qual; 48 | Read read; 49 | 50 | getline(fin, title); 51 | getline(fin, read.seq); 52 | prunePEinfo(title); 53 | 54 | read.start = 0; 55 | read.len = read.seq.size(); 56 | 57 | if (readDB.count(title)) { 58 | Read& read1 = readDB[title]; 59 | 60 | if (read.len < 1 or read1.len < 1) { continue; } 61 | else { 62 | // output PE reads 63 | cout << title << "_0\n"; 64 | //cout << ">read " << to_string(nPEread) << "_0\n"; 65 | cout << read.seq.substr(read.start, read.len) << '\n'; 66 | cout << title << "_1\n"; 67 | //cout << ">read " << to_string(nPEread) << "_1\n"; 68 | cout << read1.seq.substr(read1.start, read1.len) << '\n'; 69 | 70 | nPEread += 2; 71 | if (nPEread % PEbatch == 0) { 72 | cerr << "time: " << time(nullptr) - time1 << " sec. 
" << nPEread << " PE reads found \ 73 | with " << readDB.size() << " reads in container " << endl; 74 | time1 = time(nullptr); 75 | } 76 | readDB.erase(title); 77 | } 78 | } 79 | else { 80 | readDB[title] = read; 81 | } 82 | nread++; 83 | if (nread % batch == 0) { 84 | cerr << nread << " reads processed" << endl; 85 | } 86 | } 87 | cerr << readDB.size() << " unpaired reads discarded" << endl; 88 | return 0; 89 | } 90 | -------------------------------------------------------------------------------- /src/binaryKmerIO.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BINARY_KMER_IO_HPP_ 2 | #define BINARY_KMER_IO_HPP_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | using std::string; 17 | using std::vector; 18 | using std::cout; 19 | using std::cerr; 20 | using std::endl; 21 | using std::ofstream; 22 | using std::ifstream; 23 | using std::ios; 24 | using std::unordered_set; 25 | using std::unordered_map; 26 | 27 | typedef vector> kset_db_t; 28 | typedef unordered_map kmerIndex_uint32_umap; 29 | typedef vector> bait_fps_db_t; 30 | 31 | template 32 | void flattenKmapDB(S& kmdb, uint64_t& nloci, uint64_t& nk, vector& index, vector& ks, vector& vs, int th=0) { 33 | nloci = kmdb.size(); 34 | index.resize(nloci); 35 | int nskip0 = 0; 36 | for (int tri = 0; tri < nloci; ++tri) { 37 | auto& kmap = kmdb[tri]; 38 | int nskip = 0; 39 | for (auto& p : kmap) { 40 | if (p.second >= th) { 41 | ks.push_back(p.first); 42 | vs.push_back(p.second); 43 | } 44 | else { ++nskip; } 45 | } 46 | index[tri] = kmap.size() - nskip; 47 | nskip0 += nskip; 48 | } 49 | cerr << "# kmer skipped: " << nskip0 << endl; 50 | nk = ks.size(); 51 | } 52 | 53 | template 54 | void serializeKmapDB(string tp, string pref, uint64_t& nloci, uint64_t& nk, vector& index, vector& ks, vector& vs) { 55 | string fn = pref + "." + tp + ".kmdb"; 56 | cerr << "serializing " << fn << endl; 57 | clock_t t = clock(); 58 | uint64_t sizeofval = sizeof(T); 59 | ofstream fout(fn, ios::binary); 60 | assert(fout); 61 | fout.write(reinterpret_cast( &nloci ), sizeof(uint64_t)); 62 | fout.write(reinterpret_cast( index.data() ), sizeof(uint64_t)*nloci); 63 | fout.write(reinterpret_cast( &nk ), sizeof(uint64_t)); 64 | fout.write(reinterpret_cast( &sizeofval ), sizeof(uint64_t)); 65 | fout.write(reinterpret_cast( ks.data() ), sizeof(uint64_t)*nk); 66 | fout.write(reinterpret_cast( vs.data() ), sizeofval*nk); 67 | cerr << fn << " written in " << (float)(clock()-t) / CLOCKS_PER_SEC << " sec" << endl; 68 | } 69 | 70 | template 71 | void deserializeKmapDB(string tp, string pref, uint64_t& nloci, uint64_t& nk, vector& index, vector& ks, vector& vs, S& kmdb) { 72 | string fn = pref + "." 
+ tp + ".kmdb"; 73 | cerr << "deserializing "<< fn << endl; 74 | clock_t t = clock(); 75 | uint64_t sizeofval; 76 | ifstream fin(fn, ios::binary); 77 | assert(fin); 78 | fin.read((char*)( &nloci ), sizeof(uint64_t)); 79 | index.resize(nloci); 80 | fin.read((char*)( index.data() ), sizeof(uint64_t)*nloci); 81 | fin.read((char*)( &nk ), sizeof(uint64_t)); 82 | fin.read((char*)( &sizeofval ), sizeof(uint64_t)); 83 | ks.resize(nk); 84 | vs.resize(nk); 85 | fin.read((char*)( ks.data() ), sizeof(uint64_t)*nk); 86 | fin.read((char*)( vs.data() ), sizeofval*nk); 87 | cerr << fn << " read in " << (float)(clock()-t) / CLOCKS_PER_SEC << " sec" << endl; 88 | 89 | kmdb.resize(nloci); 90 | int ki = 0; 91 | for (int tri = 0; tri < nloci; ++tri) { 92 | int ei = index[tri]; 93 | for (int i = 0; i < ei; ++i, ++ki) { 94 | kmdb[tri][ks[ki]] = vs[ki]; 95 | } 96 | } 97 | cerr << fn << " read+reconstructed in " << (float)(clock()-t) / CLOCKS_PER_SEC << " sec" << endl; 98 | } 99 | 100 | template 101 | void validateKmapDB(T& kmdb, T& kmdb_) { 102 | cerr << "validating data" << endl; 103 | int nloci = kmdb.size(); 104 | for (int tri = 0; tri < nloci; ++tri) { 105 | assert(kmdb[tri].size() == kmdb_[tri].size()); 106 | auto& kmap_ = kmdb_[tri]; 107 | for (auto& p : kmdb[tri]) { 108 | auto it = kmap_.find(p.first); 109 | assert(it != kmap_.end()); 110 | assert(p.second == it->second); 111 | } 112 | } 113 | cerr << "done" << endl; 114 | } 115 | 116 | void flattenKsetDB(kset_db_t& ksdb, uint64_t& nloci, uint64_t& nk, vector& index, vector& ks) { 117 | nloci = ksdb.size(); 118 | index.resize(nloci); 119 | for (int tri = 0; tri < nloci; ++tri) { 120 | index[tri] = ksdb[tri].size(); 121 | for (auto km : ksdb[tri]) { 122 | ks.push_back(km); 123 | } 124 | } 125 | nk = ks.size(); 126 | } 127 | 128 | void serializeKsetDB(string tp, string pref, uint64_t& nloci, uint64_t& nk, vector& index, vector& ks) { 129 | string fn = pref + "." + tp + ".kdb"; 130 | cerr << "serializing " << fn << endl; 131 | clock_t t = clock(); 132 | ofstream fout(fn, ios::binary); 133 | assert(fout); 134 | fout.write(reinterpret_cast( &nloci ), sizeof(uint64_t)); 135 | fout.write(reinterpret_cast( index.data() ), sizeof(uint64_t)*nloci); 136 | fout.write(reinterpret_cast( &nk ), sizeof(uint64_t)); 137 | fout.write(reinterpret_cast( ks.data() ), sizeof(uint64_t)*nk); 138 | cerr << fn << " written in " << (float)(clock()-t) / CLOCKS_PER_SEC << " sec" << endl; 139 | } 140 | 141 | void deserializeKsetDB(string tp, string pref, uint64_t& nloci, uint64_t& nk, vector& index, vector& ks, kset_db_t& ksdb) { 142 | string fn = pref + "." 
+ tp + ".kdb"; 143 | cerr << "deserializing " << fn << endl; 144 | clock_t t = clock(); 145 | ifstream fin(fn, ios::binary); 146 | assert(fin); 147 | fin.read((char*)( &nloci ), sizeof(uint64_t)); 148 | index.resize(nloci); 149 | fin.read((char*)( index.data() ), sizeof(uint64_t)*nloci); 150 | fin.read((char*)( &nk ), sizeof(uint64_t)); 151 | ks.resize(nk); 152 | fin.read((char*)( ks.data() ), sizeof(uint64_t)*nk); 153 | cerr << fn << " read in " << (float)(clock()-t) / CLOCKS_PER_SEC << " sec" << endl; 154 | 155 | ksdb.resize(nloci); 156 | int ki = 0; 157 | for (int tri = 0; tri < nloci; ++tri) { 158 | int ei = index[tri]; 159 | for (int i = 0; i < ei; ++i, ++ki) { 160 | ksdb[tri].insert(ks[ki]); 161 | } 162 | } 163 | cerr << fn << " read+reconstructed in " << (float)(clock()-t) / CLOCKS_PER_SEC << " sec" << endl; 164 | } 165 | 166 | void validateKsetDB(kset_db_t& ksdb, kset_db_t& ksdb_) { 167 | cerr << "validating data" << endl; 168 | int nloci = ksdb.size(); 169 | for (int tri = 0; tri < nloci; ++tri) { 170 | assert(ksdb[tri].size() == ksdb_[tri].size()); 171 | auto& ks_ = ksdb_[tri]; 172 | for (auto km : ksdb[tri]) { 173 | assert(ks_.count(km)); 174 | } 175 | } 176 | cerr << "done" << endl; 177 | } 178 | 179 | void serializeKarray(string tp, string pref, uint64_t& nk, vector& ks) { 180 | string fn = pref + "." + tp + ".ar"; 181 | cerr << "serializing " << fn << endl; 182 | clock_t t = clock(); 183 | ofstream fout(fn, ios::binary); 184 | assert(fout); 185 | fout.write(reinterpret_cast( &nk ), sizeof(uint64_t)); 186 | fout.write(reinterpret_cast( ks.data() ), sizeof(uint64_t)*nk); 187 | cerr << fn << " written in " << (float)(clock()-t) / CLOCKS_PER_SEC << " sec" << endl; 188 | } 189 | 190 | void deserializeKarray(string tp, string pref, uint64_t& nk, vector& ks) { 191 | string fn = pref + "." + tp + ".ar"; 192 | cerr << "deserializing " << fn << endl; 193 | clock_t t = clock(); 194 | ifstream fin(fn, ios::binary); 195 | assert(fin); 196 | fin.read((char*)( &nk ), sizeof(uint64_t)); 197 | ks.resize(nk); 198 | fin.read((char*)( ks.data() ), sizeof(uint64_t)*nk); 199 | cerr << fn << " read in " << (float)(clock()-t) / CLOCKS_PER_SEC << " sec" << endl; 200 | } 201 | 202 | //void deserializeKarray(string tp, string pref, uint64_t& nloci, uint64_t& nk, vector& index, vector& ks) { 203 | // string fn = pref + "." 
+ tp + ".kdb"; 204 | // cerr << "deserializing " << fn << endl; 205 | // clock_t t = clock(); 206 | // ifstream fin(fn, ios::binary); 207 | // assert(fin); 208 | // fin.read((char*)( &nloci ), sizeof(uint64_t)); 209 | // index.resize(nloci); 210 | // fin.read((char*)( index.data() ), sizeof(uint64_t)*nloci); 211 | // fin.read((char*)( &nk ), sizeof(uint64_t)); 212 | // ks.resize(nk); 213 | // fin.read((char*)( ks.data() ), sizeof(uint64_t)*nk); 214 | // cerr << fn << " read in " << (float)(clock()-t) / CLOCKS_PER_SEC << " sec" << endl; 215 | //} 216 | 217 | void validateKarray(vector& ks, vector& ks_) { 218 | assert(ks.size() == ks_.size()); 219 | for (int i = 0; i < ks.size(); ++i) { 220 | assert(ks[i] == ks_[i]); 221 | } 222 | } 223 | 224 | #endif 225 | -------------------------------------------------------------------------------- /src/fa2kmers.cpp: -------------------------------------------------------------------------------- 1 | #include "aQueryFasta_thread.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | size_t ksize; 16 | 17 | 18 | void removeNodeFromGraph(size_t node, GraphType& graph) { // XXX test edge pruning 19 | static const size_t MASK = (1UL << (2*ksize)) - 1 - 3; // keep k-1 prefix 20 | static const size_t PREF = 1UL << (2*(ksize-1)); 21 | 22 | if (graph.count(node)) { // if the kmer is in the graph 23 | graph.erase(node); // remove the node 24 | 25 | uint8_t nucmask = 0xff - (1 << (node % 4)); 26 | size_t km1mer = (node & MASK) >> 2; 27 | for (size_t nuc = 0; nuc < 4; ++nuc) { // check all possible upstream nodes 28 | size_t prevkmer = nuc * PREF + km1mer; 29 | if (graph.count(prevkmer)) { graph[prevkmer] &= nucmask; } // remove the edge that points from the upstream node 30 | } 31 | } 32 | } 33 | 34 | 35 | int main(int argc, const char * argv[]) { 36 | 37 | if (argc < 2) { 38 | cerr << endl 39 | << "Usage: fa2kmers [-tr] [-th] [-g] [-p] [-m] [-h] -k -fsi -fso <-o|-on> -fa \n" 40 | << " -tr Output TR regions only, skip flanks\n" 41 | << " -th Filter out kmers w/ count below this threshold. Default: 0, i.e. 
no filtering\n" 42 | << " -g output *graph.kmers for threading-based kmer query.\n" 43 | << " -p Prune tr/graph kmers with the given kmers file.\n" 44 | << " -m Use orthology map to merge haps.\n" 45 | << " -h Write human readable outputs *.kmers instead of *.kmdb\n" 46 | << " Will turn on automatically if using -on\n" 47 | << " -k Kmer size\n" 48 | << " -fsi Length of input flanking sequence in *.tr.fasta.\n" 49 | << " -fso Length of output flanking sequence to be included in *.fl.kmers.\n" 50 | << " -o Output file prefix\n" 51 | << " -on Same as the -o option, but write locus and kmer name as well\n" 52 | << " -fa Use specified *.fasta in the \n" 53 | << " Count the first files and build kmers for the rest\n\n"; 54 | return 0; 55 | } 56 | vector args(argv, argv+argc); 57 | bool genGraph=false, prune=false, usemap=false, writeKmerName=false, TRonly=false, readable=false; 58 | size_t argi = 1, threshold = 0, nhap = 0, fso, fsi, nfile2count, nloci; 59 | string pruneFname, outPref, mapf; 60 | vector infnames; 61 | 62 | while (argi < argc) { 63 | if (args[argi] == "-th") { threshold = stoi(args[++argi]); } 64 | else if (args[argi] == "-g") { genGraph = true; } 65 | else if (args[argi] == "-p") { 66 | prune = true; 67 | pruneFname = args[++argi]; 68 | ifstream tmp(pruneFname); 69 | assert(tmp); 70 | tmp.close(); 71 | } 72 | else if (args[argi] == "-m") { 73 | usemap = true; 74 | mapf = args[++argi]; 75 | } 76 | else if (args[argi] == "-h") { readable = true; } 77 | else if (args[argi] == "-k") { ksize = stoi(args[++argi]); } 78 | else if (args[argi] == "-fsi") { fsi = stoi(args[++argi]); } 79 | else if (args[argi] == "-fso") { 80 | fso = stoi(args[++argi]); 81 | assert(fsi >= fso); 82 | } 83 | else if (args[argi] == "-o" or args[argi] == "-on") { 84 | writeKmerName = args[argi] == "-on"; 85 | if (writeKmerName) { readable = true; } 86 | outPref = args[++argi]; 87 | ofstream outf(outPref+".tr.kmers"); 88 | assert(outf); 89 | outf.close(); 90 | } 91 | else if (args[argi] == "-fa") { 92 | nfile2count = stoi(args[++argi]); 93 | while (++argi < argc) { 94 | ++nhap; 95 | infnames.push_back(args[argi]); 96 | ifstream inf(args[argi]); 97 | if (not inf) { cerr << args[argi] << endl; } 98 | assert(inf); 99 | inf.close(); 100 | } 101 | nhap = infnames.size(); 102 | if (not usemap) { 103 | cerr << "Not using orthology map, assuming all fasta files have the same number of loci\n" 104 | << "Total number of loci in " << infnames[0] << ": "; 105 | nloci = countLoci(infnames[0]); 106 | cerr << nloci << endl; 107 | } 108 | } 109 | else if (args[argi] == "-tr") { TRonly = true; } 110 | else { 111 | cerr << "invalid option" << endl; 112 | return 1; 113 | } 114 | ++argi; 115 | } 116 | vector> omap; 117 | if (usemap) { 118 | readOrthoMap(mapf, omap, nhap); 119 | nloci = omap.size(); 120 | cerr << "Using orthology map, total number of loci: " << nloci << endl; 121 | } 122 | 123 | 124 | // ----- 125 | // open each file and create a kmer database for each loci 126 | // combine the kmer databases of the same loci across different files 127 | // ----- 128 | vector TRkmersDB(nloci); 129 | vector FLkmersDB(nloci); 130 | vector graphDB(nloci); 131 | for (size_t n = 0; n < nhap; ++n) { 132 | bool count = n < nfile2count; 133 | size_t locus = 0; 134 | 135 | string read, line; 136 | ifstream fin(infnames[n]); 137 | assert(fin.is_open()); 138 | 139 | cerr << "building and counting " << infnames[n] << " kmers\n"; 140 | while (getline(fin, line)) { 141 | if (line[0] != '>') { 142 | read += line; 143 | } 144 | if (fin.peek() == 
'>' or fin.peek() == EOF) { 145 | if (read != "") { 146 | if (usemap) { while (not omap[locus][n]) { ++locus; } } 147 | 148 | size_t tr_l = fsi; 149 | size_t tr_r = fsi; 150 | size_t lFL_l = fsi - fso; 151 | size_t lFL_r = read.size() - fsi - (ksize-1); // seamless contenuation of kmers from FL to tr 152 | size_t rFL_l = read.size() - fsi - (ksize-1); // seamless contenuation of kmers from tr to FL 153 | size_t rFL_r = fsi - fso; 154 | 155 | buildNuKmers(TRkmersDB[locus], read, ksize, tr_l, tr_r, count); // (begin_pos, right_flank_size) 156 | if (not TRonly) { 157 | buildNuKmers(FLkmersDB[locus], read, ksize, lFL_l, lFL_r, count); 158 | buildNuKmers(FLkmersDB[locus], read, ksize, rFL_l, rFL_r, count); 159 | if (genGraph) { buildKmerGraph(graphDB[locus], read, ksize); } // no self loop 160 | } 161 | } 162 | read = ""; 163 | ++locus; 164 | } 165 | } 166 | fin.close(); 167 | } 168 | 169 | if (prune) { // obsolete 170 | cerr << "pruning unsupported kmers with " << pruneFname << endl; 171 | 172 | vector prunedkmersDB(nloci); 173 | readKmersFile2DB(prunedkmersDB, pruneFname); 174 | 175 | for (size_t locus = 0; locus < nloci; ++locus) { 176 | auto& TRkmers = TRkmersDB[locus]; 177 | auto& prunedkmers = prunedkmersDB[locus]; 178 | for (auto& p : prunedkmers) { TRkmers.erase(p.first); } 179 | 180 | if (genGraph) { 181 | auto& graph = graphDB[locus]; 182 | for (auto& p : prunedkmers) { 183 | removeNodeFromGraph(p.first, graph); 184 | removeNodeFromGraph(getNuRC(p.first, ksize), graph); 185 | } 186 | } 187 | } 188 | } 189 | 190 | 191 | // ----- 192 | // write kmers files for all kmer databases 193 | // ----- 194 | cerr << "writing outputs" << endl; 195 | if (writeKmerName) { 196 | writeKmersWithName(outPref + ".tr", TRkmersDB, threshold); 197 | if (not TRonly) { 198 | writeKmersWithName(outPref + ".fl", FLkmersDB, threshold); 199 | if (genGraph) { writeKmersWithName(outPref + ".graph", graphDB); } 200 | } 201 | } 202 | else { 203 | if (readable) { 204 | writeKmers(outPref + ".tr", TRkmersDB, threshold); 205 | if (not TRonly) { 206 | writeKmers(outPref + ".fl", FLkmersDB, threshold); 207 | if (genGraph) { writeKmers(outPref + ".graph", graphDB); } 208 | } 209 | } 210 | else { 211 | dumpKmerMapDB("tr", outPref, TRkmersDB, threshold); 212 | if (not TRonly) { 213 | dumpKmerMapDB("fl", outPref, FLkmersDB, threshold); 214 | if (genGraph) { dumpKmerMapDB("graph", outPref, graphDB); } 215 | } 216 | 217 | } 218 | } 219 | 220 | return 0; 221 | } 222 | -------------------------------------------------------------------------------- /src/genPanKmers.cpp: -------------------------------------------------------------------------------- 1 | #include "aQueryFasta_thread.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | 16 | // skip content in fin until target locus and merge content in target locus with the content in graph 17 | void readGraphLocus(GraphType& graph, ifstream& fin, size_t& current, size_t target) { 18 | string line; 19 | while (current < target) { 20 | getline(fin, line); 21 | if (line[0] == '>') { ++current; } 22 | } 23 | 24 | while (current == target) { 25 | if (fin.peek() == '>' or fin.peek() == EOF) { 26 | getline(fin, line); 27 | ++current; 28 | return; 29 | } 30 | else { 31 | getline(fin, line, '\t'); 32 | size_t kmer = stoul(line); 33 | getline(fin, line, '\n'); 34 | uint8_t nucBits = stoul(line); 35 | graph[kmer] |= nucBits; 36 | } 37 | } 38 | } 39 | 40 | void 
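// Text layout of the *.kmers files parsed by readGraphLocus (above) and readKmersLocus (below):
// a line starting with '>' (typically '>' followed by the locus index) opens the next locus record,
// and each following line is "<kmer><TAB><value>", where <value> holds the outgoing-edge bits for
// *.graph.kmers and a count for the other *.kmers files (readKmersLocus discards it and stores 0).
// A sketch of a two-locus file with made-up numbers:
//   >0
//   123456789<TAB>3
//   987654321<TAB>1
//   >1
//   135792468<TAB>2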
readKmersLocus(kmerCount_umap& kmers, ifstream& fin, size_t& current, size_t target) { 41 | string line; 42 | while (current < target) { 43 | getline(fin, line); 44 | if (line[0] == '>') { ++current; } 45 | } 46 | 47 | while (current == target) { 48 | if (fin.peek() == '>' or fin.peek() == EOF) { 49 | getline(fin, line); 50 | ++current; 51 | return; 52 | } 53 | else { 54 | getline(fin, line, '\t'); 55 | size_t kmer = stoul(line); 56 | getline(fin, line, '\n'); 57 | kmers[kmer] = 0; 58 | } 59 | } 60 | } 61 | 62 | void getgmap(vector>& omap, vector& gmap, vector his) { 63 | for (size_t i = 0; i < gmap.size(); ++i) { 64 | bool good = false; 65 | for (size_t hi : his) { 66 | good |= omap[i][hi]; 67 | } 68 | gmap[i] = good; 69 | } 70 | } 71 | 72 | int main(int argc, const char * argv[]) { 73 | 74 | if (argc < 2) { 75 | cerr << "Usage: genPanKmers [-tr] -o -m -k \n" 76 | << " -tr process *.tr.kmers only, skipping tre, fl and graph\n" 77 | << " -tre process *.tre.kmers only, skipping tr, fl and graph\n" 78 | << " -m if is '-', the program assumes no missing loci\n" 79 | << " full path name for is required in any case\n" 80 | << " -k requires PREFIX.TYPE.kmers\n" 81 | << " TYPE = tr, tre, fl or graph\n" 82 | << "mapping file format:\n" 83 | << " N columns; each column is a genome; order should be the same as specified in -k\n" 84 | << " M rows; each row is a locus in the pan-genome (pan locus)\n" 85 | << " NUMBER is the ordering of the pan locus in that genome\n" 86 | << " '.' means the pan locus is missing in that genome\n\n"; 87 | return 0; 88 | } 89 | 90 | vector args(argv, argv+argc); 91 | bool nomissing = false, TRonly = false, TREonly = false; 92 | size_t argi = 1, ngenome, nloci; 93 | string indir, outpref, mapfname; 94 | vector kmerpref; 95 | 96 | while (argi < argc) { 97 | if (args[argi] == "-o") { outpref = args[++argi]; } 98 | else if (args[argi] == "-m") { 99 | mapfname = args[++argi]; 100 | nomissing = (mapfname == "-"); 101 | } 102 | else if (args[argi] == "-k") { 103 | kmerpref.resize(argc-argi); 104 | kmerpref.assign(argv+argi+1, argv+argc); 105 | ngenome = kmerpref.size(); 106 | break; 107 | } 108 | else if (args[argi] == "-tr") { TRonly = true; } 109 | else if (args[argi] == "-tre") { TREonly = true; } 110 | else { 111 | cerr << "Error: invalid option " << args[argi] << '\n'; 112 | return 1; 113 | } 114 | ++argi; 115 | } 116 | 117 | vector> omap; 118 | if (not nomissing){ 119 | readOrthoMap(mapfname, omap, 2*ngenome); 120 | nloci = omap.size(); 121 | } else { 122 | nloci = countLoci(kmerpref[0]+".tr.kmers"); 123 | } 124 | cerr << "# loci in pangenome: " << nloci << endl 125 | << ngenome << " genomes to merge" << endl; 126 | 127 | vector filetypes = {"tr", "fl", "graph", "tre"}; 128 | for (string& filetype : filetypes) { 129 | if (TRonly and filetype != "tr") { continue; } 130 | if (TREonly and filetype != "tre") { continue; } 131 | cerr << "merging " << filetype << ".kmers" << endl; 132 | 133 | bool graphmode = (filetype == "graph"); 134 | vector kmersDB; 135 | vector graphDB; 136 | if (graphmode) { graphDB.resize(nloci); } 137 | else { kmersDB.resize(nloci); } 138 | 139 | for (size_t gi = 0; gi < ngenome; ++gi) { 140 | vector gmap(nloci, 1); 141 | if (not nomissing) { 142 | getgmap(omap, gmap, vector{2*gi,2*gi+1}); 143 | if (graphmode) { 144 | mapKmersFile2DB(graphDB, kmerpref[gi]+"."+filetype+".kmers", gmap, true); 145 | } else { 146 | mapKmersFile2DB(kmersDB, kmerpref[gi]+"."+filetype+".kmers", gmap); 147 | } 148 | } 149 | else { 150 | if (graphmode) { 151 |
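// With no orthology map ('-m -'), every per-genome <pref>.<filetype>.kmers file is assumed to list
// all nloci pan loci in the same order, so readKmersFile2DB below simply accumulates each record
// into graphDB/kmersDB by locus index; with a map, the mapKmersFile2DB calls above consult gmap
// (built by getgmap from this genome's two haplotype columns) to skip pan loci absent from it.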
readKmersFile2DB(graphDB, kmerpref[gi]+"."+filetype+".kmers", true); 152 | } else { 153 | readKmersFile2DB(kmersDB, kmerpref[gi]+"."+filetype+".kmers"); 154 | } 155 | } 156 | } 157 | 158 | cerr << "writing " << filetype << ".kmers" << endl; 159 | if (graphmode) { 160 | writeKmersWithName(outpref+"."+filetype, graphDB); 161 | } else { 162 | writeKmersWithName(outpref+"."+filetype, kmersDB); 163 | } 164 | } 165 | 166 | 167 | 168 | return 0; 169 | } 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /src/kmer.hpp: -------------------------------------------------------------------------------- 1 | #ifndef KMER_HPP_ 2 | #define KMER_HPP_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using std::vector; 12 | using std::string; 13 | using std::find; 14 | using std::ifstream; 15 | using std::ofstream; 16 | using std::cerr; 17 | using std::endl; 18 | 19 | 20 | const unsigned char alphabet[] = {'A', 'C', 'G', 'T'}; 21 | 22 | const unsigned char baseNumConversion[] = { 23 | 'A','C','G','T',127,127,127,127, 24 | 127,127,127,127,127,127,127,127, 25 | 127,127,127,127,127,127,127,127, 26 | 127,127,127,127,127,127,127,127, 27 | 127,127,127,127,127,127,127,127, 28 | 127,127,127,127,127,127,127,127, 29 | 127,127,127,127,127,127,127,127, 30 | 127,127,127,127,127,127,127,127, 31 | 127, 0 ,127, 1 ,127,127,127, 2 , 32 | 127,127,127,127,127,127,127,127, 33 | 127,127,127,127, 3 ,127,127,127, 34 | 127,127,127,127,127,127,127,127, 35 | 127, 0 ,127, 1 ,127,127,127, 2 , 36 | 127,127,127,127,127,127,127,127, 37 | 127,127,127,127, 3 ,127,127,127, 38 | }; 39 | 40 | const unsigned char baseComplement[] = { 41 | 3, 2, 1, 0,127,127,127,127, 42 | 127,127,127,127,127,127,127,127, 43 | 127,127,127,127,127,127,127,127, 44 | 127,127,127,127,127,127,127,127, 45 | 127,127,127,127,127,127,127,127, 46 | 127,127,127,127,127,127,127,127, 47 | '3','2','1','0',127,127,127,127, 48 | 127,127,127,127,127,127,127,127, 49 | 127,'T',127,'G',127,127,127,'C', 50 | 127,'I',127,127,127,127,'N',127, 51 | 127,127,127,127,'A',127,127,127, 52 | 127,127,127,127,127,127,127,127, 53 | 127,'t',127,'g',127,127,127,'c', 54 | 127,127,127,127,127,127,'n',127, 55 | 127,127,127,127,'a',127,127,127, 56 | }; 57 | 58 | const unsigned char byteRC[] = { 59 | 255, 191, 127, 63, 239, 175, 111, 47, 223, 159, 60 | 95, 31, 207, 143, 79, 15, 251, 187, 123, 59, 61 | 235, 171, 107, 43, 219, 155, 91, 27, 203, 139, 62 | 75, 11, 247, 183, 119, 55, 231, 167, 103, 39, 63 | 215, 151, 87, 23, 199, 135, 71, 7, 243, 179, 64 | 115, 51, 227, 163, 99, 35, 211, 147, 83, 19, 65 | 195, 131, 67, 3, 254, 190, 126, 62, 238, 174, 66 | 110, 46, 222, 158, 94, 30, 206, 142, 78, 14, 67 | 250, 186, 122, 58, 234, 170, 106, 42, 218, 154, 68 | 90, 26, 202, 138, 74, 10, 246, 182, 118, 54, 69 | 230, 166, 102, 38, 214, 150, 86, 22, 198, 134, 70 | 70, 6, 242, 178, 114, 50, 226, 162, 98, 34, 71 | 210, 146, 82, 18, 194, 130, 66, 2, 253, 189, 72 | 125, 61, 237, 173, 109, 45, 221, 157, 93, 29, 73 | 205, 141, 77, 13, 249, 185, 121, 57, 233, 169, 74 | 105, 41, 217, 153, 89, 25, 201, 137, 73, 9, 75 | 245, 181, 117, 53, 229, 165, 101, 37, 213, 149, 76 | 85, 21, 197, 133, 69, 5, 241, 177, 113, 49, 77 | 225, 161, 97, 33, 209, 145, 81, 17, 193, 129, 78 | 65, 1, 252, 188, 124, 60, 236, 172, 108, 44, 79 | 220, 156, 92, 28, 204, 140, 76, 12, 248, 184, 80 | 120, 56, 232, 168, 104, 40, 216, 152, 88, 24, 81 | 200, 136, 72, 8, 244, 180, 116, 52, 228, 164, 82 | 100, 36, 212, 148, 84, 20, 196, 132, 68, 4, 83 | 240, 
176, 112, 48, 224, 160, 96, 32, 208, 144, 84 | 80, 16, 192, 128, 64, 0}; 85 | 86 | 87 | size_t encodeSeq(string& seq, size_t start, size_t k) { // no extra copy 88 | size_t numericSeq = 0; 89 | for (size_t i = start; i < start+k; ++i) { 90 | numericSeq = (numericSeq<<2) + baseNumConversion[static_cast(seq[i])]; 91 | } 92 | return numericSeq; 93 | } 94 | 95 | size_t getNextKmer(size_t& kmer, size_t beg, string& read, size_t k) { 96 | size_t rlen = read.size(); 97 | if (beg + k > rlen) { 98 | return rlen; 99 | } 100 | size_t validlen = 0; 101 | while (validlen != k) { 102 | if (beg + k > rlen) { 103 | return rlen; 104 | } 105 | if (find(alphabet, alphabet+4, read[beg + validlen]) == alphabet+4) { 106 | beg = beg + validlen + 1; 107 | validlen = 0; 108 | } else { 109 | validlen += 1; 110 | } 111 | } 112 | kmer = encodeSeq(read, beg, k); 113 | return beg; 114 | } 115 | 116 | size_t getNuRC(size_t num, size_t k) { 117 | size_t num_rc = 0; 118 | while (k >= 4) { // convert a full byte 119 | num_rc <<= 8; 120 | num_rc += byteRC[num & 0xff]; 121 | num >>= 8; 122 | k -= 4; 123 | } 124 | if (k > 0) { // convert remaining bits 125 | num_rc <<= (k<<1); // was num_rc <<= (k*2); 126 | num_rc += (byteRC[num] >> ((4-k)<<1)); // was num_rc += (byteRC[num] >> ((4-k)*2)); 127 | } 128 | return num_rc; 129 | } 130 | 131 | void read2kmers(vector& kmers, string& read, size_t k, size_t leftflank = 0, size_t rightflank = 0, bool canonical = true, bool keepN = false) { 132 | const size_t rlen = read.size(); 133 | const size_t mask = (1ULL << 2*(k-1)) - 1; 134 | size_t beg, nbeg, canonicalkmer, kmer, rckmer; 135 | 136 | beg = getNextKmer(kmer, leftflank, read, k); 137 | if (beg == rlen) { return; } 138 | if (keepN) { kmers.resize(rlen-k+1, -1); } 139 | rckmer = getNuRC(kmer, k); 140 | 141 | for (size_t i = beg; i < rlen - k - rightflank + 1; ++i) { 142 | canonicalkmer = (kmer > rckmer ? rckmer : kmer); 143 | if (keepN) { kmers[i] = canonical ? canonicalkmer : kmer; } 144 | else { kmers.push_back(canonical ? 
canonicalkmer : kmer); } 145 | 146 | if (std::find(alphabet, alphabet+4, read[i + k]) == alphabet+4) { // XXX speedup 147 | nbeg = getNextKmer(kmer, i+k+1, read, k); 148 | if (nbeg == rlen) { return; } 149 | rckmer = getNuRC(kmer, k); 150 | i = nbeg - 1; 151 | } else { 152 | kmer = ( (kmer & mask) << 2 ) + baseNumConversion[static_cast(read[i + k])]; 153 | rckmer = (rckmer >> 2) + ( (baseNumConversion[baseComplement[static_cast(read[i + k])]] & mask) << (2*(k-1))); 154 | } 155 | } 156 | } 157 | 158 | //template 159 | //void readKmerSet(T& kmerDB, string fname) { // vector 160 | // ifstream f(fname); 161 | // assert(f); 162 | // cerr <<"reading kmers from " << fname << endl; 163 | // string line; 164 | // size_t idx; 165 | // while (getline(f, line)) { 166 | // if (line[0] == '>') { idx = stoul(line.substr(1)); } 167 | // else { kmerDB[idx].insert(stoul(line)); } 168 | // } 169 | // f.close(); 170 | //} 171 | 172 | template 173 | void buildNuKmers(T& kmers, string& read, size_t k, size_t leftflank = 0, size_t rightflank = 0, bool count = true) { 174 | size_t rlen = read.size(); 175 | size_t mask = (1UL << 2*(k-1)) - 1; 176 | size_t beg, nbeg, canonicalkmer, kmer, rckmer; 177 | 178 | beg = getNextKmer(kmer, leftflank, read, k); 179 | if (beg == rlen) { return; } 180 | rckmer = getNuRC(kmer, k); 181 | 182 | for (size_t i = beg; i < rlen - k - rightflank + 1; ++i) { 183 | if (kmer > rckmer) { 184 | canonicalkmer = rckmer; 185 | } else { 186 | canonicalkmer = kmer; 187 | } 188 | kmers[canonicalkmer] += (1 & count); 189 | 190 | if (find(alphabet, alphabet+4, read[i + k]) == alphabet+4) { 191 | nbeg = getNextKmer(kmer, i+k+1, read, k); 192 | if (nbeg == rlen) { return; } 193 | rckmer = getNuRC(kmer, k); 194 | i = nbeg - 1; 195 | } else { 196 | kmer = ( (kmer & mask) << 2 ) + baseNumConversion[static_cast(read[i + k])]; 197 | rckmer = (rckmer >> 2) + ( (baseNumConversion[baseComplement[static_cast(read[i + k])]] & mask) << (2*(k-1))); 198 | } 199 | } 200 | } 201 | 202 | #endif 203 | -------------------------------------------------------------------------------- /src/kmerIO.hpp: -------------------------------------------------------------------------------- 1 | #ifndef KMER_IO_HPP_ 2 | #define KMER_IO_HPP_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | using std::string; 17 | using std::vector; 18 | using std::cout; 19 | using std::cerr; 20 | using std::endl; 21 | using std::ofstream; 22 | using std::ifstream; 23 | using std::stringstream; 24 | using std::ios; 25 | using std::unordered_set; 26 | using std::unordered_map; 27 | 28 | typedef vector> kset_db_t; 29 | typedef unordered_map kmerIndex_uint32_umap; 30 | typedef vector> bait_fps_db_t; 31 | 32 | 33 | size_t countLoci(string fname) { 34 | ifstream inf(fname); 35 | assert(inf); 36 | string line; 37 | size_t nloci = 0; 38 | while (getline(inf, line)) { 39 | if (line[0] == '>') { 40 | ++nloci; 41 | } 42 | } 43 | inf.close(); 44 | return nloci; 45 | } 46 | 47 | void readKmerIndex(kmerIndex_uint32_umap& kmerDBi, vector>& kmerDBi_vec, string fname) { // optimized version 48 | ifstream f(fname); 49 | assert(f); 50 | cerr <<"reading kmers from " << fname << endl; 51 | uint32_t idx = -1; 52 | uint32_t vsize = kmerDBi_vec.size(); 53 | string line; 54 | while (getline(f, line)) { 55 | if (line[0] == '>') { ++idx; } 56 | else { 57 | size_t kmer = stoul(line); 58 | auto it = kmerDBi.find(kmer); 59 | if (it != kmerDBi.end()) { // kmer is not 
unique 60 | uint32_t vi = it->second; 61 | if (vi % 2) { // kmer freq. >= 2 in current db; kmerDBi_vec[(vi>>1)] records the list of mapped loci 62 | bool good = true; 63 | for (uint32_t x : kmerDBi_vec[vi>>1]) { if (x == idx) { good = false; break; } } 64 | if (good) { kmerDBi_vec[vi>>1].push_back(idx); } 65 | } 66 | else { // kmer freq. = 1 in current db; vi>>1 is the mapped locus. 67 | if ((vi >> 1) != idx) { 68 | kmerDBi_vec.push_back(vector{vi>>1, idx}); 69 | it->second = ((vsize++) << 1) + 1; 70 | } 71 | } 72 | } else { 73 | kmerDBi[kmer] = (idx << 1); 74 | } 75 | } 76 | } 77 | f.close(); 78 | } 79 | 80 | void readKmers_ksetDB(string fn, kset_db_t& ksdb) { 81 | cerr <<"reading kmers from " << fn << endl; 82 | ifstream fin(fn); 83 | assert(fin); 84 | string line; 85 | int tri = -1; 86 | while (getline(fin, line)) { 87 | if (line[0] == '>') { ++tri; } 88 | else { ksdb[tri].insert(stoull(line)); } // only converts the first field to ULL 89 | } 90 | } 91 | 92 | template 93 | void readFPSKmersV2(T& kmerDB, string fname) { 94 | size_t tri; 95 | string line; 96 | ifstream f; 97 | 98 | f.open(fname); 99 | assert(f); 100 | cerr << "reading kmers from " << fname << endl; 101 | while (getline(f, line)) { 102 | if (line[0] == '>') { tri = stoul(line.substr(1)); continue; } 103 | 104 | stringstream ss(line); 105 | size_t km, mi, ma; 106 | ss >> km >> mi >> ma; 107 | kmerDB[tri][km] = (mi<<8) + ma; 108 | } 109 | } 110 | 111 | void readQCFile(vector& out, string fn) { 112 | ifstream fin(fn); 113 | assert(fin); 114 | int ntr = out.size(); 115 | assert(ntr); 116 | fin.read((char*)( out.data() ), sizeof(uint8_t)*ntr); 117 | for (int tri = 0; tri < ntr; ++tri) { 118 | out[tri] -= 48; // ASCII code of `0` is 48 119 | } 120 | } 121 | 122 | #endif 123 | -------------------------------------------------------------------------------- /src/mapkmers.cpp: -------------------------------------------------------------------------------- 1 | #include "aQueryFasta_thread.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | using namespace std; 15 | 16 | 17 | 18 | 19 | int main(int argc, const char * argv[]) { 20 | 21 | if (argc < 2) { 22 | //cerr << "usage: program panbed pankmers kmers genome outpref\n"; 23 | cerr << "usage: program panbed target_field pankmers kmers outpref\n\n" 24 | 25 | << "target_field column index (0-indexed) to extract in panbed for locus mapping\n\n"; 26 | return 0; 27 | } 28 | 29 | vector args(argv, argv+argc); 30 | string panbedfname = args[1], pankmersfname = args[3], kmersfname = args[4], outpref = args[5]; 31 | size_t tf = stoi(args[2]); 32 | 33 | ifstream panbedin(panbedfname); 34 | assert(panbedin); 35 | 36 | size_t npanloci = countBedLoci(panbedfname); 37 | size_t nloci = countLoci(kmersfname); 38 | vector pankmersDB(npanloci); 39 | vector kmersDB(nloci); 40 | 41 | ifstream panfin(pankmersfname), fin(kmersfname); 42 | assert(panfin and fin); 43 | panfin.close(); 44 | fin.close(); 45 | 46 | readKmersFile2DB(pankmersDB, pankmersfname); 47 | readKmersFile2DB(kmersDB, kmersfname, 0, false); // start from 0th locus, do not count 48 | 49 | cerr << "mapping kmers" << endl; 50 | string line; 51 | size_t panlocus = 0, locus = 0; 52 | while (getline(panbedin, line)) { 53 | stringstream ss(line); 54 | string tmp; 55 | 56 | for (size_t ind = 0; ind < tf; ++ind) { ss >> tmp; } 57 | ss >> tmp; 58 | if (tmp != ".") { 59 | locus = stoul(tmp); 60 | assert(locus < nloci); 61 | kmerCount_umap& kmers = 
kmersDB[locus]; 62 | kmerCount_umap& pankmers = pankmersDB[panlocus]; 63 | for (auto& p : kmers) { 64 | kmers[p.first] = pankmers[p.first]; 65 | } 66 | } 67 | ++panlocus; 68 | } 69 | 70 | cerr << "writing kmers" << endl; 71 | writeKmers(outpref, kmersDB); 72 | 73 | return 0; 74 | } 75 | 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /src/pred.cpp: -------------------------------------------------------------------------------- 1 | #include "pred.h" 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | //using Eigen::MatrixXf; 11 | using Eigen::ArrayXXf; 12 | using Eigen::seq; 13 | using Eigen::seqN; 14 | 15 | int main(int argc, char* argv[]) { 16 | if (argc < 2) { 17 | cerr << endl 18 | << "Usage: danbing-tk-pred \n" 19 | << "INPUT1 metadata of *.trkmc.ar files, consisting of 2 columns.\n" 20 | << " col1 *.trkmc.ar file name\n" 21 | << " col2 read depth\n" 22 | << "INPUT2 invariant kmers of an RPGG build\n" 23 | << "OUTPUT1 raw genotype matrix. Row: sample. Column: kmer.\n" 24 | << "OUTPUT2 bias-corrected genotype matrix. Row: sample. Column: kmer.\n" 25 | << "OUTPUT3 bias matrix. Row: sample. Column: TR locus.\n" 26 | << "Developer mode:\n" 27 | << " -f Load GT matrix from file\n\n"; 28 | return 0; 29 | } 30 | vector args(argv, argv+argc); 31 | int argi = 1; 32 | string gtfn = "", finGtMeta, finIkMeta, foutRaw, fout, foutBias; 33 | while (args[argi][0] == '-') { 34 | if (args[argi] == "-f") { 35 | gtfn = args[++argi]; 36 | ++argi; 37 | } 38 | } 39 | finGtMeta = args[argi]; 40 | finIkMeta = args[argi+1]; 41 | foutRaw =args[argi+2] ; 42 | fout = args[argi+3]; 43 | foutBias = args[argi+4]; 44 | cout << "metadata of *.trkmc.ar: " << finGtMeta << endl; 45 | cout << "invariant kmers: " << finIkMeta << endl; 46 | cout << "raw genotype matrix will be written to: " << foutRaw << endl; 47 | cout << "bias-corrected genotype matrix will be written to: " << fout << endl; 48 | cout << "bias matrix will be written to: " << foutBias << endl; 49 | 50 | Eigen::IOFormat tsv_format(Eigen::StreamPrecision, Eigen::DontAlignCols, "\t", "\n", "", "", "", "", ' '); 51 | struct gt_meta gtm; 52 | read_gt_meta(finGtMeta, gtm); 53 | 54 | struct ikmer_meta ikmt; 55 | read_ikmer(finIkMeta, gtm.nk, gtm.n_tr, ikmt); 56 | 57 | //ArrayXXf gt(gtm.ns, gtm.nk); 58 | ArrayXXf gt(gtm.nk, gtm.ns); 59 | if (gtfn.size()) { // for testing load from bingt 60 | //load_binGTMat(gt, gtfn, KMC_BSIZE); 61 | //cout << gt(seqN(0,10),seqN(0,10)) << endl << endl; 62 | //save_matrix(foutRaw, gt); 63 | } else { 64 | //fill_gt(gt, gtm.fns); 65 | load_eachBinGT(gt, gtm); 66 | cout << gt(seqN(0,10),seqN(0,10)) << endl << endl; 67 | 68 | norm_rd(gt, gtm.rds); // gt transposed to (ns,nk) 69 | cout << gt(seqN(0,10),seqN(0,10)) << endl << endl; 70 | save_matrix(foutRaw, gt); 71 | } 72 | 73 | ArrayXXf Bias(gtm.ns, gtm.n_tr); 74 | bias_correction(gt, ikmt, Bias); 75 | cout << gt(seqN(0,10),seqN(0,10)) << endl << endl; 76 | cout << "Bias matrix:\n" 77 | << Bias(seqN(0,10),seqN(0,10)) << endl << endl; 78 | 79 | save_matrix(fout, gt); 80 | save_matrix(foutBias, Bias, tsv_format); 81 | 82 | return 0; 83 | } 84 | 85 | 86 | -------------------------------------------------------------------------------- /src/pred.h: -------------------------------------------------------------------------------- 1 | #ifndef PRED_H_ 2 | #define PRED_H_ 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | using 
std::string; 15 | using std::vector; 16 | using std::ifstream; 17 | using std::ofstream; 18 | using std::stringstream; 19 | using std::ios; 20 | using std::cout; 21 | using std::cerr; 22 | using std::endl; 23 | using std::flush; 24 | using std::stof; 25 | 26 | struct gt_meta { 27 | uint64_t ns; // number of samples 28 | uint64_t nk; // number of kmers 29 | uint64_t n_tr; // number of TR loci 30 | vector fns; // *.tr.kmers of each sample 31 | vector rds; // read depth of each sample 32 | }; 33 | 34 | struct ikmer_meta { 35 | vector nik; // cumulative count of ikmers until the i-th locus 36 | vector nk; // cumulative count of kmers until the i-th locus 37 | Eigen::ArrayXi iki; // indices of ikmers in the `nk` vector 38 | Eigen::ArrayXf ikmc; // count of ikmers in `iki` 39 | }; 40 | 41 | void read_gt_meta(string& fn, gt_meta& gtm) { 42 | ifstream fin(fn); 43 | string f1, f2; 44 | while (getline(fin, f1, '\t') and getline(fin, f2)) { 45 | gtm.fns.push_back(f1); 46 | gtm.rds.push_back(stof(f2)); 47 | } 48 | gtm.ns = gtm.fns.size(); 49 | } 50 | 51 | uint64_t le2uint64(char* in) { // bytes are widened to uint64_t before shifting; shifting a 32-bit int by >= 32 bits is undefined 52 | return (uint64_t)(in[0]&0x00000000000000ff) << 0 | 53 | (uint64_t)(in[1]&0x00000000000000ff) << 8 | 54 | (uint64_t)(in[2]&0x00000000000000ff) << 16 | 55 | (uint64_t)(in[3]&0x00000000000000ff) << 24 | 56 | (uint64_t)(in[4]&0x00000000000000ff) << 32 | 57 | (uint64_t)(in[5]&0x00000000000000ff) << 40 | 58 | (uint64_t)(in[6]&0x00000000000000ff) << 48 | 59 | (uint64_t)(in[7]&0x00000000000000ff) << 56; 60 | } 61 | 62 | uint32_t le2uint32(char* in) { 63 | return (in[0]&0x000000ff) << 0 | 64 | (in[1]&0x000000ff) << 8 | 65 | (in[2]&0x000000ff) << 16 | 66 | (in[3]&0x000000ff) << 24; 67 | } 68 | 69 | void read_ikmer(string& fn, uint64_t& n_kmer, uint64_t& n_tr, ikmer_meta& ikmt) { 70 | // check endianness 71 | /*uint32_t data = 1; 72 | char* ptr = (char*)&data; 73 | bool little_endian = *ptr == 1; 74 | */ 75 | 76 | ifstream fin(fn, ios::binary); 77 | assert(fin); 78 | uint64_t n_ikmer; 79 | uint32_t ki; 80 | uint8_t kc; 81 | /*if (little_endian == false) { 82 | fin.read((char*)&n_kmer, sizeof(n_kmer)); 83 | fin.read((char*)&n_ikmer, sizeof(n_ikmer)); 84 | cout << n_ikmer << '/' << n_kmer << " kmers are invariant" << endl; 85 | ikmc.resize(n_kmer); 86 | for (int iki = 0; iki < n_ikmer; ++iki) { 87 | fin.read((char*)&ki, sizeof(ki)); 88 | fin.read((char*)&kc, sizeof(kc)); 89 | ikmc[ki] = kc; 90 | } 91 | } 92 | */ 93 | // endian-agnostic, only requires that data is in little endian format 94 | char t64[8]; 95 | char t32[4]; 96 | fin.read(t64, sizeof(t64)); 97 | n_kmer = le2uint64(&t64[0]); 98 | fin.read(t64, sizeof(t64)); 99 | n_ikmer = le2uint64(&t64[0]); 100 | fin.read(t64, sizeof(t64)); 101 | n_tr = le2uint64(&t64[0]); 102 | cout << n_tr << " loci in total." << endl; 103 | cout << n_ikmer << '/' << n_kmer << " kmers are invariant."
<< endl; 104 | ikmt.iki.resize(n_ikmer); 105 | ikmt.ikmc.resize(n_ikmer); 106 | cout << "iki shape: " << ikmt.iki.rows() << "," << ikmt.iki.cols() << endl; 107 | cout << "ikmc shape: " << ikmt.ikmc.rows() << "," << ikmt.ikmc.cols() << endl; 108 | ikmt.nk.resize(n_tr); 109 | ikmt.nik.resize(n_tr); 110 | for (int tri = 0; tri < n_tr; ++tri) { 111 | fin.read(t32, sizeof(t32)); 112 | ikmt.nk[tri] = le2uint32(&t32[0]); 113 | } 114 | for (int tri = 0; tri < n_tr; ++tri) { 115 | fin.read(t32, sizeof(t32)); 116 | ikmt.nik[tri] = le2uint32(&t32[0]); 117 | } 118 | for (int ikii = 0; ikii < n_ikmer; ++ikii) { 119 | fin.read(t32, sizeof(t32)); 120 | ki = le2uint32(&t32[0]); 121 | ikmt.iki(ikii) = ki; 122 | fin.read((char*)&kc, sizeof(kc)); 123 | ikmt.ikmc(ikii) = kc; 124 | } 125 | fin.close(); 126 | } 127 | 128 | template 129 | void fill_gt(T& gt, vector& fns) { 130 | std::time_t t0 = std::time(nullptr); 131 | cout << "reading gt"; 132 | for (int i0=0; i0 146 | //void load_gt(T& gt, string& fn) { 147 | // std::time_t t0 = std::time(nullptr); 148 | // cout << "loading gt" << endl; 149 | // ifstream fin(fn); 150 | // assert(fin); 151 | // string line; 152 | // int nrow = 14752039, ncol = 879; 153 | // for (int i0 = 0; i0 < nrow; ++i0) { 154 | // if (i0 % 100000 == 0) { cout << '.' << flush; } 155 | // for (int i1 = 0; i1 < ncol-1; ++i1) { 156 | // getline(fin, line, '\t'); 157 | // gt(i1,i0) = stof(line); 158 | // } 159 | // getline(fin, line); 160 | // gt(ncol-1,i0) = stof(line); 161 | // } 162 | // fin.close(); 163 | // cout << "finished in " << (std::time(nullptr) - t0) << " sec" << endl; 164 | //} 165 | 166 | template 167 | void load_eachBinGT(T& gt, gt_meta& gtm) { 168 | Eigen::Array tmp(gtm.nk, gtm.ns); 169 | auto fns = gtm.fns; 170 | size_t nk; 171 | size_t nk_ = gtm.nk; 172 | size_t sizeof64 = 8; 173 | std::time_t t0 = std::time(nullptr); 174 | cout << "reading " << fns.size() << " gt files" << endl; 175 | for (int i0=0; i0(); 186 | } 187 | 188 | //template 189 | //void load_binGTMat(T& gt, string& fn) { 190 | // std::time_t t0 = std::time(nullptr); 191 | // cout << "loading gt" << endl; 192 | // ifstream fin(fn, ios::in | ios::binary); 193 | // assert(fin); 194 | // size_t sizeof32 = 4; 195 | // uint32_t nrow, ncol; 196 | // fin.read((char*)(&nrow), sizeof32); 197 | // fin.read((char*)(&ncol), sizeof32); 198 | // cout << "size = (" << nrow << ',' << ncol << ')' << endl; 199 | // gt.resize(nrow, ncol); 200 | // fin.read((char*)(gt.data()), (size_t)nrow*(size_t)ncol*sizeof32); 201 | // cout << "finished in " << (std::time(nullptr) - t0) << " sec" << endl; 202 | //} 203 | 204 | template 205 | void norm_rd(T& gt, vector& rd_) { 206 | cout << "normalizaing read depth" << endl; 207 | Eigen::Array rd(rd_.size()); 208 | for (int i=0; i 213 | void bias_correction(T& gt, ikmer_meta& ikmt, P& Bias) { 214 | cout << "computing/correcting bias" << endl; 215 | std::time_t t0 = std::time(nullptr); 216 | int ns = gt.rows(); 217 | for (int tri=0; tri 236 | void save_matrix(string& fn, T& mat) { 237 | cout << "saving matrix to " << fn << endl; 238 | std::time_t t0 = std::time(nullptr); 239 | size_t sizeof32 = 4; 240 | //size_t sizeofscalar = sizeof(typename T::Scalar); 241 | size_t nrow = mat.rows(), ncol = mat.cols(); 242 | ofstream fout(fn, ios::out | ios::binary); 243 | fout.write(reinterpret_cast( &nrow ), sizeof32); 244 | fout.write(reinterpret_cast( &ncol ), sizeof32); 245 | fout.write(reinterpret_cast( mat.data() ), nrow*ncol*sizeof32); 246 | cout << "matrix dim: (" << nrow << ',' << ncol << ") " << "size: 
" << nrow*ncol*sizeof32 << " bytes" << endl; 247 | cout << "finished in " << (std::time(nullptr) - t0) << " sec" << endl; 248 | } 249 | 250 | template 251 | void save_matrix(string& fn, T& mat, Eigen::IOFormat& tsv_format) { 252 | cout << "saving matrix to " << fn << endl; 253 | std::time_t t0 = std::time(nullptr); 254 | ofstream fout(fn); 255 | fout << mat.format(tsv_format); 256 | cout << "finished in " << (std::time(nullptr) - t0) << " sec" << endl; 257 | } 258 | 259 | #endif 260 | 261 | -------------------------------------------------------------------------------- /src/sim_reads.cpp: -------------------------------------------------------------------------------- 1 | #include "stdlib.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | //#include 12 | 13 | using std::string; 14 | using std::vector; 15 | using std::cout; 16 | using std::cerr; 17 | using std::endl; 18 | using std::ifstream; 19 | using std::ofstream; 20 | using std::toupper; 21 | using std::to_string; 22 | using std::stringstream; 23 | using std::sort; 24 | using std::unordered_map; 25 | 26 | size_t FLEN = 500; 27 | size_t RLEN = 150; 28 | size_t NBEG = FLEN - RLEN; 29 | size_t SHFT = 20; 30 | size_t MIN_CTG_LEN = 50000; 31 | 32 | const char baseComplement[] = { 33 | 3, 2, 1, 0,127,127,127,127, 34 | 127,127,127,127,127,127,127,127, 35 | 127,127,127,127,127,127,127,127, 36 | 127,127,127,127,127,127,127,127, 37 | 127,127,127,127,127,127,127,127, 38 | 127,127,127,127,127,127,127,127, 39 | 127,127,127,127,127,127,127,127, 40 | 127,127,127,127,127,127,127,127, 41 | 127,'T',127,'G',127,127,127,'C', 42 | 127,127,127,127,127,127,'N',127, 43 | 127,127,127,127,'A',127,127,127, 44 | 127,127,127,127,127,127,127,127, 45 | 127,'t',127,'g',127,127,127,'c', 46 | 127,127,127,127,127,127,'n',127, 47 | 127,127,127,127,'a',127,127,127, 48 | }; 49 | 50 | const char tocap[] = { 51 | 127,127,127,127,127,127,127,127, 52 | 127,127,127,127,127,127,127,127, 53 | 127,127,127,127,127,127,127,127, 54 | 127,127,127,127,127,127,127,127, 55 | 127,127,127,127,127,127,127,127, 56 | 127,127,127,127,127,127,127,127, 57 | 127,127,127,127,127,127,127,127, 58 | 127,127,127,127,127,127,127,127, 59 | 127,'A',127,'C',127,127,127,'G', 60 | 127,127,127,127,127,127,'N',127, 61 | 127,127,127,127,'T',127,127,127, 62 | 127,127,127,127,127,127,127,127, 63 | 127,'A',127,'C',127,127,127,'G', 64 | 127,127,127,127,127,127,'N',127, 65 | 127,127,127,127,'T',127,127,127, 66 | }; 67 | 68 | inline void print_forward_read(const string& ctg, size_t beg) { 69 | for (size_t i = beg; i < beg+RLEN; ++i) { 70 | cout << tocap[ctg[i]]; 71 | } 72 | cout << '\n'; 73 | } 74 | 75 | inline void print_reverse_read(const string& ctg, size_t beg) { 76 | for (size_t i = beg+FLEN-1; i >= beg+NBEG; --i) { 77 | cout << tocap[baseComplement[ctg[i]]]; 78 | } 79 | cout << '\n'; 80 | } 81 | 82 | inline string get_forward_read(const string& ctg, size_t beg) { 83 | stringstream ss; 84 | for (size_t i = beg; i < beg+RLEN; ++i) { 85 | ss << tocap[ctg[i]]; 86 | } 87 | return ss.str(); 88 | } 89 | 90 | inline string get_reverse_read(const string& ctg, size_t beg) { 91 | stringstream ss; 92 | for (size_t i = beg+FLEN-1; i >= beg+NBEG; --i) { 93 | ss << tocap[baseComplement[ctg[i]]]; 94 | } 95 | return ss.str(); 96 | } 97 | 98 | void sample_read_locations(int nread, std::uniform_int_distribution& dis, std::mt19937& generator, vector& v, unordered_map& um) { 99 | for (int i = 0; i < nread; ++i) { 100 | int j = dis(generator); // Generate 
the random number 101 | um[j]++; 102 | v.push_back(j); 103 | } 104 | sort(v.begin(), v.end()); 105 | } 106 | 107 | int main(int argc, char* argv[]) { 108 | vector args(argv, argv+argc); 109 | if (argc == 1) { 110 | cerr << "Usage: simreads -pe -no-err [-c] [-fs] [-rlen] [-ml] [-uni] [-bed] [-split] [-o] -i ASSEMBLY.FASTA" << endl 111 | << " Options:" << endl 112 | << " -c INT Simulate reads from each seqeunce at THIS coverage. [15]" << endl 113 | << " -fs INT Fragment size. [500]" << endl 114 | << " -rlen INT Read length. [150]" << endl 115 | << " -ml INT Contigs shorter than MIN_CTG_LEN are ignored. [50000]" << endl 116 | << " -uni Sample read position from a uniform distribution" << endl 117 | << " -bed Output in bed format as chr, start, end, read1, read2" << endl 118 | << " -split split output by chromosome/contig. Requires -o" << endl 119 | << " -o STR output prefix" << endl 120 | << " -i STR Input fasta sequence to simulate reads from." << endl 121 | << "Interleaved 150 bp paired-end reads are written to STDOUT" << endl << endl; 122 | return 0; 123 | } 124 | 125 | bool pe, err; 126 | bool uni = false; 127 | bool split = false; 128 | bool bed = false; 129 | size_t cv = 15; 130 | string ifname; 131 | string ofname = ""; 132 | for (size_t argi = 1; argi < argc; ++argi) { 133 | if (args[argi] == "-pe") { pe = true; } 134 | else if (args[argi] == "-no-err") { err = false; } 135 | else if (args[argi] == "-i") { ifname = args[++argi]; } 136 | else if (args[argi] == "-c") { cv = stoul(args[++argi]); } 137 | else if (args[argi] == "-fs") { FLEN = stoul(args[++argi]); } 138 | else if (args[argi] == "-rlen") { RLEN = stoul(args[++argi]); } 139 | else if (args[argi] == "-ml") { MIN_CTG_LEN = stoul(args[++argi]); } 140 | else if (args[argi] == "-uni") { uni = true; } 141 | else if (args[argi] == "-bed") { bed = true; } 142 | else if (args[argi] == "-split") { split = true; } 143 | else if (args[argi] == "-o") { ofname = args[++argi]; } 144 | else { cerr << "Invalid option: " << args[argi] << endl; return 1; } 145 | } 146 | NBEG = FLEN - RLEN; 147 | SHFT = 2*RLEN/cv; 148 | std::random_device randdevice; // Obtain a seed from the operating system 149 | std::mt19937 generator(randdevice()); // Standard Mersenne Twister engine 150 | 151 | // fragment length = 500 = 150 forward strand + 200 gap + 150 reverse strand 152 | if (pe) { 153 | if (not err) { 154 | ifstream fin(ifname); 155 | ofstream fout; 156 | assert(fin); 157 | if (not split) { 158 | if (bed) { fout.open(ofname + ".allctgs.reads.bed"); } 159 | else { fout.open(ofname + ".allctgs.reads.fa"); } 160 | assert(fout); 161 | } 162 | 163 | string header, ctg, ctg_; 164 | while (getline(fin, header) and getline(fin, ctg)) { 165 | string ctgname = header.substr(1,header.size()-1); 166 | 167 | while (fin.peek() != EOF and fin.peek() != '>') { getline(fin, ctg_); ctg += ctg_; } 168 | if (ctg.size() < MIN_CTG_LEN) { 169 | cerr << "Contig " << header << " ignored, size = " << ctg.size() << " < MIN_CTG_LEN" << endl; 170 | continue; 171 | } 172 | if (split) { 173 | string pref = ofname + string{'.'} + ctgname; 174 | if (bed) { fout.open(pref + ".reads.bed"); } 175 | else { fout.open(pref + ".reads.fa"); } 176 | assert(fout); 177 | } 178 | 179 | size_t beg = 0; 180 | if (ofname.size()) { 181 | if (uni) { 182 | size_t nread = (ctg.size() * cv) / (2*RLEN); 183 | cerr << header << " SIZE=" << ctg.size() << " N_READ SIMULATED=" << nread << endl; 184 | 185 | std::uniform_int_distribution dis(0, ctg.size()-FLEN); // Define the distribution 186 | vector pos; 187 
| unordered_map pos2c; 188 | sample_read_locations(nread, dis, generator, pos, pos2c); 189 | for (int beg : pos) { 190 | string f = get_forward_read(ctg, beg); 191 | string r = get_reverse_read(ctg, beg); 192 | for (int j = 0; j < pos2c[beg]; j++) { 193 | if (bed) { 194 | fout << ctgname << '\t' << beg << '\t' << beg+FLEN << '\t' << f << '\t' << r << '\n'; 195 | } 196 | else { 197 | fout << header << ':' << beg << '-' << beg+FLEN << "/1" << '\n' 198 | << f << '\n' 199 | << header << ':' << beg << '-' << beg+FLEN << "/2" << '\n' 200 | << r << '\n'; 201 | } 202 | } 203 | } 204 | } 205 | else { 206 | while (beg + FLEN <= ctg.size()) { 207 | string f = get_forward_read(ctg, beg); 208 | string r = get_reverse_read(ctg, beg); 209 | if (bed) { 210 | fout << ctgname << '\t' << beg << '\t' << beg+FLEN << '\t' << f << '\t' << r << '\n'; 211 | } 212 | else { 213 | fout << header << ':' << beg << '-' << beg+FLEN << "/1" << '\n' 214 | << f << '\n' 215 | << header << ':' << beg << '-' << beg+FLEN << "/2" << '\n' 216 | << r << '\n'; 217 | } 218 | beg += SHFT; 219 | } 220 | } 221 | if (split) { fout.close(); } 222 | 223 | } 224 | else { 225 | while (beg + FLEN <= ctg.size()) { 226 | cout << header << ':' << beg << '-' << beg+FLEN << "/1" << '\n'; 227 | print_forward_read(ctg, beg); 228 | cout << header << ':' << beg << '-' << beg+FLEN << "/2" << '\n'; 229 | print_reverse_read(ctg, beg); 230 | beg += SHFT; 231 | } 232 | } 233 | } 234 | 235 | } 236 | } 237 | cerr << "All done!" << endl; 238 | 239 | return 0; 240 | } 241 | -------------------------------------------------------------------------------- /test/QC/fn1a.sim.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --ntasks=2 3 | #SBATCH --time=48:00:00 4 | #SBATCH --mem=4000 5 | #SBATCH --partition=qcb 6 | #SBATCH --account=mchaisso_100 7 | #SBATCH -N 1 8 | #SBATCH --job-name=rsim 9 | #SBATCH --output=slurm.%A_%a.%x.log 10 | ###SBATCH --constraint=xeon-2665,avx 11 | ###SBATCH --exclude=b10-10 12 | ###SBATCH --mail-type=ALL 13 | ###SBATCH --mail-user=tsungyul@usc.edu 14 | ###SBATCH --array=0,1 15 | 16 | set -eu 17 | module load gcc #usc samtools 18 | 19 | 20 | date 21 | #idir=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hgsvc/input 22 | od=/project/mchaisso_100/cmb-17/vntr_genotyping/analysis/readsim/hprc/errfree/ 23 | gs=($(cat genomes.txt) ) # n=7 = 48-40-hs1 24 | g=${gs[$((SLURM_ARRAY_TASK_ID / 2))]} 25 | h=$((SLURM_ARRAY_TASK_ID % 2)) 26 | fa=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/input4/$g.$h.fa 27 | reads=$od/output/$g/$h/reads 28 | ML=500 29 | # 30x for hs1, 15x for diploid genome 30 | mkdir -p $od/output/$g/$h 31 | 32 | sim_reads.v20241125 -pe -no-err -c 15 -ml $ML -bed -split -o $reads -i $fa 33 | 34 | date 35 | -------------------------------------------------------------------------------- /test/QC/fn1b.annot.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --ntasks=3 3 | #SBATCH --time=48:00:00 4 | #SBATCH --mem=4000 5 | #SBATCH --partition=qcb 6 | #SBATCH --account=mchaisso_100 7 | #SBATCH -N 1 8 | #SBATCH --job-name=annot_p 9 | #SBATCH --output=slurm.%A_%a.%x.log 10 | ###SBATCH --constraint=xeon-2665,avx 11 | ###SBATCH --exclude=b10-10 12 | ###SBATCH --mail-type=ALL 13 | ###SBATCH --mail-user=tsungyul@usc.edu 14 | ###SBATCH --array=0,1 15 | 16 | source ~/.bashrc 17 | set -eu 18 | module load gcc #usc samtools 19 | conda activate snakepgg 20 | 21 | 22 | 
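# Annotate the error-free simulated read pairs with the pan TR locus they came from: for each
# contig >= $ML bp in the .fai, bedtools map intersects the fragment bed written by fn1a.sim.sh
# with this haplotype's coordinate block of pan.tr.mbe.v2.bed (columns starting at 4+4*hi, with
# NR-1 as the 0-based pan-locus index), appending the overlapping locus index or "." if none;
# the final awk rewrites each record as an interleaved pair whose FASTA headers are
# ">ctg:start-end:locus/1" and ">ctg:start-end:locus/2".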
date 23 | indir=/project/mchaisso_100/cmb-17/vntr_genotyping/analysis/readsim/hprc/errfree/output 24 | od=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/analysis8/mismap/v2/hprc 25 | #gs=( $(cat /project/mchaisso_100/cmb-16/tsungyul/work/vntr/hapdb/config/genomes.1kg_plus_related.gt_HPRC.txt) ) # n=40 26 | gs=( $(cat genomes.txt) ) # n=7 27 | g=${gs[$((SLURM_ARRAY_TASK_ID / 2))]} 28 | h=$((SLURM_ARRAY_TASK_ID % 2)) 29 | gi=$( awk -v g=$g '{if ($1==g) {print NR-1; exit}}' /project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/input/genomes.HPRC.txt ) 30 | hi=$(( 2*gi + h )) 31 | panbed=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/output8/pan.tr.mbe.v2.bed 32 | fai=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/input4/$g.$h.fa.fai 33 | annotReads=$od/annot_reads/$g.$h.annot.fa 34 | ML=500 35 | 36 | 37 | echo $g $h $gi $hi 38 | # no slop for err free reads 39 | for ctg in $(awk -v ML=$ML '{if ($2 >= ML) {print $1}}' $fai); do 40 | ls $indir/$g/$h/reads.$ctg.reads.bed >&2 41 | bedtools map -c 4 -o distinct_sort_num \ 42 | -a $indir/$g/$h/reads.$ctg.reads.bed \ 43 | -b <(awk -v i=$hi -v ctg=$ctg 'BEGIN {OFS="\t"; i=4+4*i} 44 | { if ($i == ctg) {print $i, $(i+1), $(i+2), NR-1} }' $panbed | 45 | sort -k1,1 -k2,2n -k3,3n) 46 | done | 47 | awk '{hd=">"$1":"$2"-"$3":"$6; print hd"/1"; print $4; print hd"/2"; print $5}' > $annotReads 48 | date 49 | -------------------------------------------------------------------------------- /test/QC/fn1c.extract.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --ntasks=12 3 | #SBATCH --time=48:00:00 4 | #SBATCH --mem=20000 5 | #SBATCH --partition=qcb 6 | #SBATCH --account=mchaisso_100 7 | #SBATCH -N 1 8 | #SBATCH --job-name=sim.extract 9 | #SBATCH --output=slurm.%A_%a.%x.log 10 | ###SBATCH --exclude=b18-11 11 | ###SBATCH --constraint=xeon-2665,avx 12 | ###SBATCH --exclude=b10-10 13 | ###SBATCH --mail-type=ALL 14 | ###SBATCH --mail-user=tsungyul@usc.edu 15 | ###SBATCH --array=0,1 16 | 17 | source ~/.bashrc 18 | set -eu 19 | module load gcc #usc samtools 20 | #conda activate art 21 | 22 | 23 | date 24 | #gs=( $(cat /project/mchaisso_100/cmb-16/tsungyul/work/vntr/hapdb/config/genomes.1kg_plus_related.gt_HPRC.txt) ) # n=40 25 | gs=( $(cat genomes.txt) ) #n=7 26 | g=${gs[$SLURM_ARRAY_TASK_ID]} 27 | read0=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/analysis8/mismap/v2/hprc/annot_reads/$g.0.annot.fa 28 | read1=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/analysis8/mismap/v2/hprc/annot_reads/$g.1.annot.fa 29 | rpgg=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/output8/pan 30 | cth=5 31 | od=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/analysis8/mismap/v2/hprc/extract_reads 32 | out=$od/$g.fa.gz 33 | 34 | echo $g 35 | sleep $((SLURM_ARRAY_TASK_ID * 7)) 36 | cat $read0 $read1 | 37 | /project/mchaisso_100/cmb-16/tsungyul/work/vntr/danbing-tk/bin/danbing-tk.genotyper.20241118a.O3 -cth 5 -e 1 -qs $rpgg -fa /dev/stdin -p 11 | gzip >$out 38 | date 39 | -------------------------------------------------------------------------------- /test/QC/fn2a1.raw.map.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --ntasks=12 3 | #SBATCH --time=48:00:00 4 | #SBATCH --mem=40000 5 | #SBATCH --partition=qcb 6 | #SBATCH --account=mchaisso_100 7 | #SBATCH 
-N 1 8 | #SBATCH --job-name=sim.aln 9 | #SBATCH --output=slurm.%A_%a.%x.log 10 | ###SBATCH --exclude=b18-11 11 | ###SBATCH --constraint=xeon-2665,avx 12 | ###SBATCH --exclude=b10-10 13 | ###SBATCH --mail-type=ALL 14 | ###SBATCH --mail-user=tsungyul@usc.edu 15 | ###SBATCH --array=0,1 16 | 17 | source ~/.bashrc 18 | set -eu 19 | module load gcc #usc samtools 20 | #conda activate art 21 | 22 | 23 | date 24 | #gs=( $(cat /project/mchaisso_100/cmb-16/tsungyul/work/vntr/hapdb/config/genomes.1kg_plus_related.gt_HPRC.txt) ) # n=40 25 | #gs=( $(cat /project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/input/genomes.HPRC.txt) ) #n=48 26 | gs=( $(cat genomes.txt) ) #n=47 27 | g=${gs[$SLURM_ARRAY_TASK_ID]} 28 | od=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/analysis8/mismap/v4.2/hprc/ 29 | indir=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/analysis8/mismap/v2/hprc/ 30 | reads=$indir/extract_reads/$g.fa.gz 31 | rpgg=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/output8/pan 32 | cth=10 33 | kam=$od/aln/$g.kam.gz 34 | kout=$od/aln/$g 35 | echo $cth 36 | 37 | ls $reads 38 | sleep $((SLURM_ARRAY_TASK_ID * 5)) 39 | zcat $reads | 40 | danbing-tk.genotyper.20241204b.O3 -cth $cth -xg -c asgn 40 -s 2 -qs $rpgg -fa /dev/stdin -o $kout -p 11 | gzip >$kam 41 | date 42 | -------------------------------------------------------------------------------- /test/QC/fn2a2.build.FPSkmer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --ntasks=2 3 | #SBATCH --time=48:00:00 4 | #SBATCH --mem=80000 5 | #SBATCH --partition=qcb 6 | #SBATCH --account=mchaisso_100 7 | #SBATCH -N 1 8 | #SBATCH --job-name=pf 9 | #SBATCH --output=slurm.%A_%a.%x.log 10 | ###SBATCH --exclude=b18-11 11 | ###SBATCH --constraint=xeon-2665,avx 12 | ###SBATCH --exclude=b10-10 13 | ###SBATCH --mail-type=ALL 14 | ###SBATCH --mail-user=tsungyul@usc.edu 15 | ###SBATCH --array=0,1 16 | 17 | source ~/.bashrc 18 | set -eu 19 | module load gcc #usc samtools 20 | #conda activate art 21 | 22 | 23 | date 24 | indir=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/analysis8/mismap/v4.2/map 25 | od=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/analysis8/mismap/v4.2/map 26 | out=$od/FPSkmer.v0.tsv 27 | TP=$indir/hs1.FP_pf.txt 28 | 29 | baitBuilder.v20250105a v2 30488 21 $out $TP $indir/hs1.TP_pf.txt $(ls $indir/../hprc/profile/*txt) 30 | date 31 | -------------------------------------------------------------------------------- /test/QC/fn2a2.build.profile.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --ntasks=2 3 | #SBATCH --time=48:00:00 4 | #SBATCH --mem=20000 5 | #SBATCH --partition=qcb 6 | #SBATCH --account=mchaisso_100 7 | #SBATCH -N 1 8 | #SBATCH --job-name=pf 9 | #SBATCH --output=slurm.%A_%a.%x.log 10 | ###SBATCH --exclude=b18-11 11 | ###SBATCH --constraint=xeon-2665,avx 12 | ###SBATCH --exclude=b10-10 13 | ###SBATCH --mail-type=ALL 14 | ###SBATCH --mail-user=tsungyul@usc.edu 15 | ###SBATCH --array=0,1 16 | 17 | source ~/.bashrc 18 | set -eu 19 | module load gcc #usc samtools 20 | #conda activate art 21 | 22 | 23 | date 24 | indir=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/analysis8/mismap/v4.2/hprc/aln 25 | od=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/analysis8/mismap/v4.2/hprc/profile 26 | 
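# Build a per-genome mapping profile from the $pref.kam.gz alignments written by fn2a1.raw.map.sh.
# baitBuilder's v1.pf mode reads the decompressed .kam stream; the positional arguments 30488 and
# 21 appear to be the number of TR loci in this RPGG and the k-mer size (they are passed the same
# way to the v2 mode in fn2a2.build.FPSkmer.sh, which later merges these per-genome profiles with
# the hs1 TP/FP lists into FPSkmer.v0.tsv).
# Hypothetical example invocation:
#   baitBuilder.v20241217b v1.pf <(zcat aln/HG002.kam.gz) 30488 21 profile/HG002 -tp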
#kDB_pref=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/output8/pan 27 | gs=( $(cat genomes.txt) ) # n=47 28 | pref=${gs[$SLURM_ARRAY_TASK_ID]} 29 | cth=10 30 | kam=$indir/$pref.kam.gz 31 | out=$od/$pref 32 | 33 | baitBuilder.v20241217b v1.pf <(zcat $kam) 30488 21 $out -tp 34 | date 35 | -------------------------------------------------------------------------------- /test/QC/fn2a3.bait.map.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --ntasks=12 3 | #SBATCH --time=24:00:00 4 | #SBATCH --mem=30000 5 | #SBATCH --partition=qcb 6 | #SBATCH --account=mchaisso_100 7 | #SBATCH -N 1 8 | #SBATCH --job-name=bait.aln 9 | #SBATCH --output=slurm.%A_%a.%x.log 10 | ###SBATCH --exclude=b18-11 11 | ###SBATCH --constraint=xeon-2665,avx 12 | ###SBATCH --exclude=b10-10 13 | ###SBATCH --mail-type=ALL 14 | ###SBATCH --mail-user=tsungyul@usc.edu 15 | ###SBATCH --array=0,1 16 | 17 | source ~/.bashrc 18 | set -eu 19 | module load gcc #usc samtools 20 | #conda activate art 21 | 22 | 23 | date 24 | #idir=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hgsvc/input 25 | gs=( $(cat genomes.txt) ) #n=47 26 | g=${gs[$SLURM_ARRAY_TASK_ID]} 27 | indir=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/analysis8/mismap/v2/hprc/ 28 | reads=$indir/extract_reads/$g.fa.gz 29 | rpgg=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/output8/pan 30 | od=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/analysis8/mismap/v4.2/hprc/wbait 31 | bait=/project/mchaisso_100/cmb-17/vntr_genotyping/rpgg2_k21_84k/hprc/full.v1/analysis8/mismap/v4.2/map/FPSkmer.v0.tsv 32 | kam=$od/$g.kam.gz 33 | kout=$od/$g 34 | echo $g 35 | 36 | ls $reads 37 | sleep $((SLURM_ARRAY_TASK_ID * 3)) 38 | zcat $reads | 39 | danbing-tk.genotyper.20250104a.O3 -b $bait -cth 10 -xg -c asgn 40 -s 2 -qs $rpgg -fa /dev/stdin -o $kout -p 11 | gzip >$kam 40 | date 41 | -------------------------------------------------------------------------------- /test/QC/input/HG002.0.fa: -------------------------------------------------------------------------------- 1 | >chr12_MATERNAL:34032000-34032580 2 | AGGTCTCATGGGCATTTTCTATGAGCTGGTGGACTGAGAGCACTGTAGGGCTCTACAACGATGTCTGACACCTTGGGCAAGGGTAGGACGCTGAATTTGTTTATGATCCTGTCTGGATACTCCTCCCGGATCTTACTAATGAGAAGGGTACCCATCCCAGACCCAGTCCCCCAACCCAGGAAGTGGGTCAGCTGGAAACCCTACAGGCAGTCACAGCTCTCAGACTCCTTTCTGACAATGTCCATCACTGACTCCATCAGTTCCGCGCCTTCTGTGTAGTGCCCCCTAGCCCAGTTGTTTCTGGCCCCACACTGACCTGTAAGACAGTACAGCCAGTCACTCAATGGCCAGGTATATGGTCATCAGTGGTCACCACAATGCAAAATGCTCCAAGTGTCACGTGTGAGGTGAGAGCACCATTCACCCTGCAGGTGGAGCAAATGAAACCCCCTACCCCAGAGTTACAGGACAGCAGCCTCCCCTGTTAGAAATTAGATCAGGAGCCAAACCTGAGACAGGCTAACAGACCTCACTGCAGGTGGCTCCTGCCCATTTCCAGAGAAGGCAGTAGCCACGGCCC 3 | >chr12_MATERNAL:38376260-38376860 4 | GGAGCTGAGCTGGGGCCGTGGCTACTGCCTTCCCTGAAAATGGGCAGGAGCCACCTGCAGGGAGGTCTGTTAGCCTGTCTCAGGTTTGGCTCGTGATTTAATTTCTAACAGGGGAGGCTGCTGTCCTGTAACTCTGGGGGAGGGGGTTTCATTTGCTCCACCTGCAGGGTAATTGGTGTTCTCACCTCACACCTGACACTTGGTGCATTTTGCATTGTGGTGGTGACCACTGATGACCATATACCTGGCCATTGAGTGACTGGCTGTACTGTCTTACAGGTCAGTGTGGGGCCGGAAACAACTGGGCCAAGGGGCACTATACAAAATGTGCAGAGCTGATGGAGTCAGTGATAGACATTGTCAGAAAGGAGTCTGAGAGCTGTGACTGCCTGTAGGGTTTCCAGCTGACCCACTCCCTGGGTGCGGGGACTGAGTCTGGGATGGGTACCCTTCTCATTAGTAAGATCCGGGAGGAGTATCCAGACAGGATCATAAACCATTCAGCATCCTGCCCACACCCAAGGTGTCAGACACTGTTGTGGAACCCTGCAATGCCACCCTCTCAGTACACCAGCTCATAGAAAATGCAGATGAGACCTT 5 | >chr1_MATERNAL:723308:725277 6 | 
TGGTCCTCAGGGTTGAGCAGCAGCCTCACCTCCCTCTGCTGTCCACGCTGGCCGGCATCAGGGTCATGGTTCACGGCCGTAACCACACGCCCTTCCTGGGGCACCACAGCTTCAGCGTCCGGCCAGGGACGGAGGCCACCATCAGCATCCGAGAGGTGAGCTGGCCTCTGCAGCCAACCTCCGGCCCAGGCCTCCTGCCCAACCTGGGCTTTGGGGGGTGAGGGCAGGGCCCATGGAACTGAAGCGTCCCCTCCCAGGACGAGGTGCACCGGCTCGGGAGCCCCTACGGCCACTGCACCGCCGGCGGGGAAGGCGTGGAGGTGGAGCTGCTACACAACACCTCCTACACCAGGCAGGTGAGGCTGGGCTGGCAGGGGGTGCGGGGGCAGGTGAGGCTGGGCTGGCCAGGGGGTGTGGGCGGGTGGAACGGGGGAGGGGTCTGGGAGAGTACTAGAGGGCCTGGGAACGGGGCAGTCCCCGTGGAGGCCCGCACTCCATCCCCCGTGTCCCCGCTCCATTCCCTGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCATGTCTCCGCTCCATCCCATGTCCCTGCTCATCCCCCCTGTCCCCAGGCCTGCCTGGTGTCCTGCTTCCAGCAGCTGATGGTGGAGACCTGCTCCTGTGGCTACTACCTCCACCCTCTGCCGGCGGGGGCTGAGTACTGCAGCTCTGCCCGGCACCCTGCCTGGGGTGAGTCCTGCTCGCTGCCTCCCACTCTGTCAGCCATTAGCCGGGGGGTCACAGCGAGCCTCACACATGCCTCTGACCCCTCCCCAAGGACACTGCTTCTACCGCCTCTACCAGGACCTGGAGACCCACCGGCTCCCCTGTACCTCCCGCTGCCCCAGGCCCTGCAGGTGAGACGGGGGTGTTGGGGTCGCGGCCAGGGATCATTGCCCCAGGTAGCGTGGCAGGTGACACCCGGCTGTCTCTTCCAGGGAGTCTGCATTCAAGCTCTCCACTGGGACCTCCAGGTGGCCTTCCGCCAAGTCAGCTGTGAGTCCCCAAAGTGGTGGGGTGGGGGTGTGGACAGCCAGGCAGACCCCACAGGTCCC 7 | -------------------------------------------------------------------------------- /test/QC/input/HG002.0.fa.fai: -------------------------------------------------------------------------------- 1 | chr12_MATERNAL:34032000-34032580 580 34 580 581 2 | chr12_MATERNAL:38376260-38376860 600 649 600 601 3 | chr1_MATERNAL:723308:725277 1969 1279 1969 1970 4 | -------------------------------------------------------------------------------- /test/QC/input/HG002.1.fa: -------------------------------------------------------------------------------- 1 | >chr12_PATERNAL:34066640-34067220 2 | TGTTGTATAGAGCTTCGTTATCTATGCAAAAGGTCTCATGGGCATTTTCTATGAGCTGGTGGACTGAGAGCACTGTAGGGCTCTACAACGATGTCTGACACCTTGGGCAAGGGTAGGACGCTGAATTTGTTTATGATCCTGTCTGGATACTCCTCCCGGATCTTACTAATGAGAAGGGTACCCATCCCAGACCCAGTCCCCCAACCCAGGAAGTGGGTCAGCTGGAAACCCTACAGGCAGTCACAGCTCTCAGACTCCTTTCTGACAATGTCCATCACTGACTCCATCAGTTCCGCGCCTTCTGTGTAGTGCCCCCTAGCCCAGTTGTTTCTGGCCCCACACTGACCTGTAAGACAGTACAGCCAGTCACTCAATGGCCAGGTATATGGTCATCAGTGGTCACCACAATGCAAAATGCTCCAAGTGTCACGTGTGAGGTGAGAGCACCATTCACCCTGCAGGTGGAGCAAATGAAACCCCCTACCCCAGAGTTACAGGACAGCAGCCTCCCCTGTTAGAAATTAGATCAGGAGCCAAACCTGAGACAGGCTAACAGACCTCACTGCAGGTGGCTCCTGCC 3 | >chr12_PATERNAL:38378100-38378940 4 | 
GGGGCAGAACTTCAGGCCAGACAACTTCATCTTTGGTGAGCCTTGGGTGAGAACTGGGGTGCGGCTCCTTAGCCAGGGTAGCTTAAAATCTGGGAATGCCCCAAGGTCATCGCTGTGGGAACTGTGGAGCCAGGGCCCCTGAACACCCTCCTATCCTCCTAGTCGCTTGATCTGCCTCTCCTAAACGGGCTTTGGGAGGAAGGCCCAGGTGTCTCAATGTGAGGAGCTACTGATGTAAACTCCCTGCAGGTCGCTGAGCTGGGGCCGTGGCTACTGCCTTCCCTGAAAATGGGCAGGAGCCACCTGCAGGGAGGTCTGTTAGCCTGTCTCAGGTTTGGCTCGTGATTTAATTTCTAACAGGGGAGGCTGCTGTCCTGTAACTCTGGGGGAGGGGGTTTCATTTGCTCCACCTGCAGGGTAATTGGTGTTCTCACCTCACACCTGACACTTGGTGCATTTTGCATTGTGGTGGTGACCACTGATGACCATATACCTGGCCATTGAGTGACTGGCTGTACTGTCTTACAGGTCAGTGTGGGGCCGGAAACAACTGGGCCAAGGGGCACTATACAAAATGTGCAGAGCTGATGGAGTCAGTGATAGACATTGTCAGAAAGGAGTCTGAGAGCTGTGACTGCCTGTAGGGTTTCCAGCTGACCCACTCCCTGGGTGCAGGGACTGAGTCTGGGATGGGTACCCTTCTCATTAGTAAGATCCGGGAGGAGTATCCAGACAGGATCATAAACCATTCAGCATCCTGCCCACACCCAAGGTGTCAGACACTGTTGTGGAACCCTGCAATGCCACCCTCTCAGTACACCAGCTCATAGAAAATGCAGA 5 | >chr1_PATERNAL:726308-730843 6 | CCCGTGTCCCCGCTCCATTCCCTGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCCCTGTTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCCCTGCCCCGTCCCGTGTCCCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCCCTGC
TCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCATGTCTCCGCTCCATCCCATGTCCCTGCTC 7 | -------------------------------------------------------------------------------- /test/QC/input/HG002.1.fa.fai: -------------------------------------------------------------------------------- 1 | chr12_PATERNAL:34066640-34067220 580 34 580 581 2 | chr12_PATERNAL:38378100-38378940 840 649 840 841 3 | chr1_PATERNAL:726308-730843 4535 1519 4535 4536 4 | -------------------------------------------------------------------------------- /test/QC/input/genomes.txt: -------------------------------------------------------------------------------- 1 | HG002 2 | hs1 3 | -------------------------------------------------------------------------------- /test/QC/input/hs1.0.fa: -------------------------------------------------------------------------------- 1 | >chr1:717384-721779 2 | 
CCCGTGTCCCCGCTCCATTCCCTGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCTGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCG
TCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCATGTCTCCGCTCCATCCCATGTCCCTGCTC 3 | >chr12:34043120-34043710 4 | GTTGTATAGAGCTTCGTTATCTATGCAAAAGGTCTCATGGGCATTTTCTATGAGCTGGTGGACTGAGAGCACTGTAGGGCTCTACAACGATGTCTGACACCTTGGGCAAGGGTAGGACGCTGAATTTGTTTATGATCCTGTCTGGATACTCCTCCCGGATCTTACTAATGAGAAGGGTACCCATCCCAGACCCAGTCCCCCAACCCAGGAAGTGGGTCAGCTGGAAACCCTACAGGCAGTCACAGCTCTCAGACTCCTTTCTGACAATGTCCATCACTGACTCCATCAGTTCCGCGCCTTCTGTGTAGTGCCCCCTAGCCCAGTTGTTTCTGGCCCCACACTGACCTGTAAGACAGTACAGCCAGTCACTCAATGGCCAGGTATATGGTCATCAGTGGTCACCACAATGCAAAATGCTCCAAGTGTCACGTGTGAGGTGAGAGCACCATTCACCCTGCAGGTGGAGCAAATGAAACCCCCTACCCCAGAGTTACAGGACAGCAGCCTCCCCTGTTAGAAATTAGATCAGGAGCCAAACCTGAGACAGGCTAACAGACCTCACTGCAGGTGGCTCCTGCCCATTTCCAGAG 5 | >chr12:38151830-38152700 6 | ACTCTGTGCGCTCGGGGCCCTTGGGGCAGAACTTCAGGCCAGACAACTTCATCTTTGGTGAGCCTTGGGTGAGAACTGGGGTGCGGCTCCTTAGCCAGGGTAGCTTAAAATCTGGGAATGCCCCAAGGTCATCGCTGTGGGAACTGTGGAGCCAGGGCCCCTGAACACCCTCCTATCCTCCTAGTCGCTTGATCTGCCTCTCCTAAACGGGCTTTGGGAGGAAGGCCCAGGTGTCTCAATGTGAGGAGCTACTGATGTAAACTCCCTGCAGGTCGCTGAGCTGGGGCCGTGGCTACTGCCTTCCCTGAAAATGGGCAGGAGCCACCTGCAGGGAGGTCTGTTAGCCTGTCTCAGGTTTGGCTCGTGATTTAATTTCTAACAGGGGAGGCTGCTGTCCTGTAACTCTGGGGGAGGGGGTTTCATTTGCTCCACCTGCAGGGTAATTGGTGTTCTCACCTCACACCTGACACTTGGTGCATTTTGCATTGTGGTGGTGACCACTGATGACCATATACCTGGCCATTGAGTGACTGGCTGTACTGTCTTACAGGTCAGTGTGGGGCCGGAAACAACTGGGCCAAGGGGCACTATACAAAATGTGCAGAGCTGATGGAGTCAGTGATAGACATTGTCAGAAAGGAGTCTGAGAGCTGTGACTGCCTGTAGGGTTTCCAGCTGACCCACTCCCTGGGTGCAGGGACTGAGTCTGGGATGGGTACCCTTCTCATTAGTAAGATCCGGGAGGAGTATCCAGACAGGATCATAAACCATTCAGCATCCTGCCCACACCCAAGGTGTCAGACACTGTTGTGGAACCCTGCAATGCCACCCTCTCAGTACACCAGCTCATAGAAAATGCAGATGAGACCT 7 | -------------------------------------------------------------------------------- /test/QC/input/hs1.0.fa.fai: -------------------------------------------------------------------------------- 1 | chr1:717384-721779 4395 20 4395 4396 2 | chr12:34043120-34043710 590 4441 590 591 3 | chr12:38151830-38152700 870 5057 870 871 4 | -------------------------------------------------------------------------------- /test/QC/input/hs1.1.fa: -------------------------------------------------------------------------------- 1 | >chr1:717384-721779 2 | 
CCCGTGTCCCCGCTCCATTCCCTGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCTGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCG
TCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCCCCGTCCCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCCCGAGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCCCGTGTCTCTGCTCCGTCCCGTGTCTCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCCCTGCTCCGTCCCGTGTCTCTGCCCCGTCCCCCATGTCTCCGCTCCATCCCATGTCCCTGCTC 3 | >chr12:34043120-34043710 4 | GTTGTATAGAGCTTCGTTATCTATGCAAAAGGTCTCATGGGCATTTTCTATGAGCTGGTGGACTGAGAGCACTGTAGGGCTCTACAACGATGTCTGACACCTTGGGCAAGGGTAGGACGCTGAATTTGTTTATGATCCTGTCTGGATACTCCTCCCGGATCTTACTAATGAGAAGGGTACCCATCCCAGACCCAGTCCCCCAACCCAGGAAGTGGGTCAGCTGGAAACCCTACAGGCAGTCACAGCTCTCAGACTCCTTTCTGACAATGTCCATCACTGACTCCATCAGTTCCGCGCCTTCTGTGTAGTGCCCCCTAGCCCAGTTGTTTCTGGCCCCACACTGACCTGTAAGACAGTACAGCCAGTCACTCAATGGCCAGGTATATGGTCATCAGTGGTCACCACAATGCAAAATGCTCCAAGTGTCACGTGTGAGGTGAGAGCACCATTCACCCTGCAGGTGGAGCAAATGAAACCCCCTACCCCAGAGTTACAGGACAGCAGCCTCCCCTGTTAGAAATTAGATCAGGAGCCAAACCTGAGACAGGCTAACAGACCTCACTGCAGGTGGCTCCTGCCCATTTCCAGAG 5 | >chr12:38151830-38152700 6 | ACTCTGTGCGCTCGGGGCCCTTGGGGCAGAACTTCAGGCCAGACAACTTCATCTTTGGTGAGCCTTGGGTGAGAACTGGGGTGCGGCTCCTTAGCCAGGGTAGCTTAAAATCTGGGAATGCCCCAAGGTCATCGCTGTGGGAACTGTGGAGCCAGGGCCCCTGAACACCCTCCTATCCTCCTAGTCGCTTGATCTGCCTCTCCTAAACGGGCTTTGGGAGGAAGGCCCAGGTGTCTCAATGTGAGGAGCTACTGATGTAAACTCCCTGCAGGTCGCTGAGCTGGGGCCGTGGCTACTGCCTTCCCTGAAAATGGGCAGGAGCCACCTGCAGGGAGGTCTGTTAGCCTGTCTCAGGTTTGGCTCGTGATTTAATTTCTAACAGGGGAGGCTGCTGTCCTGTAACTCTGGGGGAGGGGGTTTCATTTGCTCCACCTGCAGGGTAATTGGTGTTCTCACCTCACACCTGACACTTGGTGCATTTTGCATTGTGGTGGTGACCACTGATGACCATATACCTGGCCATTGAGTGACTGGCTGTACTGTCTTACAGGTCAGTGTGGGGCCGGAAACAACTGGGCCAAGGGGCACTATACAAAATGTGCAGAGCTGATGGAGTCAGTGATAGACATTGTCAGAAAGGAGTCTGAGAGCTGTGACTGCCTGTAGGGTTTCCAGCTGACCCACTCCCTGGGTGCAGGGACTGAGTCTGGGATGGGTACCCTTCTCATTAGTAAGATCCGGGAGGAGTATCCAGACAGGATCATAAACCATTCAGCATCCTGCCCACACCCAAGGTGTCAGACACTGTTGTGGAACCCTGCAATGCCACCCTCTCAGTACACCAGCTCATAGAAAATGCAGATGAGACCT 7 | -------------------------------------------------------------------------------- /test/QC/input/hs1.1.fa.fai: -------------------------------------------------------------------------------- 1 | chr1:717384-721779 4395 20 4395 4396 2 | chr12:34043120-34043710 590 4441 590 591 3 | chr12:38151830-38152700 870 5057 870 871 4 | -------------------------------------------------------------------------------- /test/QC/input/pan.graph.umap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChaissonLab/danbing-tk/d29ade3963cedd82d54ed2fe983d3227cbd45e39/test/QC/input/pan.graph.umap -------------------------------------------------------------------------------- /test/QC/input/pan.kmerDBi.umap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChaissonLab/danbing-tk/d29ade3963cedd82d54ed2fe983d3227cbd45e39/test/QC/input/pan.kmerDBi.umap -------------------------------------------------------------------------------- /test/QC/input/pan.kmerDBi.vv: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /test/QC/input/pan.tr.kmers: -------------------------------------------------------------------------------- 1 | >0 2 | 2719446221024 3 3 | 1779373183032 3 4 | 1095033655906 3 5 | 402087667728 3 6 | 100521916932 3 7 | 589078821834 1 8 | 1929153509853 1 9 | 2785596495804 1 10 | 2888876646113 1 11 | 774337874421 1 12 | 2050771876384 1 13 | 2161747235464 1 14 | 2711716224648 1 15 | 2921242417080 1 16 | 730310604270 1 17 | 320522319710 1 18 | 80130579927 1 19 | 170151884919 1 20 | 2241561226781 1 21 | 749623017493 1 22 | 2937711467845 1 23 | 734427866961 1 24 | 2382630222292 1 25 | 2787492500853 1 26 | 2353876981205 1 27 | 619414902613 1 28 | 2477659610453 1 29 | 318402852190 1 30 | 60135167314 1 31 | 962162677026 1 32 | 513962323956 1 33 | 368651153170 1 34 | 2015718478340 3 35 | 2128072899240 3 36 | 680683028958 3 37 | 170170757239 3 38 | 2241565944861 3 39 | 2888876940993 3 40 | 730310622700 3 41 | 309544482142 3 42 | 629168124064 13 43 | 264078270622 13 44 | 684341340220 13 45 | 2332425130664 13 46 | 1476137119924 13 47 | 2113798941992 13 48 | 506301838833 13 49 | 533607500448 13 50 | 1506501968594 13 51 | 1256803658792 13 52 | 172808284340 13 53 | 2025207355332 13 54 | 2737365360880 13 55 | 867890230622 14 56 | 1114545419604 1 57 | 320522320732 14 58 | 730310620620 14 59 | 1500371931432 14 60 | 2888876907713 14 61 | 810844047861 14 62 | 680674509278 14 63 | 1411916632649 14 64 | 1452490785938 14 65 | 1462634324260 14 66 | 2119348746920 14 67 | 2015718486660 14 68 | 1474604612810 14 69 | 1468162780978 14 70 | 2015718347396 14 71 | 367040695244 14 72 | 91760173811 14 73 | 1452455134354 14 74 | 1462625411364 14 75 | 1465167980617 14 76 | 2119491353256 14 77 | 91760173777 14 78 | 22940043444 14 79 | 2218660812732 3 80 | 553794183888 92 81 | 1471223734564 92 82 | 2215176735552 92 83 | 1006906138014 85 84 | 2916358674916 85 85 | 2094701354472 188 86 | 404960763634 85 87 | 1465170208841 14 88 | 2204758266413 14 89 | 1200751818684 85 90 | 101081313373 107 91 | 404325253494 107 92 | 240607812936 189 93 | 1263454999412 107 94 | 2352195139193 107 95 | 1617166796249 187 96 | 1507307274962 484 97 | 612687534564 107 98 | 655753160013 85 99 | 1114549613908 369 100 | 578327709137 95 101 | 404960768596 107 102 | 100521916964 291 103 | 2134561073794 92 104 | 749623017561 33 105 | 716174255953 33 106 | 310808300264 85 107 | 3928723033600 1 108 | 2378066819540 33 109 | 2353877050837 33 110 | 680607539678 1 111 | 240611972424 33 112 | 962447889698 33 113 | 1619843054537 85 114 | 96055141105 33 115 | 384220564420 33 116 | 2599883567400 33 117 | 1599760736320 109 118 | 1474604610634 14 119 | 699275905258 307 120 | 2563511154805 307 121 | 2681630247656 294 122 | 1603441247393 372 123 | 240607778120 369 124 | 1474604612682 339 125 | 457150323998 95 126 | 692104156652 13 127 | 404325253409 200 128 | 1336992996728 307 129 | 457151716938 187 130 | 2719446352096 376 131 | 91760173809 341 132 | 3239862676472 282 133 | 2325598715260 13 134 | 2161747235460 188 135 | 69659350869 112 136 | 755408275395 589 137 | 5760174268 1 138 | 193956414946 589 139 | 1105829209861 107 140 | 1411774030409 1 141 | 1639948436641 188 142 | 2015718478468 372 143 | 2081691861888 112 144 | 2382361786836 349 145 | 1468733205778 189 146 | 170168660087 277 147 | 574752075401 101 148 | 655773494738 187 149 | 2933416508741 112 150 | 2343605204601 187 151 | 2012381225508 1 152 | 2081692386176 572 153 | 128490580733 355 154 | 1092886172258 376 155 | 3436983429300 92 
156 | 1411908112969 3 157 | 80130580215 3 158 | 2921242482480 14 159 | 1243233201058 85 160 | 1243199646624 109 161 | 775825659784 589 162 | 730310620652 369 163 | 1599760736832 101 164 | 60151953234 189 165 | 1886748810616 282 166 | 2888876908257 202 167 | 513819716596 189 168 | 749623017557 538 169 | 2921242482616 202 170 | 643025272349 282 171 | 404960768598 187 172 | 747041549778 95 173 | 1286917382149 1 174 | 506301840833 92 175 | 1263455001460 187 176 | 2325598715248 484 177 | 2119482956456 92 178 | 1608350671424 376 179 | 1105828685572 95 180 | 2356315287336 1 181 | 258455680005 589 182 | 404291698977 95 183 | 2764932549440 497 184 | 629168127136 484 185 | 2768416626608 13 186 | 506301838785 484 187 | 2520752601089 1 188 | 128490580989 1 189 | 2325598715760 92 190 | 1639948436642 1 191 | 1452488656018 3 192 | 699275905130 282 193 | 2514885734996 85 194 | 733354125137 538 195 | 404291699062 187 196 | 1373270041784 109 197 | 520422965472 109 198 | 1695957538013 85 199 | 2382361787348 112 200 | 170168660085 92 201 | 2737366147312 484 202 | 2025207363332 92 203 | 170168662135 202 204 | 2093627612648 1 205 | 684341536828 484 206 | 2520752601093 571 207 | 2134426856066 484 208 | 2299008301604 101 209 | 80130580183 572 210 | 1462633799972 277 211 | 1095033656034 109 212 | 1465170077769 277 213 | 1089867514489 92 214 | 1411908244041 277 215 | 2921242490800 3 216 | 808696564213 369 217 | 318134416734 571 218 | 278637403477 112 219 | 240607778152 92 220 | 2119482964648 277 221 | 2728896093832 188 222 | 1056313082489 13 223 | 533640268448 92 224 | 92162788292 1 225 | 1288363696624 589 226 | 505372388340 92 227 | 680674640350 277 228 | 2119491353120 188 229 | 962431112610 92 230 | 170168627319 14 231 | 1105829209860 200 232 | 126343097085 92 233 | 1476336349364 92 234 | 2012381225252 188 235 | 1486848427154 92 236 | 2737366139120 92 237 | 2015852696196 92 238 | 1256803659560 483 239 | 22948956340 189 240 | 3286717253092 571 241 | 91760206577 92 242 | 1476338446516 484 243 | 2126549626152 92 244 | 1616992235594 92 245 | 3103302639137 282 246 | 1411774026313 202 247 | 949925475809 307 248 | 1828601295993 95 249 | 367040826308 92 250 | 1399699582757 294 251 | 2787492517237 349 252 | 1229617369144 109 253 | 619415164893 189 254 | 1779373215800 376 255 | 3239829122040 307 256 | 1507298886354 92 257 | 2332433322664 92 258 | 1603474801825 92 259 | 3251685712572 3 260 | 1549347197513 92 261 | 1474606709834 92 262 | 60151944538 92 263 | 680674640342 92 264 | 1468163305234 92 265 | 629168127104 92 266 | 314200914698 13 267 | 1256803659552 92 268 | 1467317561417 92 269 | 2888876908225 369 270 | 2768416626620 572 271 | 101072924744 95 272 | 1468162780434 14 273 | 1274330604090 307 274 | 684341534780 92 275 | 1114549613910 92 276 | 314200914890 483 277 | 263876944030 484 278 | 1474604612680 3 279 | 2241565420573 369 280 | 1286917382165 538 281 | 2025207355140 484 282 | 1603441214625 14 283 | 612687447513 85 284 | 78550228722 3 285 | 2135311940077 13 286 | 22940043452 355 287 | 60151944530 369 288 | 513962306548 33 289 | 240540669256 1 290 | 272466878622 92 291 | 1500371939624 339 292 | 1929153509869 572 293 | 962431112482 369 294 | 1468162780946 341 295 | 1500371939616 3 296 | 25268231186 95 297 | 1608350670912 3 298 | 77699977914 210 299 | 1603441249441 14 300 | 749488799829 112 301 | 2241565421085 202 302 | 1200760207292 210 303 | 1372733170840 376 304 | 574752075393 109 305 | 574886293129 85 306 | 2921242482608 369 307 | 320522320734 571 308 | 513962322932 369 309 | 2477660659029 349 310 | 793192113517 
95 311 | 1500371940136 14 312 | 1286883827733 112 313 | 367040695236 341 314 | 1105828685573 187 315 | 2477660724565 33 316 | 2353877046741 349 317 | 2753132778869 112 318 | 2216438093269 112 319 | 619415164757 349 320 | 2520744212485 112 321 | 2081325786477 187 322 | 310799911656 210 323 | 1930427968416 3 324 | 2299008301572 109 325 | 2299545172516 85 326 | 2701929898684 291 327 | 404994318066 210 328 | 2916358672868 210 329 | 128490576637 33 330 | 1619977272265 210 331 | 2933416500549 538 332 | 399940184080 109 333 | 2332424934056 484 334 | 2523475669588 115 335 | 1373270041752 3 336 | 1286917382166 33 337 | 1627918084262 33 338 | 3347023987064 589 339 | 1617166795909 95 340 | 691233137360 497 341 | 2263637175553 497 342 | 1616992107594 13 343 | 2572101089397 282 344 | 2343605182836 95 345 | 77702075066 85 346 | 1500380328232 92 347 | 773192817308 282 348 | 314200914888 92 349 | 1200751819925 294 350 | 655773486546 107 351 | 730310620654 202 352 | 578327796196 187 353 | 1092886172386 101 354 | 22940051644 92 355 | 533606714016 484 356 | 60152993106 33 357 | 2126683843880 484 358 | 3471343167668 484 359 | 3092771269232 282 360 | 2081862577957 210 361 | 101072924765 187 362 | 25268231191 187 363 | 2787492518261 33 364 | 1055507776121 484 365 | 680674648542 202 366 | 1462633791780 3 367 | 1274330604058 282 368 | 1509498736936 189 369 | 733354127185 112 370 | 1229617401912 101 371 | 1617301013977 107 372 | 23040697073 1 373 | 1465170075721 3 374 | 1476886312010 189 375 | 320522320862 3 376 | 367183301444 189 377 | 1452488688786 277 378 | 91795825361 189 379 | 25270328343 107 380 | 1603441247361 3 381 | 367040695108 14 382 | 2204760494637 189 383 | 64613920001 92 384 | 962431251746 189 385 | 2385783642996 95 386 | 1114549616084 189 387 | 2081325707045 85 388 | 2244857888373 189 389 | 2241565412381 14 390 | 183338531284 189 391 | 520423096544 101 392 | 808663009781 3 393 | 1372733170872 101 394 | 1114549876052 33 395 | 619415181141 33 396 | 399940184208 101 397 | 1243199646626 101 398 | 1006906137886 115 399 | 1616992104522 484 400 | 2864697023813 33 401 | 2134430001794 13 402 | 1006907530826 107 403 | 793192113485 115 404 | 2477660659573 189 405 | 2081325786445 107 406 | 561214472093 189 407 | 612687447505 115 408 | 24013785276 33 409 | 3286719350244 112 410 | 2352195117428 115 411 | 2385783640948 200 412 | 747041541586 200 413 | 774336825845 202 414 | 1617301013637 200 415 | 101081313352 200 416 | 25270328338 200 417 | 3092771302000 307 418 | 1536882257682 33 419 | 773192825500 307 420 | 640877788701 307 421 | 402087667856 376 422 | 1930427968418 291 423 | -------------------------------------------------------------------------------- /test/QC/input/pan.tr.mbe.v2.bed: -------------------------------------------------------------------------------- 1 | chr1 717384 721779 chr1_MATERNAL 723808 724777 1 chr1_PATERNAL 726308 730843 1 chr1 717384 721779 1 chr1 717384 721779 1 2 | -------------------------------------------------------------------------------- /test/goodPanGenomeGraph.json: -------------------------------------------------------------------------------- 1 | { 2 | "srcDir" : "/$PREFIX/danbing-tk/", 3 | "inputDir" : "/$PREFIX/danbing-tk/test/input/", 4 | "outputDir" : "/$PREFIX/danbing-tk/test/output/", 5 | "pairs" : "/$PREFIX/danbing-tk/test/input/genome.test.txt", 6 | "AsmAligner" : "minimap2", 7 | "ksize" : 21, 8 | "flankSize" : 500, 9 | "dist_merge" : 500, 10 | "dist_scan" : 500, 11 | "countThreashold" : 45, 12 | "ratioThreashold" : 0.5, 13 | "threadingCountThreshold" : 50, 14 | 
"sizeLowerBound" : 50, 15 | "liftover_min_len" : 200, 16 | "TRwindow" : 100000, 17 | "MBE_th1" : 0.31, 18 | "MBE_th2" : 0.31, 19 | "pruning" : false, 20 | "ref" : "/$PREFIX/danbing-tk/test/input/hg38.chr12.2155791.2356090.fasta", 21 | "refTR" : "/$PREFIX/danbing-tk/test/input/tr.bed", 22 | "refctrl" : "", 23 | "clusterOpts" : "sbatch --time=48:00:00 --partition=qcb --account=mchaisso_100 -N 1" 24 | } 25 | -------------------------------------------------------------------------------- /test/input/HG00514.0.fa.fai: -------------------------------------------------------------------------------- 1 | stitch.054.4.122019.chr12.2170000.2330000/56127/72110 15983 55 15983 15984 2 | -------------------------------------------------------------------------------- /test/input/HG00514.1.fa.fai: -------------------------------------------------------------------------------- 1 | stitch.000.109.2450148.chr12.590000.3130000/1600880/1616840 15960 61 15960 15961 2 | -------------------------------------------------------------------------------- /test/input/HG00514.filtered.reads.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChaissonLab/danbing-tk/d29ade3963cedd82d54ed2fe983d3227cbd45e39/test/input/HG00514.filtered.reads.bam -------------------------------------------------------------------------------- /test/input/HG00514.filtered.reads.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChaissonLab/danbing-tk/d29ade3963cedd82d54ed2fe983d3227cbd45e39/test/input/HG00514.filtered.reads.bam.bai -------------------------------------------------------------------------------- /test/input/HG00733.0.fa.fai: -------------------------------------------------------------------------------- 1 | stitch.000.90.1879921.chr12.930000.2890000/1282738/1298427 15689 60 15689 15690 2 | -------------------------------------------------------------------------------- /test/input/HG00733.1.fa.fai: -------------------------------------------------------------------------------- 1 | stitch.000.311.6394959.chr12.50000.6470000/2179666/2194456 14790 60 14790 14791 2 | -------------------------------------------------------------------------------- /test/input/HG00733.filtered.reads.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChaissonLab/danbing-tk/d29ade3963cedd82d54ed2fe983d3227cbd45e39/test/input/HG00733.filtered.reads.bam -------------------------------------------------------------------------------- /test/input/HG00733.filtered.reads.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChaissonLab/danbing-tk/d29ade3963cedd82d54ed2fe983d3227cbd45e39/test/input/HG00733.filtered.reads.bam.bai -------------------------------------------------------------------------------- /test/input/archive/HG00514.filtered.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChaissonLab/danbing-tk/d29ade3963cedd82d54ed2fe983d3227cbd45e39/test/input/archive/HG00514.filtered.fasta.gz -------------------------------------------------------------------------------- /test/input/archive/HG00733.filtered.fasta.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ChaissonLab/danbing-tk/d29ade3963cedd82d54ed2fe983d3227cbd45e39/test/input/archive/HG00733.filtered.fasta.gz -------------------------------------------------------------------------------- /test/input/archive/getfasta.514.bed: -------------------------------------------------------------------------------- 1 | stitch.054.4.122019.chr12.2170000.2330000 56127 72110 2 | stitch.000.109.2450148.chr12.590000.3130000 1600880 1616840 3 | -------------------------------------------------------------------------------- /test/input/archive/getfasta.733.bed: -------------------------------------------------------------------------------- 1 | stitch.000.90.1879921.chr12.930000.2890000 1282738 1298427 2 | stitch.000.311.6394959.chr12.50000.6470000 2179666 2194456 3 | -------------------------------------------------------------------------------- /test/input/ctrl.bed: -------------------------------------------------------------------------------- 1 | chr12 2255091 2255791 2 | chr12 2256090 2256790 3 | -------------------------------------------------------------------------------- /test/input/genome.test.txt: -------------------------------------------------------------------------------- 1 | HG00514 2 | HG00733 3 | -------------------------------------------------------------------------------- /test/input/hg38.chr12.2155791.2356090.fasta.fai: -------------------------------------------------------------------------------- 1 | chr12/2155791/2356090 200299 23 200299 200300 2 | -------------------------------------------------------------------------------- /test/input/tr.bed: -------------------------------------------------------------------------------- 1 | chr12/2155791/2356090 100000 100299 2 | --------------------------------------------------------------------------------
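Note on the plain-text k-mer tables shown above (e.g. test/QC/input/pan.tr.kmers): each locus appears as a block that starts with a ">INDEX" header and is followed by "ENCODED_KMER COUNT" rows. The reader below is a minimal illustrative sketch assuming exactly that layout; the helper name read_tr_kmers and the use of Python are assumptions for illustration, not code shipped in script/ or src/.

    from collections import defaultdict

    def read_tr_kmers(path):
        """Return {locus_index: {encoded_kmer: count}} for a *.tr.kmers file,
        assuming the plain-text layout shown in test/QC/input/pan.tr.kmers
        (hypothetical helper, not part of the repository)."""
        db = defaultdict(dict)
        locus = None
        with open(path) as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                if line.startswith(">"):
                    locus = int(line[1:])        # e.g. ">0" -> locus index 0
                else:
                    kmer, count = line.split()   # integer-encoded k-mer, observed count
                    db[locus][int(kmer)] = int(count)
        return dict(db)

    # Usage sketch: kdb = read_tr_kmers("test/QC/input/pan.tr.kmers"); print(len(kdb[0]))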