├── .gitignore ├── .gitmodules ├── .vscode └── settings.json ├── CondaEnvironment.yml ├── LICENSE.md ├── README.md ├── Snakemakes └── ErrorCorrect │ ├── README.md │ ├── Snakefile │ └── config.yaml ├── edlib ├── include │ └── edlib.h └── src │ └── edlib.cpp ├── makefile ├── scripts ├── 10fold_test │ ├── gen_test.py │ ├── sos.py │ └── vg_pb2.py ├── summary.py ├── test.sh └── vg_pb2.py ├── src ├── Aligner.cpp ├── Aligner.h ├── AlignerMain.cpp ├── AlignmentCorrectnessEstimation.cpp ├── AlignmentCorrectnessEstimation.h ├── AlignmentGraph.cpp ├── AlignmentGraph.h ├── AlignmentSelection.cpp ├── AlignmentSelection.h ├── AlignmentSubsequenceIdentity.cpp ├── ArrayPriorityQueue.h ├── BigraphToDigraph.cpp ├── BigraphToDigraph.h ├── BruteForceExactPrefixSeeds.cpp ├── CommonUtils.cpp ├── CommonUtils.h ├── ComponentPriorityQueue.h ├── DijkstraQueue.h ├── EValue.cpp ├── EValue.h ├── EstimateRepeatCount.cpp ├── ExtractCorrectedReads.cpp ├── ExtractExactPathSubgraph.cpp ├── ExtractPathSequence.cpp ├── ExtractPathSubgraphNeighbourhood.cpp ├── FusionFinder.cpp ├── GfaGraph.cpp ├── GfaGraph.h ├── GraphAligner.h ├── GraphAlignerBitvectorBanded.h ├── GraphAlignerBitvectorCommon.h ├── GraphAlignerBitvectorDijkstra.h ├── GraphAlignerCommon.h ├── GraphAlignerGAFAlignment.h ├── GraphAlignerVGAlignment.h ├── GraphAlignerWrapper.cpp ├── GraphAlignerWrapper.h ├── MafToAlignment.cpp ├── MinimizerSeeder.cpp ├── MinimizerSeeder.h ├── MummerSeeder.cpp ├── MummerSeeder.h ├── NodePosCsv.cpp ├── NodeSlice.h ├── PickAdjacentAlnPairs.cpp ├── PickMummerSeeds.cpp ├── Postprocess.cpp ├── ReadCorrection.cpp ├── ReadCorrection.h ├── ReverseReads.cpp ├── SelectLongestAlignment.cpp ├── SimulateReads.cpp ├── SupportedSubgraph.cpp ├── ThreadReadAssertion.cpp ├── ThreadReadAssertion.h ├── UnitigifyDBG.cpp ├── UntipRelative.cpp ├── VisualizeAlignment.cpp ├── WordSlice.h ├── fastqloader.cpp ├── fastqloader.h ├── stream.hpp └── vg.proto └── test ├── graph.gfa └── read.fa /.gitignore: 
-------------------------------------------------------------------------------- 1 | obj/* 2 | bin/* 3 | src/vg.pb.cc 4 | src/vg.pb.h -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "zstr"] 2 | path = zstr 3 | url = https://github.com/mateidavid/zstr.git 4 | [submodule "concurrentqueue"] 5 | path = concurrentqueue 6 | url = https://github.com/cameron314/concurrentqueue.git 7 | [submodule "parallel-hashmap"] 8 | path = parallel-hashmap 9 | url = https://github.com/greg7mdp/parallel-hashmap.git 10 | [submodule "BBHash"] 11 | path = BBHash 12 | url = https://github.com/maickrau/BBHash.git 13 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.associations": { 3 | "*.geojson": "json", 4 | "vector": "cpp", 5 | "__split_buffer": "cpp", 6 | "__tree": "cpp", 7 | "algorithm": "cpp", 8 | "array": "cpp", 9 | "deque": "cpp", 10 | "list": "cpp", 11 | "locale": "cpp", 12 | "queue": "cpp", 13 | "random": "cpp", 14 | "regex": "cpp", 15 | "stack": "cpp", 16 | "string": "cpp", 17 | "string_view": "cpp", 18 | "type_traits": "cpp", 19 | "typeinfo": "cpp", 20 | "__bit_reference": "cpp", 21 | "__config": "cpp", 22 | "__debug": "cpp", 23 | "__errc": "cpp", 24 | "__functional_base": "cpp", 25 | "__hash_table": "cpp", 26 | "__locale": "cpp", 27 | "__mutex_base": "cpp", 28 | "__node_handle": "cpp", 29 | "__nullptr": "cpp", 30 | "__string": "cpp", 31 | "__threading_support": "cpp", 32 | "__tuple": "cpp", 33 | "any": "cpp", 34 | "atomic": "cpp", 35 | "bit": "cpp", 36 | "bitset": "cpp", 37 | "cctype": "cpp", 38 | "chrono": "cpp", 39 | "cinttypes": "cpp", 40 | "cmath": "cpp", 41 | "complex": "cpp", 42 | "condition_variable": "cpp", 43 | "csignal": "cpp", 44 | "cstdarg": "cpp", 45 | "cstddef": "cpp", 46 | 
"cstdint": "cpp", 47 | "cstdio": "cpp", 48 | "cstdlib": "cpp", 49 | "cstring": "cpp", 50 | "ctime": "cpp", 51 | "cwchar": "cpp", 52 | "cwctype": "cpp", 53 | "exception": "cpp", 54 | "forward_list": "cpp", 55 | "fstream": "cpp", 56 | "functional": "cpp", 57 | "future": "cpp", 58 | "initializer_list": "cpp", 59 | "iomanip": "cpp", 60 | "ios": "cpp", 61 | "iosfwd": "cpp", 62 | "iostream": "cpp", 63 | "istream": "cpp", 64 | "iterator": "cpp", 65 | "limits": "cpp", 66 | "map": "cpp", 67 | "memory": "cpp", 68 | "mutex": "cpp", 69 | "new": "cpp", 70 | "numeric": "cpp", 71 | "optional": "cpp", 72 | "ostream": "cpp", 73 | "ratio": "cpp", 74 | "scoped_allocator": "cpp", 75 | "set": "cpp", 76 | "shared_mutex": "cpp", 77 | "sstream": "cpp", 78 | "stdexcept": "cpp", 79 | "streambuf": "cpp", 80 | "system_error": "cpp", 81 | "thread": "cpp", 82 | "tuple": "cpp", 83 | "unordered_map": "cpp", 84 | "unordered_set": "cpp", 85 | "utility": "cpp", 86 | "filesystem": "cpp" 87 | } 88 | } -------------------------------------------------------------------------------- /CondaEnvironment.yml: -------------------------------------------------------------------------------- 1 | name: GraphChainer 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - asn1crypto=0.24.0=py36_0 9 | - bcrypt=3.1.4=py36h14c3975_0 10 | - binutils=2.31=0 11 | - binutils_impl_linux-64=2.28.1=had2808c_3 12 | - binutils_linux-64=7.2.0=had2808c_27 13 | - blas=1.0=mkl 14 | - boost=1.67.0=py36_4 15 | - bzip2=1.0.6=h14c3975_5 16 | - ca-certificates=2019.9.11=hecc5488_0 17 | - certifi=2019.9.11=py36_0 18 | - cffi=1.11.5=py36h9745a5d_0 19 | - chardet=3.0.4=py36_1 20 | - cloog=0.18.0=0 21 | - cryptography=2.2.2=py36h14c3975_0 22 | - docutils=0.14=py36_0 23 | - dropbox=8.9.0=py36_0 24 | - filechunkio=1.6=py36_0 25 | - ftputil=3.2=py36_0 26 | - gcc_impl_linux-64=7.2.0=habb00fd_3 27 | - gcc_linux-64=7.2.0=h550dcbe_27 28 | - gmp=6.1.2=h6c8ec71_1 29 | - 
gxx_impl_linux-64=7.2.0=hdf63c60_3 30 | - gxx_linux-64=7.2.0=h550dcbe_27 31 | - icu=58.2=h9c2bf20_1 32 | - idna=2.7=py36_0 33 | - intel-openmp=2018.0.3=0 34 | - isl=0.17.1=0 35 | - jemalloc=5.0.1=hf484d3e_1 36 | - libboost=1.67.0=h46d08c1_4 37 | - libdivsufsort=2.0.2=h470a237_2 38 | - libedit=3.1.20170329=h6b74fdf_2 39 | - libffi=3.2.1=hd88cf55_4 40 | - libgcc-ng=8.2.0=hdf63c60_1 41 | - libgfortran-ng=7.2.0=hdf63c60_3 42 | - libprotobuf=3.6.0=hdbcaa40_0 43 | - libsodium=1.0.16=h1bed415_0 44 | - libstdcxx-ng=8.2.0=hdf63c60_1 45 | - mkl=2018.0.3=1 46 | - mkl_fft=1.0.4=py36h4414c95_1 47 | - mkl_random=1.0.1=py36h4414c95_1 48 | - mpc=1.1.0=h10f8cd9_1 49 | - mpfr=4.0.1=hdf1c602_3 50 | - mummer4=4.0.0beta2=pl526hf484d3e_4 51 | - ncurses=6.1=hf484d3e_0 52 | - numpy=1.15.0=py36h1b885b7_0 53 | - numpy-base=1.15.0=py36h3dfced4_0 54 | - openssl=1.0.2r=h14c3975_0 55 | - pandas=0.23.3=py36h04863e7_0 56 | - paramiko=2.4.1=py36_0 57 | - perl=5.26.2=h14c3975_0 58 | - pip=10.0.1=py36_0 59 | - pkg-config=0.29.2=h1bed415_8 60 | - protobuf=3.6.0=py36hf484d3e_0 61 | - psutil=5.4.6=py36h14c3975_0 62 | - py-boost=1.67.0=py36h04863e7_4 63 | - pyasn1=0.4.4=py36_0 64 | - pycparser=2.18=py36_1 65 | - pynacl=1.2.1=py36h14c3975_0 66 | - pyopenssl=18.0.0=py36_0 67 | - pysftp=0.2.9=py36_0 68 | - pysocks=1.6.8=py36_0 69 | - python=3.6.6=hc3d631a_0 70 | - python-dateutil=2.7.3=py36_0 71 | - pytz=2018.5=py36_0 72 | - pyyaml=3.13=py36h14c3975_0 73 | - readline=7.0=ha6073c6_4 74 | - requests=2.19.1=py36_0 75 | - sdsl-lite=2.1.1=hc9558a2_1001 76 | - setuptools=39.2.0=py36_0 77 | - six=1.11.0=py36_1 78 | - snakemake=3.13.3=py36_0 79 | - sparsehash=2.0.3=0 80 | - sqlite=3.24.0=h84994c4_0 81 | - tk=8.6.7=hc745277_3 82 | - urllib3=1.23=py36_0 83 | - wheel=0.31.1=py36_0 84 | - wrapt=1.10.11=py36h14c3975_2 85 | - xz=5.2.4=h14c3975_4 86 | - yaml=0.1.7=had09818_2 87 | - zlib=1.2.11=ha838bed_2 88 | 89 | -------------------------------------------------------------------------------- /LICENSE.md: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-2021 Jun Ma, Manuel Cáceres, Alexandru Tomescu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GraphChainer 2 | 3 | GraphChainer is an accurate aligner of long reads to a variation graph, based on co-linear chaining. 
4 | 5 | ### Compiling 6 | 7 | To compile, run these: 8 | 9 | - Install [miniconda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) 10 | - `git submodule update --init --recursive` 11 | - `conda env create -f CondaEnvironment.yml` 12 | - `conda activate GraphChainer` 13 | - `make bin/GraphChainer` 14 | 15 | ### Running 16 | 17 | Quickstart: `./bin/GraphChainer -t 4 -f reads.fastq -g graph.gfa -a out.gam` 18 | 19 | Key parameters: 20 | - `-t` Number of threads (optional, default 1). 21 | - `-f` Input reads. Format .fasta / .fastq / .fasta.gz / .fastq.gz. You can input multiple files with `-f file1 -f file2 ...` or `-f file1 file2 ...`. 22 | - `-g` Input graph, format .gfa / .vg. **This graph must be acyclic**, see below how to construct an acyclic graph with vg. 23 | - `-a` Output file name. Format .gam or .json. 24 | 25 | Parameters related to colinear chaining: 26 | - `--sampling-step ` Sampling step factor (default 1). Use >1 (<1, >0) for faster (slower), but less (more) accurate alignments. It increases (decreases) the sampling sparsity of fragments. 27 | - `--colinear-split-len ` The length of the fragments into which the long read is split to create anchors (default 35). 28 | - `--colinear-split-gap ` The distance between consecutive fragments (default 35). If `--sampling-step` is set, then always `--colinear-split-gap = ceil(--sampling-step * --colinear-split-len)`. 29 | - `--colinear-gap ` When converting an optimal chain of anchors into an alignment path, split the path if the distance in the graph between consecutive anchors is greater than this value (default 10000). 
30 | 31 | ### Constructing an (acyclic) variation graph 32 | 33 | Use [vg](https://github.com/vgteam/vg) and run: 34 | 35 | `vg construct -t 30 -a -r {ref} -v {vcf} -R 22 -p -m 3000000` 36 | 37 | ### Datasets availability 38 | 39 | The graphs built for the experiments of GraphChainer can be found in Zenodo at [https://doi.org/10.5281/zenodo.7729494 40 | ](https://doi.org/10.5281/zenodo.7729494 41 | ), [https://doi.org/10.5281/zenodo.6875064](https://doi.org/10.5281/zenodo.6875064) and at [https://doi.org/10.5281/zenodo.6587252](https://doi.org/10.5281/zenodo.6587252) 42 | 43 | The real read sets can be found in Zenodo at [TODO](TODO) 44 | 45 | The evaluation pipeline used in the paper can be found at [https://github.com/algbio/GraphChainer-scripts](https://github.com/algbio/GraphChainer-scripts) 46 | 47 | ### Citation 48 | 49 | If you use GraphChainer, please cite as: 50 | 51 | Jun Ma, Manuel Cáceres, Leena Salmela, Veli Mäkinen, Alexandru I. Tomescu. Chaining for accurate alignment of erroneous long reads to acyclic variation graphs. Bioinformatics, 2023, 39(8), btad460 [https://doi.org/10.1093/bioinformatics/btad460](https://doi.org/10.1093/bioinformatics/btad460). 52 | 53 | ### Credits 54 | 55 | GraphChainer is built on the excellent code base of [GraphAligner](https://github.com/maickrau/GraphAligner), which is released under [MIT License](https://github.com/maickrau/GraphAligner/blob/master/LICENSE.md). GraphAligner is described in the paper [GraphAligner: Rapid and Versatile Sequence-to-Graph Alignment](https://doi.org/10.1186/s13059-020-02157-2) by Mikko Rautiainen and Tobias Marschall. 56 | -------------------------------------------------------------------------------- /Snakemakes/ErrorCorrect/README.md: -------------------------------------------------------------------------------- 1 | A Snakemake pipeline for error correcting long reads based on short reads. 
2 | 3 | Installation: 4 | 5 | - Install snakemake, lighter, bcalm2 and GraphAligner: `conda install -c bioconda snakemake lighter bcalm graphaligner` 6 | - Download the bcalm2 GFA conversion script from https://github.com/GATB/bcalm/blob/master/scripts/convertToGFA.py 7 | 8 | Running: 9 | 10 | - Save `Snakefile` and `config.yaml` to a folder 11 | - Edit the parameters in `config.yaml` 12 | - Run `snakemake --cores 8 all` (you can use more than 8 cores) 13 | - The corrected reads will be in the output folder: 14 | - `corrected.fa` has the reads with the aligned sequence replaced by the alignment path. Uppercase sequence is corrected and lowercase is uncorrected. 15 | - `corrected_clipped.fa` has the reads cut across non-corrected parts. All sequence is corrected and the read name contains the position in the original read. 16 | -------------------------------------------------------------------------------- /Snakemakes/ErrorCorrect/Snakefile: -------------------------------------------------------------------------------- 1 | configfile: "config.yaml" 2 | GRAPHALIGNERPATH = config["GraphAlignerPath"] 3 | BCALMPATH = config["BcalmPath"] 4 | BCALMCONVERTPATH = config["BcalmConvertPath"] 5 | LIGHTERPATH = config["LighterPath"] 6 | GENOMESIZE = config["GenomeSize"] 7 | SHORTREADCOVERAGE = config["ShortreadCoverage"] 8 | TMPDIR = config["TempDirectory"] 9 | OUTDIR = config["OutputDirectory"] 10 | SHORTREADDIR = config["ShortReadDirectory"] 11 | SHORTREADS = config["ShortReads"] 12 | LONGREADDIR = config["LongReadDirectory"] 13 | LONGREADS = config["LongReads"] 14 | SMALLK = config["SmallK"] 15 | BIGK = config["BigK"] 16 | ABUNDANCE = config["Abundance"] 17 | GRAPHALIGNERPARAMS = config["GraphAlignerParams"] 18 | 19 | SHORTREADNAMES = [n.split('.')[0] for n in SHORTREADS] 20 | SHORTREADEXTENSIONS = ['.'.join(n.split('.')[1:]) for n in SHORTREADS] 21 | 22 | rule all: 23 | input: 24 | OUTDIR + "corrected.fa", 25 | OUTDIR + "corrected_clipped.fa", 26 | OUTDIR + 
"stats.txt" 27 | 28 | rule correct_short_reads: 29 | input: 30 | expand(SHORTREADDIR + "{name}.{ext}", zip, name=SHORTREADNAMES, ext=SHORTREADEXTENSIONS) 31 | output: 32 | temp(expand(TMPDIR + "{name}.cor.{ext}", zip, name=SHORTREADNAMES, ext=SHORTREADEXTENSIONS)) 33 | params: 34 | files = lambda wildcards, input: ' '.join(["-r " + name for name in input]), 35 | alpha = 7.0 / SHORTREADCOVERAGE 36 | threads: 40 37 | log: 38 | stdout = TMPDIR + "lighter_stdout.txt", 39 | stderr = TMPDIR + "lighter_stderr.txt" 40 | shell: 41 | "/usr/bin/time -v {LIGHTERPATH} -od {TMPDIR} -t {threads} -k {SMALLK} {GENOMESIZE} {params.alpha} {params.files} 1> {log.stdout} 2> {log.stderr}" 42 | 43 | rule read_names: 44 | input: rules.correct_short_reads.output 45 | output: temp("filenames") 46 | shell: "readlink -f {input} > {output}" 47 | 48 | rule run_bcalm: 49 | input: 50 | name = "filenames", 51 | files = rules.correct_short_reads.output 52 | output: temp("filenames.unitigs.fa") 53 | shadow: "full" 54 | log: 55 | stdout = TMPDIR + "bcalm_stdout.txt", 56 | stderr = TMPDIR + "bcalm_stderr.txt" 57 | threads: 40 58 | shell: "/usr/bin/time -v {BCALMPATH} -nb-cores {threads} -in {input.name} -kmer-size {BIGK} -abundance-min {ABUNDANCE} > {log.stdout} 2> {log.stderr}" 59 | 60 | rule convert_bcalm: 61 | input: rules.run_bcalm.output 62 | output: TMPDIR + "graph.gfa" 63 | shell: "{BCALMCONVERTPATH} {input} {output} {BIGK}" 64 | 65 | rule align_reads: 66 | input: 67 | graph = TMPDIR + "graph.gfa", 68 | reads = expand(LONGREADDIR + "{name}", name=LONGREADS) 69 | params: 70 | readconcat = lambda wildcards, input: ' '.join(input.reads) 71 | output: 72 | corrected = OUTDIR + "corrected.fa", 73 | clipped = OUTDIR + "corrected_clipped.fa" 74 | log: 75 | stdout = TMPDIR + "aligner_stdout.txt", 76 | stderr = TMPDIR + "aligner_stderr.txt" 77 | threads: 40 78 | shell: 79 | "/usr/bin/time -v {GRAPHALIGNERPATH} -g {input.graph} --corrected-out {output.corrected} --corrected-clipped-out {output.clipped} -f 
{params.readconcat} -t {threads} {GRAPHALIGNERPARAMS} 1> {log.stdout} 2> {log.stderr}" 80 | 81 | rule get_stats: 82 | input: 83 | aligner_stdout = TMPDIR + "aligner_stdout.txt", 84 | aligner_stderr = TMPDIR + "aligner_stderr.txt", 85 | bcalm_stdout = TMPDIR + "bcalm_stdout.txt", 86 | bcalm_stderr = TMPDIR + "bcalm_stderr.txt", 87 | lighter_stdout = TMPDIR + "lighter_stdout.txt", 88 | lighter_stderr = TMPDIR + "lighter_stderr.txt" 89 | output: 90 | OUTDIR + "stats.txt" 91 | run: 92 | shell("grep 'Input reads' < {input.aligner_stdout} >> {output}") 93 | shell("grep 'Reads with a seed' < {input.aligner_stdout} >> {output}") 94 | shell("grep 'Reads with an alignment' < {input.aligner_stdout} >> {output}") 95 | shell("grep 'Alignments' < {input.aligner_stdout} >> {output}") 96 | shell("grep 'End-to-end alignments' < {input.aligner_stdout} >> {output}") 97 | shell("echo 'Lighter' >> {output}"), 98 | shell("grep 'User time' < {input.lighter_stderr} >> {output}") 99 | shell("grep 'System time' < {input.lighter_stderr} >> {output}") 100 | shell("grep 'Elapsed (wall clock)' < {input.lighter_stderr} >> {output}") 101 | shell("grep 'Maximum resident set size' < {input.lighter_stderr} >> {output}") 102 | shell("echo 'BCalm' >> {output}"), 103 | shell("grep 'User time' < {input.bcalm_stderr} >> {output}") 104 | shell("grep 'System time' < {input.bcalm_stderr} >> {output}") 105 | shell("grep 'Elapsed (wall clock)' < {input.bcalm_stderr} >> {output}") 106 | shell("grep 'Maximum resident set size' < {input.bcalm_stderr} >> {output}") 107 | shell("echo 'Aligner' >> {output}"), 108 | shell("grep 'User time' < {input.aligner_stderr} >> {output}") 109 | shell("grep 'System time' < {input.aligner_stderr} >> {output}") 110 | shell("grep 'Elapsed (wall clock)' < {input.aligner_stderr} >> {output}") 111 | shell("grep 'Maximum resident set size' < {input.aligner_stderr} >> {output}") 112 | -------------------------------------------------------------------------------- 
/Snakemakes/ErrorCorrect/config.yaml: -------------------------------------------------------------------------------- 1 | ### Change these!! 2 | GenomeSize: 4600000 3 | ShortreadCoverage: 200 4 | 5 | ShortReadDirectory: shortreads/ 6 | # NOTE: short read endings MUST be .fq or .fa instead of .fastq or .fasta 7 | # gzip is allowed 8 | ShortReads: 9 | - reads1.fq 10 | - reads2.fq.gz 11 | 12 | LongReadDirectory: longreads/ 13 | LongReads: 14 | - reads1.fq 15 | - reads2.fq.gz 16 | 17 | TempDirectory: tmp/ 18 | OutputDirectory: output/ 19 | 20 | # https://github.com/maickrau/GraphAligner 21 | GraphAlignerPath: GraphAligner 22 | # https://github.com/GATB/bcalm 23 | BcalmPath: bcalm 24 | # https://github.com/GATB/bcalm/blob/master/scripts/convertToGFA.py 25 | BcalmConvertPath: bcalm/scripts/convertToGFA.py 26 | # https://github.com/mourisl/Lighter 27 | LighterPath: lighter 28 | 29 | 30 | ### Misc params. Defaults might work 31 | 32 | # k for error correcting the reads. Try between 10-30 33 | SmallK: 23 34 | # k for the de Bruijn graph. Try between ~1/2 and ~2/3 of short read length 35 | BigK: 63 36 | # minimum k-mer abundance for the de Bruijn graph. Try between 1/100 to 2/100 of short read coverage, but not below 2. 
37 | Abundance: 3 38 | # Parameters for GraphAligner 39 | GraphAlignerParams: -x dbg 40 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | GPP=$(CXX) 2 | CPPFLAGS=-w -std=c++17 -O3 -Iconcurrentqueue -I edlib/include -IBBHash -Izstr/src -Iparallel-hashmap/parallel_hashmap/ `pkg-config --cflags protobuf` `pkg-config --cflags libsparsehash` `pkg-config --cflags mummer` -fopenmp -Wno-unused-parameter 3 | 4 | ODIR=obj 5 | BINDIR=bin 6 | SRCDIR=src 7 | 8 | LIBS=-lm -lz -lboost_serialization -lboost_program_options `pkg-config --libs mummer` `pkg-config --libs protobuf` -lsdsl 9 | JEMALLOCFLAGS= -L`jemalloc-config --libdir` -Wl,-rpath,`jemalloc-config --libdir` -Wl,-Bstatic -ljemalloc -Wl,-Bdynamic `jemalloc-config --libs` 10 | 11 | _DEPS = vg.pb.h fastqloader.h GraphAlignerWrapper.h vg.pb.h BigraphToDigraph.h stream.hpp Aligner.h ThreadReadAssertion.h AlignmentGraph.h CommonUtils.h GfaGraph.h AlignmentCorrectnessEstimation.h MummerSeeder.h ReadCorrection.h MinimizerSeeder.h AlignmentSelection.h EValue.h 12 | DEPS = $(patsubst %, $(SRCDIR)/%, $(_DEPS)) 13 | 14 | _OBJ = Aligner.o vg.pb.o fastqloader.o BigraphToDigraph.o ThreadReadAssertion.o AlignmentGraph.o CommonUtils.o GraphAlignerWrapper.o GfaGraph.o AlignmentCorrectnessEstimation.o MummerSeeder.o ReadCorrection.o MinimizerSeeder.o AlignmentSelection.o EValue.o 15 | OBJ = $(patsubst %, $(ODIR)/%, $(_OBJ)) 16 | 17 | LINKFLAGS = $(CPPFLAGS) -Wl,-Bstatic $(LIBS) -Wl,-Bdynamic -Wl,--as-needed -lpthread -pthread -static-libstdc++ $(JEMALLOCFLAGS) `pkg-config --libs libdivsufsort` `pkg-config --libs libdivsufsort64` 18 | 19 | VERSION := Branch $(shell git rev-parse --abbrev-ref HEAD) commit $(shell git rev-parse HEAD) $(shell git show -s --format=%ci) 20 | 21 | $(shell mkdir -p bin) 22 | $(shell mkdir -p obj) 23 | 24 | $(BINDIR)/GraphChainer: $(ODIR)/AlignerMain.o $(OBJ) edlib/src/edlib.cpp 25 | 
$(GPP) -o $@ $^ $(LINKFLAGS) 26 | 27 | $(ODIR)/GraphAlignerWrapper.o: $(SRCDIR)/GraphAlignerWrapper.cpp $(SRCDIR)/GraphAligner.h $(SRCDIR)/NodeSlice.h $(SRCDIR)/WordSlice.h $(SRCDIR)/ArrayPriorityQueue.h $(SRCDIR)/ComponentPriorityQueue.h $(SRCDIR)/GraphAlignerVGAlignment.h $(SRCDIR)/GraphAlignerGAFAlignment.h $(SRCDIR)/GraphAlignerBitvectorBanded.h $(SRCDIR)/GraphAlignerBitvectorCommon.h $(SRCDIR)/GraphAlignerCommon.h $(SRCDIR)/DijkstraQueue.h $(SRCDIR)/GraphAlignerBitvectorDijkstra.h $(DEPS) 28 | 29 | $(ODIR)/AlignerMain.o: $(SRCDIR)/AlignerMain.cpp $(DEPS) 30 | $(GPP) -c -o $@ $< $(CPPFLAGS) -DVERSION="\"$(VERSION)\"" 31 | 32 | $(ODIR)/%.o: $(SRCDIR)/%.cpp $(DEPS) 33 | $(GPP) -c -o $@ $< $(CPPFLAGS) 34 | 35 | $(ODIR)/vg.pb.o: $(SRCDIR)/vg.pb.cc 36 | $(GPP) -c -o $@ $< $(CPPFLAGS) 37 | 38 | $(SRCDIR)/%.pb.cc $(SRCDIR)/%.pb.h: $(SRCDIR)/%.proto 39 | protoc -I=$(SRCDIR) --cpp_out=$(SRCDIR) $< 40 | 41 | $(BINDIR)/FusionFinder: $(SRCDIR)/FusionFinder.cpp $(OBJ) 42 | $(GPP) -o $@ $^ $(LINKFLAGS) -DVERSION="\"$(VERSION)\"" 43 | 44 | $(BINDIR)/ExtractPathSequence: $(SRCDIR)/ExtractPathSequence.cpp $(ODIR)/CommonUtils.o $(ODIR)/GfaGraph.o $(ODIR)/ThreadReadAssertion.o $(ODIR)/fastqloader.o $(ODIR)/vg.pb.o 45 | $(GPP) -o $@ $^ $(LINKFLAGS) 46 | 47 | $(BINDIR)/SelectLongestAlignment: $(SRCDIR)/SelectLongestAlignment.cpp $(ODIR)/CommonUtils.o $(ODIR)/vg.pb.o $(ODIR)/fastqloader.o 48 | $(GPP) -o $@ $^ $(LINKFLAGS) 49 | 50 | $(BINDIR)/Postprocess: $(SRCDIR)/Postprocess.cpp $(ODIR)/AlignmentSelection.o $(ODIR)/CommonUtils.o $(ODIR)/vg.pb.o $(ODIR)/fastqloader.o 51 | $(GPP) -o $@ $^ $(LINKFLAGS) 52 | 53 | $(BINDIR)/AlignmentSubsequenceIdentity: $(SRCDIR)/AlignmentSubsequenceIdentity.cpp $(ODIR)/CommonUtils.o $(ODIR)/vg.pb.o $(ODIR)/GfaGraph.o $(ODIR)/fastqloader.o $(ODIR)/ThreadReadAssertion.o 54 | $(GPP) -o $@ $^ $(LINKFLAGS) 55 | 56 | $(BINDIR)/UntipRelative: $(SRCDIR)/UntipRelative.cpp $(ODIR)/CommonUtils.o $(ODIR)/vg.pb.o $(ODIR)/GfaGraph.o $(ODIR)/fastqloader.o 
$(ODIR)/ThreadReadAssertion.o 57 | $(GPP) -o $@ $^ $(LINKFLAGS) 58 | 59 | $(BINDIR)/PickAdjacentAlnPairs: $(SRCDIR)/PickAdjacentAlnPairs.cpp $(ODIR)/CommonUtils.o $(ODIR)/vg.pb.o $(ODIR)/GfaGraph.o $(ODIR)/fastqloader.o $(ODIR)/ThreadReadAssertion.o 60 | $(GPP) -o $@ $^ $(LINKFLAGS) 61 | 62 | $(BINDIR)/ExtractCorrectedReads: $(SRCDIR)/ExtractCorrectedReads.cpp $(ODIR)/ReadCorrection.o $(ODIR)/CommonUtils.o $(ODIR)/vg.pb.o $(ODIR)/GfaGraph.o $(ODIR)/fastqloader.o $(ODIR)/ThreadReadAssertion.o 63 | $(GPP) -o $@ $^ $(LINKFLAGS) 64 | 65 | all: $(BINDIR)/GraphAligner $(BINDIR)/ExtractPathSequence $(BINDIR)/SelectLongestAlignment $(BINDIR)/AlignmentSubsequenceIdentity $(BINDIR)/PickAdjacentAlnPairs $(BINDIR)/ExtractCorrectedReads $(BINDIR)/UntipRelative 66 | 67 | clean: 68 | rm -f $(ODIR)/* 69 | rm -f $(BINDIR)/* 70 | rm -f $(SRCDIR)/vg.pb.cc 71 | rm -f $(SRCDIR)/vg.pb.h 72 | -------------------------------------------------------------------------------- /scripts/10fold_test/gen_test.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import os 4 | 5 | N = 2 6 | Bin = "./bin/GraphAligner" 7 | Graphs = ["/mnt/c/Code/Summer/GCimplements/data/LRC/LRC.vg", "/mnt/c/Code/Summer/GCimplements/data/MHC/MHC1.vg"] 8 | Data = "./data/" 9 | Gams = "./gams/" 10 | Logs = "./logs/" 11 | force_redo = True 12 | Threads = 4 13 | # default badread is length ~ (mean=15000,std=10000) 14 | # default pbsim clr is length ~ (mean=3000,std=2300) 15 | # or more `real` length ~ (mean=15000,std=10000) 16 | read_length_mean, read_length_std = 3000, 2300 17 | 18 | def mkdir_safe(path): 19 | if not os.path.exists(path): 20 | os.system(f"mkdir -p {path}") 21 | mkdir_safe(Data) 22 | mkdir_safe(Gams) 23 | mkdir_safe(Logs) 24 | 25 | 26 | 27 | def time_cmd(cmd, log): 28 | return f"/usr/bin/time -o {log} -a -v {cmd}" 29 | def log_cmd(cmd, log): 30 | return f"{cmd} 1>>{log} 2>>{log}" 31 | def run(cmd, log = "", t = True, l = True, v = True): 32 | if v: 33 | 
print(cmd) 34 | if log != "": 35 | open(log, 'a').write(cmd + '\n') 36 | if t: 37 | cmd = time_cmd(cmd, log) 38 | if l: 39 | cmd = log_cmd(cmd, log) 40 | try: 41 | os.system(cmd) 42 | except KeyboardInterrupt: 43 | exit(1) 44 | 45 | params = [] 46 | params.append((150,150,10000)) 47 | # [(100,31), (100,17), ] 48 | # for L, S in [(150,150)]: 49 | # for G in [10000]: 50 | # params.append((L, S, G)) 51 | 52 | gen_log = f"{Logs}/gen.log.txt" 53 | for Graph in Graphs: 54 | for idx in range(N): 55 | seed = idx 56 | id = Graph.split('/')[-1].split('.')[0] + '_' + str(idx) 57 | Reads_prefix = f"{Data}/{id}" 58 | Reads = f"{Reads_prefix}.fastq" 59 | if not os.path.exists(Reads): 60 | print(f"generating read set #{idx} to {id}") 61 | Ref = f"{Data}/{id}.fasta" 62 | run(f"{Bin} --generate-path --generate-path-seed {seed} -g {Graph} -f {Ref} -x vg -a {Data}/{id}.gam", gen_log) 63 | # id.path.txt has the node indices of the generated path 64 | run(f"mv {Data}/{id}.gam {Data}/{id}.path.txt", gen_log) 65 | # # simulate a PacBio long read dataset of 15x coverage using `badread` (commit 9e030e84849281e7dc92f0c9767b601c4dc9701e from https://github.com/rrwick/Badread.git) 66 | # run(f"badread simulate --seed {seed} --reference {Ref} --quantity 15x --length 15000,10000 --error_model pacbio --junk_reads 0 --random_reads 0 --chimeras 0 > {Reads} 2>{Data}/{id}_br.log.txt", gen_log, True, False) 67 | 68 | # simulate a PacBio long read dataset of 20x coverage using 'pbsim' (commit e014b1dd40e87a8799346a9835d70a4da3dc857c from https://github.com/pfaucon/PBSIM-PacBio-Simulator.git) 69 | prefix = 'xxxx' 70 | run(f"./bin/pbsim --data-type CLR --depth 20 --seed {seed} --model_qc ./bin/model_qc_clr {Ref} --prefix {prefix} --length-mean {read_length_mean} --length-sd {read_length_std}", gen_log, True, True) 71 | # rename the reads generated 72 | run(f"mv ./{prefix}_0001.fastq {Reads}") 73 | ReadsMaf = f"{Reads_prefix}.maf" 74 | run(f"mv ./{prefix}_0001.maf {ReadsMaf}") 75 | 76 | # run(f"badread 
simulate --seed {seed} --reference {Ref} --quantity 15x --length 15000,10000 --error_model pacbio --junk_reads 0 --random_reads 0 --chimeras 0 > {Reads} 2>{Data}/{id}_br.log.txt", gen_log, True, False) 77 | 78 | long_log = f"{Logs}/{id}_long.log.txt" 79 | long_gam = f"{Gams}/{id}_long.gam" 80 | if force_redo or not os.path.exists(long_gam): 81 | run(f"{Bin} -t {Threads} -x vg -f {Reads} -g {Graph} -a {long_gam}", long_log) 82 | 83 | aln_clc_log = f"{Logs}/{id}_long.log.txt" 84 | for L, S, G in params: 85 | clc_gam = f"{Gams}/{id}_clc_{L}_{S}_{G}.gam" 86 | clc_log = f"{Gams}/{id}_clc_{L}_{S}_{G}.log.txt" 87 | if force_redo or not os.path.exists(clc_gam): 88 | run(f"{Bin} -t {Threads} --colinear-chaining -x vg -f {Reads} -g {Graph} -a {clc_gam} --colinear-gap {G} --colinear-split-len {L} --colinear-split-gap {S} --short-verbose", clc_log) 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /scripts/summary.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import os.path 4 | import json 5 | import edlib 6 | # 0 1 2 3 4 7 | # python summary.py ../data/LRC/LRC.gfa ./data/badreads.fastq ga outdir 8 | 9 | graph = sys.argv[1] 10 | reads = sys.argv[2] 11 | id = sys.argv[3] 12 | 13 | outdir = "out" 14 | if 4 < len(sys.argv): 15 | outdir = sys.argv[4] 16 | if not outdir.endswith(id + '/'): 17 | outdir = outdir + '/' + id + '/' 18 | 19 | def LoadGfaGraph(filename): 20 | VL, E = {}, {} 21 | for line in open(filename).readlines(): 22 | if line[0] == 'S': 23 | # S 92533 A 24 | i, s = line[1:].strip().split() 25 | VL[int(i)] = s 26 | elif line[0] == 'L': 27 | # L 104890 + 104892 + 0M 28 | li, lr, ri, rr, ov = line[1:].strip().split() 29 | li, ri = int(li), int(ri) 30 | if li not in E: 31 | E[li] = [] 32 | E[li].append(ri) 33 | return VL, E 34 | 35 | VL, E = LoadGfaGraph(graph) 36 | 37 | ed_global = lambda s1, s2 : edlib.align(s1, s2, mode='NW')['editDistance'] 38 | 
ed_local = lambda s1, s2 : edlib.align(s1, s2, mode='HW')['editDistance'] 39 | list2idx = lambda a: { a[i] : i for i in range(len(a)) } 40 | revc = lambda s: ''.join({"A":"T","T":"A","C":"G","G":"C"}[c] for c in s[::-1]) 41 | 42 | import vg_pb2 43 | import gzip 44 | from google.protobuf.internal.encoder import _VarintBytes 45 | from google.protobuf.internal.decoder import _DecodeVarint32, _VarintDecoder 46 | def _VarintDecoder(mask): 47 | local_ord = ord 48 | def DecodeVarint(buffer, pos): 49 | result = 0 50 | shift = 0 51 | while 1: 52 | b = local_ord(buffer[pos]) 53 | result |= ((b & 0x7f) << shift) 54 | pos += 1 55 | if not (b & 0x80): 56 | result &= mask 57 | return (result, pos) 58 | shift += 7 59 | # if shift >= 64: 60 | # raise _DecodeError('Too many bytes when decoding varint.') 61 | return DecodeVarint 62 | _DecodeVarint64 = _VarintDecoder((1 << 64) - 1) 63 | def read_alignments(gam_filename): 64 | with open(gam_filename, 'rb') as f: 65 | buf = gzip.GzipFile(fileobj=f).read() 66 | n = 0 67 | while n < len(buf): 68 | an, n = _DecodeVarint32(buf, n) 69 | for i in range(an): 70 | msg_len, n = _DecodeVarint32(buf, n) 71 | msg_buf = buf[n:n+msg_len] 72 | n += msg_len 73 | aln = vg_pb2.Alignment() 74 | aln.ParseFromString(msg_buf) 75 | yield aln 76 | 77 | def parse_alignment(aln): 78 | # bps = sum(len(VL[x.position.node_id]) for x in a.path.mapping) 79 | name = aln.name.split()[0] 80 | seq = '' 81 | 82 | rev_cnt = 0 83 | for x in aln.path.mapping: 84 | idx = x.position.node_id 85 | ll = VL[idx] 86 | if x.position.is_reverse: 87 | rev_cnt += 1 88 | seq += revc(ll) 89 | else: 90 | seq += ll 91 | return {'name':name, 'seq':seq, 'path_cnt':len(aln.path.mapping), 'revcnt':rev_cnt, 'path_bps':len(seq)} 92 | 93 | def parse_gam(filename): 94 | ret = {} 95 | for aln in read_alignments(filename): 96 | a = parse_alignment(aln) 97 | ret[a['name']] = a 98 | return ret 99 | 100 | seqs_long = parse_gam(f'{outdir}{id}_long.gam') 101 | seqs_clcs = 
def read_fastq(fastq_filename):
    """Yield (header, sequence) pairs from a FASTQ file.

    The header is the full '@...' line (stripped, '@' kept) and the sequence
    is the single line that follows it.

    A record's quality line may itself begin with '@'. To avoid reporting it
    as a header, the '+' separator and the quality line are skipped whenever
    the line two below the header starts with '+'.

    Args:
        fastq_filename: path of the FASTQ file.

    Yields:
        (info, seq) tuples of stripped strings.

    Raises:
        IndexError: if the file ends right after a header line.
    """
    # Read everything up front so the handle is closed before yielding.
    with open(fastq_filename) as fin:
        reads_lines = fin.readlines()
    total = len(reads_lines)
    i = 0
    while i < total:
        if reads_lines[i].startswith('@'):
            info = reads_lines[i].strip()
            seq = reads_lines[i + 1].strip()
            yield (info, seq)
            if i + 2 < total and reads_lines[i + 2].startswith('+'):
                # header, sequence, '+', quality: jump over the whole record
                i += 4
                continue
        i += 1
# Experiment driver: generates a simulated read set if it does not exist yet,
# aligns it with plain GraphAligner and with colinear chaining, then runs
# summary.py on both outputs. All output goes under out_$idx/ga.

# output cmd to screen before execution
set -x

# parameters
G=1000
L=150
S=33

# identifiers for this experiment
idx=LRC_15x

# data files
Graph=/mnt/c/Code/Summer/GCimplements/data/LRC/LRC.gfa
Reads=/mnt/d/summer/data/$idx.fastq

# path to binary
Bin=../bin/GraphAligner

# check whether dataset exists
if [ ! -f "$Reads" ]; then
    # sample a longest path on the graph as reference sequence
    Ref=/mnt/d/summer/data/$idx.fasta
    time $Bin --generate-path -g $Graph -f $Ref -x vg -a /mnt/d/summer/data/$idx.gam
    # idx.txt has the node indices of the generated path
    time mv /mnt/d/summer/data/$idx.gam /mnt/d/summer/data/$idx.txt
    # simulate a PacBio long read dataset of 15x coverage using `badread` (commit 9e030e84849281e7dc92f0c9767b601c4dc9701e from https://github.com/rrwick/Badread.git)
    time badread simulate --reference $Ref --quantity 15x --length 15000,10000 --error_model pacbio --junk_reads 0 --random_reads 0 --chimeras 0 > $Reads 2>/mnt/d/summer/data/$idx\_br.log
fi

# experiment output folder
Out=out_$idx/ga
mkdir -p $Out/
Log=$Out/log.txt

echo "params=" $G $L $S > $Log

# use /usr/bin/time to measure "Max memory(kb)", "User time(s)", "System time(s)"
TimeCmd="/usr/bin/time -o $Log -a -f '%Mkb,%Us,%Ss'"

# align directly by original GraphAligner
$TimeCmd $Bin -x vg -f $Reads -g $Graph -a $Out/ga_long.gam 1>>$Log 2>>$Log

# align by colinear chaining
# (--colinear-split-gap needs its leading dashes; without them the option
# name is passed as a stray positional argument)
$TimeCmd $Bin --colinear-chaining -x vg -f $Reads -g $Graph -a $Out/ga_clc.gam --colinear-gap $G --colinear-split-len $L --colinear-split-gap $S 1>>$Log 2>>$Log

# generate summary.csv
$TimeCmd python summary.py $Graph $Reads ga out_$idx 1>>$Log 2>>$Log
| size_t maxCellsPerSlice; 19 | std::vector seedFiles; 20 | std::string outputGAMFile; 21 | std::string outputJSONFile; 22 | std::string outputGAFFile; 23 | std::string outputCorrectedFile; 24 | std::string outputCorrectedClippedFile; 25 | std::string IndexMpcFile; 26 | bool verboseMode; 27 | bool shortVerboseMode; 28 | bool tryAllSeeds; 29 | bool highMemory; 30 | size_t mxmLength; 31 | size_t mumCount; 32 | size_t memCount; 33 | std::string seederCachePrefix; 34 | AlignmentSelection::SelectionMethod alignmentSelectionMethod; 35 | double selectionECutoff; 36 | bool forceGlobal; 37 | bool compressCorrected; 38 | bool compressClipped; 39 | bool preciseClipping; 40 | size_t minimizerLength; 41 | size_t minimizerWindowSize; 42 | double minimizerSeedDensity; 43 | size_t seedClusterMinSize; 44 | double minimizerDiscardMostNumerousFraction; 45 | double seedExtendDensity; 46 | bool nondeterministicOptimizations; 47 | bool optimalDijkstra; 48 | double preciseClippingIdentityCutoff; 49 | int Xdropcutoff; 50 | size_t DPRestartStride; 51 | bool cigarMatchMismatchMerge; 52 | 53 | bool colinearChaining; 54 | bool generatePath; 55 | bool graphStatistics; 56 | long long generatePathSeed; 57 | long long colinearGap; 58 | long long colinearSplitLen; 59 | long long colinearSplitGap; 60 | double samplingStep; 61 | bool fastMode; 62 | 63 | }; 64 | 65 | void alignReads(AlignerParams params); 66 | void replaceDigraphNodeIdsWithOriginalNodeIds(vg::Alignment& alignment, const AlignmentGraph& graph); 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /src/AlignmentCorrectnessEstimation.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "AlignmentCorrectnessEstimation.h" 4 | #include "ThreadReadAssertion.h" 5 | 6 | //empirically from aligning one ONT to its correct position in the genome 7 | const double correctMean = 0.1875; 8 | const double correctStddev = 0.0955; 9 
// Normalizes a vector of log-values in place so that the sum of exp(x) over
// all entries equals 1.
//
// The shift is computed as max + log(sum(exp(x - max))). Subtracting the
// maximum first keeps at least one term equal to exp(0) == 1, so the sum
// cannot underflow to zero even when every entry is very negative. A plain
// sum of exp(x) would be 0 in that case and turn every entry into inf/NaN.
//
// An empty vector, or one whose maximum is not finite, is left unchanged.
void normalize(std::vector<double>& logs)
{
	if (logs.empty()) return;
	double maxLog = logs[0];
	for (const double x : logs)
	{
		if (x > maxLog) maxLog = x;
	}
	if (!std::isfinite(maxLog)) return;
	double scaledSum = 0;
	for (const double x : logs)
	{
		scaledSum += std::exp(x - maxLog);
	}
	// scaledSum >= 1 here, so the logarithm is well defined
	const double shift = maxLog + std::log(scaledSum);
	for (double& x : logs)
	{
		x -= shift;
	}
}
falseFromCorrectTrace(false) 77 | { 78 | } 79 | 80 | bool AlignmentCorrectnessEstimationState::CurrentlyCorrect() const 81 | { 82 | return correctLogOdds > falseLogOdds; 83 | } 84 | 85 | bool AlignmentCorrectnessEstimationState::CorrectFromCorrect() const 86 | { 87 | return correctFromCorrectTrace; 88 | } 89 | 90 | bool AlignmentCorrectnessEstimationState::FalseFromCorrect() const 91 | { 92 | return falseFromCorrectTrace; 93 | } 94 | 95 | double AlignmentCorrectnessEstimationState::CorrectLogOdds() const 96 | { 97 | return correctLogOdds; 98 | } 99 | 100 | double AlignmentCorrectnessEstimationState::FalseLogOdds() const 101 | { 102 | return falseLogOdds; 103 | } 104 | 105 | AlignmentCorrectnessEstimationState AlignmentCorrectnessEstimationState::NextState(int mismatches, int rowSize) const 106 | { 107 | assert(rowSize == 64); 108 | // assert(rowSize == 64 || rowSize == 1); 109 | assert(mismatches >= 0); 110 | AlignmentCorrectnessEstimationState result; 111 | result.correctFromCorrectTrace = correctLogOdds + correctToCorrectTransitionLogProbability >= falseLogOdds + falseToCorrectTransitionLogProbability; 112 | result.falseFromCorrectTrace = correctLogOdds + correctToFalseTransitionLogProbability >= falseLogOdds + falseToFalseTransitionLogProbability; 113 | double newCorrectProbability = std::max(correctLogOdds + correctToCorrectTransitionLogProbability, falseLogOdds + falseToCorrectTransitionLogProbability); 114 | double newFalseProbability = std::max(correctLogOdds + correctToFalseTransitionLogProbability, falseLogOdds + falseToFalseTransitionLogProbability); 115 | assert(precomputedCorrectLogOdds.size() == precomputedWrongLogOdds.size()); 116 | if ((size_t)mismatches < precomputedCorrectLogOdds.size()) 117 | { 118 | newCorrectProbability += precomputedCorrectLogOdds[(size_t)mismatches]; 119 | newFalseProbability += precomputedWrongLogOdds[(size_t)mismatches]; 120 | } 121 | else 122 | { 123 | newCorrectProbability += precomputedCorrectLogOdds.back(); 124 | 
newFalseProbability += precomputedWrongLogOdds.back(); 125 | } 126 | result.correctLogOdds = newCorrectProbability; 127 | result.falseLogOdds = newFalseProbability; 128 | return result; 129 | } 130 | -------------------------------------------------------------------------------- /src/AlignmentCorrectnessEstimation.h: -------------------------------------------------------------------------------- 1 | #ifndef AlignmentCorrectnessEstimation_h 2 | #define AlignmentCorrectnessEstimation_h 3 | 4 | class AlignmentCorrectnessEstimationState 5 | { 6 | public: 7 | AlignmentCorrectnessEstimationState(); 8 | bool CurrentlyCorrect() const; 9 | bool CorrectFromCorrect() const; 10 | bool FalseFromCorrect() const; 11 | double CorrectLogOdds() const; 12 | double FalseLogOdds() const; 13 | AlignmentCorrectnessEstimationState NextState(int mismatches, int rowSize) const; 14 | private: 15 | double correctLogOdds; 16 | double falseLogOdds; 17 | bool correctFromCorrectTrace; 18 | bool falseFromCorrectTrace; 19 | }; 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /src/AlignmentGraph.h: -------------------------------------------------------------------------------- 1 | #ifndef AlignmentGraph_h 2 | #define AlignmentGraph_h 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "ThreadReadAssertion.h" 12 | 13 | 14 | class AlignmentGraph 15 | { 16 | public: 17 | //determines extra band size, shouldn't be too high because of extra slices 18 | //should be 0 mod (wordsize/2 == 32), otherwise storage has overhead 19 | //64 is the fastest out of 32, 64, 96 20 | static constexpr int SPLIT_NODE_SIZE = 64; 21 | static constexpr size_t BP_IN_CHUNK = sizeof(size_t) * 8 / 2; 22 | static constexpr size_t CHUNKS_IN_NODE = (SPLIT_NODE_SIZE + BP_IN_CHUNK - 1) / BP_IN_CHUNK; 23 | 24 | struct NodeChunkSequence 25 | { 26 | size_t& operator[](size_t pos) 27 | { 28 | return s[pos]; 29 | } 30 | size_t 
operator[](size_t pos) const 31 | { 32 | return s[pos]; 33 | } 34 | size_t s[CHUNKS_IN_NODE]; 35 | }; 36 | struct AmbiguousChunkSequence 37 | { 38 | static_assert(SPLIT_NODE_SIZE == sizeof(size_t)*8); 39 | //weird interface because it should behave like NodeChunkSequence, which is just a number 40 | AmbiguousChunkSequence operator[](size_t pos) const 41 | { 42 | AmbiguousChunkSequence result = *this; 43 | result.A >>= pos * BP_IN_CHUNK; 44 | result.C >>= pos * BP_IN_CHUNK; 45 | result.G >>= pos * BP_IN_CHUNK; 46 | result.T >>= pos * BP_IN_CHUNK; 47 | return result; 48 | } 49 | //weird interface because it should behave like NodeChunkSequence, which is just a number 50 | AmbiguousChunkSequence operator>>=(size_t amount) 51 | { 52 | assert(amount % 2 == 0); 53 | A >>= amount / 2; 54 | T >>= amount / 2; 55 | C >>= amount / 2; 56 | G >>= amount / 2; 57 | return *this; 58 | } 59 | //weird interface because it should behave like NodeChunkSequence, which is just a number 60 | AmbiguousChunkSequence operator&(size_t val) 61 | { 62 | return *this; 63 | } 64 | size_t A; 65 | size_t T; 66 | size_t C; 67 | size_t G; 68 | }; 69 | 70 | struct MatrixPosition 71 | { 72 | MatrixPosition(size_t node, size_t nodeOffset, size_t seqPos); 73 | bool operator==(const MatrixPosition& other) const; 74 | bool operator!=(const MatrixPosition& other) const; 75 | size_t node; 76 | size_t nodeOffset; 77 | size_t seqPos; 78 | }; 79 | 80 | class SeedHit 81 | { 82 | public: 83 | SeedHit(size_t seqPos, int nodeId, size_t nodePos) : sequencePosition(seqPos), nodeId(nodeId), nodePos(nodePos) {}; 84 | size_t sequencePosition; 85 | int nodeId; 86 | size_t nodePos; 87 | }; 88 | AlignmentGraph(); 89 | void ReserveNodes(size_t numNodes, size_t numSplitNodes); 90 | void AddNode(int nodeId, const std::string& sequence, const std::string& name, bool reverseNode, const std::vector& breakpoints); 91 | void AddEdgeNodeId(int node_id_from, int node_id_to, size_t startOffset); 92 | void Finalize(int wordSize); 93 
| AlignmentGraph GetSubgraph(const std::unordered_map& nodeMapping) const; 94 | std::pair GetReversePosition(int nodeId, size_t offset) const; 95 | size_t GetReverseNode(size_t node) const; 96 | size_t NodeSize() const; 97 | size_t SizeInBP() const; 98 | size_t NodeOffset(size_t node) const; 99 | size_t NodeID(size_t node) const; 100 | size_t NodeLength(size_t nodeIndex) const; 101 | char NodeSequences(size_t node, size_t offset) const; 102 | NodeChunkSequence NodeChunks(size_t node) const; 103 | AmbiguousChunkSequence AmbiguousNodeChunks(size_t node) const; 104 | size_t GetUnitigNode(int nodeId, size_t offset) const; 105 | // size_t MinDistance(size_t pos, const std::vector& targets) const; 106 | // std::set ProjectForward(const std::set& startpositions, size_t amount) const; 107 | std::string OriginalNodeName(int nodeId) const; 108 | size_t ComponentSize() const; 109 | static AlignmentGraph DummyGraph(); 110 | size_t getDBGoverlap() const; 111 | 112 | struct Anchor { 113 | std::vector path; 114 | size_t x, y; 115 | }; 116 | void buildMPC(); 117 | void buildComponentsMap(); 118 | void loadMPC(const std::string &filename); 119 | void saveMPC(const std::string &filename); 120 | std::vector generatePath(const std::string &seq_out, const std::string &path_out, const size_t seed = 0); 121 | std::vector colinearChaining(const std::vector &anchors, long long sep_limit) const; 122 | std::vector getChainPath(size_t s, size_t t, long long sep_limit) const; 123 | 124 | private: 125 | void fixChainApproxPos(const size_t start); 126 | std::pair findBubble(const size_t start, const std::vector& ignorableTip); 127 | void chainBubble(const size_t start, const std::vector& ignorableTip, std::vector& rank); 128 | phmap::flat_hash_map> chainTips(std::vector& rank, std::vector& ignorableTip); 129 | void chainCycles(std::vector& rank, std::vector& ignorableTip); 130 | void findChains(); 131 | void findLinearizable(); 132 | void AddNode(int nodeId, int offset, const std::string& 
sequence, bool reverseNode); 133 | void RenumberAmbiguousToEnd(); 134 | void doComponentOrder(); 135 | 136 | std::vector> greedyCover(size_t cid) const; 137 | std::vector> shrink(size_t cid, const std::vector> &pc); 138 | void computeMPCIndex(size_t cid, const std::vector> &pc); 139 | bool checkMinPathCover(const std::vector> &pc); 140 | std::pair, size_t> colinearChainingByComponent(size_t cid, const std::vector &anchors, const std::vector &aids, long long sep_limit) const; 141 | 142 | 143 | 144 | 145 | std::vector nodeLength; 146 | std::unordered_map> nodeLookup; 147 | std::unordered_map originalNodeSize; 148 | std::unordered_map originalNodeName; 149 | std::vector nodeOffset; 150 | std::vector nodeIDs; 151 | std::vector> inNeighbors; 152 | std::vector> outNeighbors; 153 | std::vector reverse; 154 | std::vector linearizable; 155 | std::vector nodeSequences; 156 | size_t bpSize; 157 | std::vector ambiguousNodeSequences; 158 | std::vector ambiguousNodes; 159 | std::vector componentNumber; 160 | std::vector chainNumber; 161 | std::vector chainApproxPos; 162 | size_t firstAmbiguous; 163 | size_t DBGoverlap; 164 | bool finalized; 165 | 166 | std::vector component_map; 167 | std::vector component_idx; 168 | std::vector> component_ids; 169 | 170 | std::vector> topo, topo_ids; 171 | std::vector>> mpc, paths; 172 | std::vector>>> forwards, backwards; 173 | // std::vector> backwards; 174 | 175 | template 176 | friend class GraphAligner; 177 | template 178 | friend class GraphAlignerVGAlignment; 179 | template 180 | friend class GraphAlignerGAFAlignment; 181 | template 182 | friend class GraphAlignerBitvectorBanded; 183 | template 184 | friend class GraphAlignerBitvectorCommon; 185 | template 186 | friend class GraphAlignerBitvectorDijkstra; 187 | friend class DirectedGraph; 188 | friend class MinimizerSeeder; 189 | }; 190 | 191 | 192 | #endif 193 | -------------------------------------------------------------------------------- /src/AlignmentSelection.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "AlignmentSelection.h" 5 | #include "EValue.h" 6 | 7 | //an overlap which is larger than the fraction cutoff of the smaller alignment means the alignments are incompatible 8 | //eg alignments 12000bp and 15000bp, overlap of 12000*0.05 = 600bp means they are incompatible 9 | const float OverlapIncompatibleFractionCutoff = 0.05; 10 | 11 | namespace AlignmentSelection 12 | { 13 | bool alignmentIncompatible(const AlignmentResult::AlignmentItem& left, const AlignmentResult::AlignmentItem& right) 14 | { 15 | auto minOverlapLen = std::min((left.alignmentEnd - left.alignmentStart), (right.alignmentEnd - right.alignmentStart)) * OverlapIncompatibleFractionCutoff; 16 | assert(left.alignmentStart >= 0); 17 | assert(right.alignmentStart >= 0); 18 | size_t leftStart = left.alignmentStart; 19 | size_t leftEnd = left.alignmentEnd; 20 | size_t rightStart = right.alignmentStart; 21 | size_t rightEnd = right.alignmentEnd; 22 | if (leftStart > rightStart) 23 | { 24 | std::swap(leftStart, rightStart); 25 | std::swap(leftEnd, rightEnd); 26 | } 27 | int overlap = 0; 28 | assert(leftStart <= rightStart); 29 | if (leftEnd > rightStart) overlap = leftEnd - rightStart; 30 | return overlap > minOverlapLen; 31 | } 32 | 33 | //lower E-value is better 34 | bool alignmentECompare(const AlignmentResult::AlignmentItem& left, const AlignmentResult::AlignmentItem& right, size_t m, size_t n, const EValueCalculator& EValueCalc) 35 | { 36 | return EValueCalc.getEValue(m, n, left.alignmentLength(), left.alignmentScore) < EValueCalc.getEValue(m, n, right.alignmentLength(), right.alignmentScore); 37 | } 38 | 39 | bool alignmentScoreCompare(const AlignmentResult::AlignmentItem& left, const AlignmentResult::AlignmentItem& right, const EValueCalculator& EValueCalc) 40 | { 41 | return EValueCalc.getAlignmentScore(left.alignmentLength(), left.alignmentScore) > 
EValueCalc.getAlignmentScore(right.alignmentLength(), right.alignmentScore); 42 | } 43 | 44 | //longer is better, after that lower score is better 45 | bool alignmentLengthCompare(const AlignmentResult::AlignmentItem& left, const AlignmentResult::AlignmentItem& right) 46 | { 47 | if ((left.alignmentEnd - left.alignmentStart) > (right.alignmentEnd - right.alignmentStart)) return true; 48 | if ((right.alignmentEnd - right.alignmentStart) > (left.alignmentEnd - left.alignmentStart)) return false; 49 | if (left.alignmentScore < right.alignmentScore) return true; 50 | return false; 51 | } 52 | 53 | std::vector SelectAlignments(const std::vector& allAlignments, SelectionOptions options) 54 | { 55 | // roundabout to fit the signature of const ref while allowing filtering 56 | std::vector filteredByE; 57 | if (options.ECutoff != -1) 58 | { 59 | filteredByE = SelectECutoff(allAlignments, options.graphSize, options.readSize, options.ECutoff, options.EValueCalc); 60 | } 61 | const std::vector& alignments { (options.ECutoff != -1) ? 
filteredByE : allAlignments }; 62 | switch(options.method) 63 | { 64 | case GreedyLength: 65 | return GreedySelectAlignments(alignments, alignmentLengthCompare); 66 | case GreedyScore: 67 | return GreedySelectAlignments(alignments, [options](const AlignmentResult::AlignmentItem& left, const AlignmentResult::AlignmentItem& right) { return alignmentScoreCompare(left, right, options.EValueCalc); }); 68 | case GreedyE: 69 | return GreedySelectAlignments(alignments, [options](const AlignmentResult::AlignmentItem& left, const AlignmentResult::AlignmentItem& right) {return alignmentECompare(left, right, options.graphSize, options.readSize, options.EValueCalc); }); 70 | case ScheduleInverseESum: 71 | return ScheduleSelectAlignments(alignments, [options](const AlignmentResult::AlignmentItem aln) { return 1.0 / options.EValueCalc.getEValue(options.graphSize, options.readSize, aln.alignmentLength(), aln.alignmentScore); }); 72 | case ScheduleInverseEProduct: 73 | return ScheduleSelectAlignments(alignments, [options](const AlignmentResult::AlignmentItem aln) { return -log(options.EValueCalc.getEValue(options.graphSize, options.readSize, aln.alignmentLength(), aln.alignmentScore)); }); 74 | case ScheduleScore: 75 | return ScheduleSelectAlignments(alignments, [options](const AlignmentResult::AlignmentItem aln) { return options.EValueCalc.getAlignmentScore(aln.alignmentLength(), aln.alignmentScore); }); 76 | case ScheduleLength: 77 | return ScheduleSelectAlignments(alignments, [](const AlignmentResult::AlignmentItem aln) { return (aln.alignmentEnd - aln.alignmentStart) + 0.5 - 0.5 / (aln.alignmentScore); }); 78 | default: 79 | case All: 80 | return alignments; 81 | } 82 | assert(false); 83 | return alignments; 84 | } 85 | 86 | std::vector SelectECutoff(const std::vector& alignments, size_t m, size_t n, double cutoff, const EValueCalculator& EValueCalc) 87 | { 88 | std::vector result; 89 | for (size_t i = 0; i < alignments.size(); i++) 90 | { 91 | if (EValueCalc.getEValue(m, n, 
alignments[i].alignmentLength(), alignments[i].alignmentScore) <= cutoff) result.push_back(alignments[i]); 92 | } 93 | return result; 94 | } 95 | 96 | } -------------------------------------------------------------------------------- /src/AlignmentSelection.h: -------------------------------------------------------------------------------- 1 | #ifndef AlignmentSelection_h 2 | #define AlignmentSelection_h 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "vg.pb.h" 9 | #include "GraphAlignerCommon.h" 10 | #include "EValue.h" 11 | 12 | namespace AlignmentSelection 13 | { 14 | enum SelectionMethod 15 | { 16 | GreedyLength, 17 | GreedyScore, 18 | GreedyE, 19 | ScheduleInverseESum, 20 | ScheduleInverseEProduct, 21 | ScheduleScore, 22 | ScheduleLength, 23 | All 24 | }; 25 | struct SelectionOptions 26 | { 27 | SelectionMethod method; 28 | size_t graphSize; 29 | size_t readSize; 30 | double ECutoff; 31 | EValueCalculator EValueCalc; 32 | }; 33 | std::vector SelectAlignments(const std::vector& alignments, SelectionOptions options); 34 | bool alignmentIncompatible(const AlignmentResult::AlignmentItem& left, const AlignmentResult::AlignmentItem& right); 35 | 36 | template 37 | std::vector GreedySelectAlignments(const std::vector& alignments, AlnScorer alnScorer) 38 | { 39 | std::vector items; 40 | for (size_t i = 0; i < alignments.size(); i++) 41 | { 42 | items.push_back(i); 43 | } 44 | std::sort(items.begin(), items.end(), [&alignments, alnScorer](size_t left, size_t right) { return alnScorer(alignments[left], alignments[right]); }); 45 | std::vector result; 46 | for (auto i : items) 47 | { 48 | if (!std::any_of(result.begin(), result.end(), [&alignments, i](const AlignmentResult::AlignmentItem& existing) { return alignmentIncompatible(existing, alignments[i]); })) 49 | { 50 | result.push_back(alignments[i]); 51 | } 52 | } 53 | return result; 54 | } 55 | 56 | template 57 | std::vector ScheduleSelectAlignments(const std::vector& alignments, AlnScorer 
alnScorer) 58 | { 59 | std::vector items; 60 | for (size_t i = 0; i < alignments.size(); i++) 61 | { 62 | items.push_back(i); 63 | } 64 | std::sort(items.begin(), items.end(), [&alignments](size_t left, size_t right) { return alignments[left].alignmentEnd < alignments[right].alignmentEnd; }); 65 | std::vector backtrace; 66 | std::vector score; 67 | backtrace.resize(items.size(), std::numeric_limits::max()); 68 | score.resize(items.size(), 0); 69 | for (size_t i = 0; i < items.size(); i++) 70 | { 71 | double rawScore = alnScorer(alignments[items[i]]); 72 | score[i] = rawScore; 73 | for (size_t j = 0; j < i; j++) 74 | { 75 | if (alignmentIncompatible(alignments[items[i]], alignments[items[j]])) continue; 76 | if (score[j] + rawScore > score[i]) 77 | { 78 | backtrace[i] = j; 79 | score[i] = score[j] + rawScore; 80 | } 81 | } 82 | } 83 | size_t maxPos = 0; 84 | for (size_t i = 0; i < items.size(); i++) 85 | { 86 | if (score[i] > score[maxPos]) maxPos = i; 87 | } 88 | std::vector result; 89 | while (maxPos != std::numeric_limits::max()) 90 | { 91 | result.push_back(alignments[items[maxPos]]); 92 | assert(backtrace[maxPos] < maxPos || backtrace[maxPos] == std::numeric_limits::max()); 93 | maxPos = backtrace[maxPos]; 94 | } 95 | return result; 96 | } 97 | 98 | std::vector SelectECutoff(const std::vector& alignments, size_t m, size_t n, double cutoff, const EValueCalculator& EValueCalc); 99 | 100 | }; 101 | 102 | #endif 103 | -------------------------------------------------------------------------------- /src/AlignmentSubsequenceIdentity.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "CommonUtils.h" 6 | #include "GfaGraph.h" 7 | #include "fastqloader.h" 8 | 9 | bool fakeLengths = false; 10 | 11 | namespace std 12 | { 13 | template <> 14 | struct hash> 15 | { 16 | size_t operator()(const std::pair& x) const 17 | { 18 | return hash()(x.first) ^ hash()(x.second); 19 | } 20 
| }; 21 | } 22 | 23 | struct Node 24 | { 25 | int nodeId; 26 | bool reverse; 27 | bool operator==(const Node& other) const 28 | { 29 | return nodeId == other.nodeId && reverse == other.reverse; 30 | } 31 | }; 32 | 33 | struct Alignment 34 | { 35 | std::vector path; 36 | std::vector length; 37 | std::string name; 38 | }; 39 | 40 | Alignment convertVGtoAlignment(const vg::Alignment& vgAln) 41 | { 42 | Alignment result; 43 | result.name = vgAln.name(); 44 | for (int i = 0; i < vgAln.path().mapping_size(); i++) 45 | { 46 | result.path.emplace_back(); 47 | result.path.back().nodeId = vgAln.path().mapping(i).position().node_id(); 48 | result.path.back().reverse = vgAln.path().mapping(i).position().is_reverse(); 49 | result.length.emplace_back(vgAln.path().mapping(i).edit(0).to_length()); 50 | } 51 | return result; 52 | } 53 | 54 | Alignment reverse(const Alignment& old) 55 | { 56 | Alignment result; 57 | result.name = old.name; 58 | for (size_t i = 0; i < old.path.size(); i++) 59 | { 60 | result.path.emplace_back(); 61 | result.path.back().nodeId = old.path[i].nodeId; 62 | result.path.back().reverse = !old.path[i].reverse; 63 | } 64 | result.length = old.length; 65 | std::reverse(result.path.begin(), result.path.end()); 66 | std::reverse(result.length.begin(), result.length.end()); 67 | return result; 68 | } 69 | 70 | std::pair getAlignmentIdentity(const Alignment& read, const Alignment& transcript, const std::unordered_map& readLengths) 71 | { 72 | std::vector> matchLen; 73 | matchLen.resize(read.path.size()+1); 74 | for (size_t i = 0; i < read.path.size()+1; i++) 75 | { 76 | matchLen[i].resize(transcript.path.size()+1, 0); 77 | } 78 | size_t maxMatch = 0; 79 | size_t maxmatchIndex = 0; 80 | size_t maxmatchMissing = 0; 81 | for (size_t i = 0; i < read.path.size(); i++) 82 | { 83 | for (size_t j = 0; j < transcript.path.size(); j++) 84 | { 85 | matchLen[i+1][j+1] = std::max(matchLen[i+1][j], matchLen[i][j+1]); 86 | if (read.path[i] == transcript.path[j]) 87 | { 88 | 
matchLen[i+1][j+1] = std::max(matchLen[i+1][j+1], matchLen[i][j] + std::min(read.length[i], transcript.length[j])); 89 | } 90 | else 91 | { 92 | matchLen[i+1][j+1] = std::max(matchLen[i+1][j+1], matchLen[i][j]); 93 | } 94 | if (matchLen[i+1][j+1] > maxMatch) 95 | { 96 | maxMatch = matchLen[i+1][j+1]; 97 | maxmatchIndex = j; 98 | maxmatchMissing = 0; 99 | if (read.length[i] < transcript.length[j]) maxmatchMissing = transcript.length[j] - read.length[i]; 100 | } 101 | } 102 | } 103 | size_t threeprimeDistance = maxmatchMissing; 104 | for (size_t i = maxmatchIndex+1; i < transcript.length.size(); i++) 105 | { 106 | threeprimeDistance += transcript.length[i]; 107 | } 108 | assert(maxMatch >= 0); 109 | double length; 110 | if (fakeLengths) 111 | { 112 | length = 1; 113 | } 114 | else 115 | { 116 | assert(readLengths.count(read.name) == 1); 117 | assert(maxMatch <= readLengths.at(read.name)); 118 | length = (double)maxMatch / (double)readLengths.at(read.name); 119 | } 120 | return std::make_pair(length, threeprimeDistance); 121 | } 122 | 123 | int main(int argc, char** argv) 124 | { 125 | std::string transcriptFile { argv[1] }; 126 | std::string readAlignmentFile { argv[2] }; 127 | std::string readFastaFile { argv[3] }; 128 | 129 | if (argc >= 5 && argv[4][0] == '1') fakeLengths = true; 130 | 131 | std::unordered_map readLengths; 132 | { 133 | auto reads = loadFastqFromFile(readFastaFile); 134 | for (auto read : reads) 135 | { 136 | readLengths[read.seq_id] = read.sequence.size(); 137 | } 138 | } 139 | 140 | std::vector transcripts; 141 | std::vector reads; 142 | { 143 | auto vgtranscripts = CommonUtils::LoadVGAlignments(transcriptFile); 144 | for (auto vg : vgtranscripts) 145 | { 146 | transcripts.push_back(convertVGtoAlignment(vg)); 147 | } 148 | } 149 | { 150 | auto vgreads = CommonUtils::LoadVGAlignments(readAlignmentFile); 151 | for (auto vg : vgreads) 152 | { 153 | reads.push_back(convertVGtoAlignment(vg)); 154 | } 155 | } 156 | 157 | std::unordered_map> 
transcriptsCrossingNode; 158 | for (size_t i = 0; i < transcripts.size(); i++) 159 | { 160 | for (int j = 0; j < transcripts[i].path.size(); j++) 161 | { 162 | transcriptsCrossingNode[transcripts[i].path[j].nodeId].push_back(i); 163 | } 164 | } 165 | 166 | std::unordered_map, std::pair> readTranscriptBestPair; 167 | 168 | for (size_t readi = 0; readi < reads.size(); readi++) 169 | { 170 | auto read = reads[readi]; 171 | std::set possibleTranscripts; 172 | for (size_t i = 0; i < read.path.size(); i++) 173 | { 174 | possibleTranscripts.insert(transcriptsCrossingNode[read.path[i].nodeId].begin(), transcriptsCrossingNode[read.path[i].nodeId].end()); 175 | } 176 | auto reverseread = reverse(read); 177 | for (auto i : possibleTranscripts) 178 | { 179 | auto identityFw = getAlignmentIdentity(read, transcripts[i], readLengths); 180 | auto identityBw = getAlignmentIdentity(reverseread, transcripts[i], readLengths); 181 | auto bigger = identityFw; 182 | if (identityBw.first > identityFw.first) bigger = identityBw; 183 | if (bigger.first > 0 && (readTranscriptBestPair.count(std::make_pair(readi, i)) == 0 || readTranscriptBestPair[std::make_pair(readi, i)].first < bigger.first)) 184 | { 185 | readTranscriptBestPair[std::make_pair(readi, i)] = bigger; 186 | } 187 | } 188 | } 189 | for (auto mapping : readTranscriptBestPair) 190 | { 191 | std::cout << reads[mapping.first.first].name << "\t" << transcripts[mapping.first.second].name << "\t" << mapping.second.first << "\t" << mapping.second.second << std::endl; 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /src/ArrayPriorityQueue.h: -------------------------------------------------------------------------------- 1 | #ifndef ArrayPriorityQueue_h 2 | #define ArrayPriorityQueue_h 3 | 4 | #include 5 | #include 6 | #include "ThreadReadAssertion.h" 7 | 8 | template 9 | class ArrayPriorityQueue 10 | { 11 | public: 12 | constexpr bool IsComponentPriorityQueue() { return false; } 13 
| ArrayPriorityQueue(size_t maxPriority, size_t maxExtras) : 14 | activeQueues(), 15 | extras(), 16 | queues(), 17 | numItems(0) 18 | { 19 | initialize(maxPriority, maxExtras); 20 | } 21 | ArrayPriorityQueue() : 22 | activeQueues(), 23 | extras(), 24 | queues(), 25 | numItems(0) 26 | { 27 | } 28 | template 29 | typename std::enable_if::type initialize(size_t maxPriority, size_t maxExtras) 30 | { 31 | queues.resize(maxPriority); 32 | } 33 | template 34 | typename std::enable_if::type initialize(size_t maxPriority, size_t maxExtras) 35 | { 36 | extras.resize(maxExtras, std::vector{}); 37 | queues.resize(maxPriority); 38 | } 39 | #ifdef NDEBUG 40 | __attribute__((always_inline)) 41 | #endif 42 | T& top() 43 | { 44 | assert(activeQueues.size() > 0); 45 | size_t queue = activeQueues.top(); 46 | assert(queues[queue].size() > 0); 47 | return queues[queue].back(); 48 | } 49 | #ifdef NDEBUG 50 | __attribute__((always_inline)) 51 | #endif 52 | void pop() 53 | { 54 | size_t queue = activeQueues.top(); 55 | assert(queues[queue].size() > 0); 56 | queues[queue].pop_back(); 57 | if (queues[queue].size() == 0) activeQueues.pop(); 58 | numItems--; 59 | } 60 | #ifdef NDEBUG 61 | __attribute__((always_inline)) 62 | #endif 63 | size_t size() const 64 | { 65 | return numItems; 66 | } 67 | void insert(size_t component, int score, const T& item) 68 | { 69 | assert(false); 70 | } 71 | #ifdef NDEBUG 72 | __attribute__((always_inline)) 73 | #endif 74 | void insert(size_t priority, const T& item) 75 | { 76 | assert(priority < queues.size()); 77 | queues[priority].push_back(item); 78 | assert(SparseStorage || getId(item) < extras.size()); 79 | extras[getId(item)].push_back(item); 80 | if (queues[priority].size() == 1) activeQueues.emplace(priority); 81 | numItems++; 82 | } 83 | void clear() 84 | { 85 | while (activeQueues.size() > 0) 86 | { 87 | size_t queue = activeQueues.top(); 88 | for (auto item : queues[queue]) 89 | { 90 | removeExtras(getId(item)); 91 | } 92 | queues[queue].clear(); 93 
// Builds a 256-entry lookup table, indexed by character code, that is true for
// the accepted sequence characters: A, C, G, T and the ambiguity codes
// Y, R, W, S, K, M, D, V, H, B, N, each in upper and lower case.
static std::vector<bool> getAllowedNucleotides()
{
	static const char allowedCharacters[] = "aAcCgGtTyYrRwWsSkKmMdDvVhHbBnN";
	std::vector<bool> result(256, false);
	for (const char* c = allowedCharacters; *c != '\0'; ++c)
	{
		result[static_cast<unsigned char>(*c)] = true;
	}
	return result;
}

auto allowed = getAllowedNucleotides();
result['c'] = true; 19 | result['C'] = true; 20 | result['g'] = true; 21 | result['G'] = true; 22 | result['t'] = true; 23 | result['T'] = true; 24 | result['y'] = true; 25 | result['Y'] = true; 26 | result['r'] = true; 27 | result['R'] = true; 28 | result['w'] = true; 29 | result['W'] = true; 30 | result['s'] = true; 31 | result['S'] = true; 32 | result['k'] = true; 33 | result['K'] = true; 34 | result['m'] = true; 35 | result['M'] = true; 36 | result['d'] = true; 37 | result['D'] = true; 38 | result['v'] = true; 39 | result['V'] = true; 40 | result['h'] = true; 41 | result['H'] = true; 42 | result['b'] = true; 43 | result['B'] = true; 44 | result['n'] = true; 45 | result['N'] = true; 46 | return result; 47 | } 48 | 49 | auto allowed = getAllowedNucleotides(); 50 | 51 | DirectedGraph::Node::Node(int nodeId, int originalNodeId, bool rightEnd, std::string sequence, std::string name) : 52 | nodeId(nodeId), 53 | originalNodeId(originalNodeId), 54 | rightEnd(rightEnd), 55 | sequence(sequence), 56 | name(name) 57 | { 58 | } 59 | 60 | DirectedGraph::Edge::Edge(size_t from, size_t to, size_t overlap) : 61 | fromId(from), 62 | toId(to), 63 | overlap(overlap) 64 | { 65 | } 66 | 67 | std::pair DirectedGraph::ConvertVGNodeToNodes(const vg::Node& node) 68 | { 69 | assert(node.id() < std::numeric_limits::max() / 2); 70 | assert(node.id()+1 < std::numeric_limits::max() / 2); 71 | return std::make_pair(DirectedGraph::Node { (int)node.id() * 2, (int)node.id(), true, node.sequence(), node.name() }, DirectedGraph::Node { (int)node.id() * 2 + 1, (int)node.id(), false, CommonUtils::ReverseComplement(node.sequence()), node.name() }); 72 | } 73 | 74 | std::pair DirectedGraph::ConvertVGEdgeToEdges(const vg::Edge& edge) 75 | { 76 | assert(edge.overlap() == 0); 77 | size_t fromLeft, fromRight, toLeft, toRight; 78 | if (edge.from_start()) 79 | { 80 | fromLeft = edge.from() * 2; 81 | fromRight = edge.from() * 2 + 1; 82 | } 83 | else 84 | { 85 | fromLeft = edge.from() * 2 + 1; 86 | fromRight 
= edge.from() * 2; 87 | } 88 | if (edge.to_end()) 89 | { 90 | toLeft = edge.to() * 2; 91 | toRight = edge.to() * 2 + 1; 92 | } 93 | else 94 | { 95 | toLeft = edge.to() * 2 + 1; 96 | toRight = edge.to() * 2; 97 | } 98 | return std::make_pair(DirectedGraph::Edge { fromRight, toRight, 0 }, DirectedGraph::Edge { toLeft, fromLeft, 0 }); 99 | } 100 | 101 | std::pair DirectedGraph::ConvertGFANodeToNodes(int id, const std::string& sequence, const std::string& name) 102 | { 103 | return std::make_pair(DirectedGraph::Node { id * 2, id, true, sequence, name }, DirectedGraph::Node { id * 2 + 1, id, false, CommonUtils::ReverseComplement(sequence), name }); 104 | } 105 | 106 | std::pair DirectedGraph::ConvertGFAEdgeToEdges(int from, const std::string& fromstart, int to, const std::string& toend, size_t overlap) 107 | { 108 | assert(fromstart == "+" || fromstart == "-"); 109 | assert(toend == "+" || toend == "-"); 110 | size_t fromLeft, fromRight, toLeft, toRight; 111 | if (fromstart == "-") 112 | { 113 | fromLeft = from * 2; 114 | fromRight = from * 2 + 1; 115 | } 116 | else 117 | { 118 | fromLeft = from * 2 + 1; 119 | fromRight = from * 2; 120 | } 121 | if (toend == "-") 122 | { 123 | toLeft = to * 2; 124 | toRight = to * 2 + 1; 125 | } 126 | else 127 | { 128 | toLeft = to * 2 + 1; 129 | toRight = to * 2; 130 | } 131 | return std::make_pair(DirectedGraph::Edge { fromRight, toRight, overlap }, DirectedGraph::Edge { toLeft, fromLeft, overlap }); 132 | } 133 | 134 | AlignmentGraph DirectedGraph::StreamVGGraphFromFile(std::string filename) 135 | { 136 | AlignmentGraph result; 137 | { 138 | std::vector breakpointsFw; 139 | std::vector breakpointsBw; 140 | breakpointsFw.push_back(0); 141 | breakpointsBw.push_back(0); 142 | std::ifstream graphfile { filename, std::ios::in | std::ios::binary }; 143 | std::function lambda = [&result, &breakpointsFw, &breakpointsBw](vg::Graph& g) { 144 | for (int i = 0; i < g.node_size(); i++) 145 | { 146 | for (size_t j = 0; j < 
g.node(i).sequence().size(); j++) 147 | { 148 | if (!allowed[g.node(i).sequence()[j]]) 149 | { 150 | throw CommonUtils::InvalidGraphException("Invalid sequence character: " + g.node(i).sequence()[j]); 151 | } 152 | } 153 | auto nodes = ConvertVGNodeToNodes(g.node(i)); 154 | assert(nodes.first.sequence.size() == nodes.second.sequence.size()); 155 | breakpointsFw.push_back(g.node(i).sequence().size()); 156 | breakpointsBw.push_back(g.node(i).sequence().size()); 157 | result.AddNode(nodes.first.nodeId, nodes.first.sequence, nodes.first.name, !nodes.first.rightEnd, breakpointsFw); 158 | result.AddNode(nodes.second.nodeId, nodes.second.sequence, nodes.second.name, !nodes.second.rightEnd, breakpointsBw); 159 | breakpointsFw.erase(breakpointsFw.begin()+1, breakpointsFw.end()); 160 | breakpointsBw.erase(breakpointsBw.begin()+1, breakpointsBw.end()); 161 | } 162 | }; 163 | stream::for_each(graphfile, lambda); 164 | } 165 | { 166 | std::ifstream graphfile { filename, std::ios::in | std::ios::binary }; 167 | std::function lambda = [&result](vg::Graph& g) { 168 | for (int i = 0; i < g.edge_size(); i++) 169 | { 170 | auto edges = ConvertVGEdgeToEdges(g.edge(i)); 171 | result.AddEdgeNodeId(edges.first.fromId, edges.first.toId, edges.first.overlap); 172 | result.AddEdgeNodeId(edges.second.fromId, edges.second.toId, edges.second.overlap); 173 | } 174 | }; 175 | stream::for_each(graphfile, lambda); 176 | } 177 | result.Finalize(64); 178 | return result; 179 | } 180 | 181 | AlignmentGraph DirectedGraph::BuildFromVG(const vg::Graph& graph) 182 | { 183 | AlignmentGraph result; 184 | std::vector breakpointsFw; 185 | std::vector breakpointsBw; 186 | breakpointsFw.push_back(0); 187 | breakpointsBw.push_back(0); 188 | for (int i = 0; i < graph.node_size(); i++) 189 | { 190 | for (size_t j = 0; j < graph.node(i).sequence().size(); j++) 191 | { 192 | if (!allowed[graph.node(i).sequence()[j]]) 193 | { 194 | throw CommonUtils::InvalidGraphException("Invalid sequence character: " + 
graph.node(i).sequence()[j]); 195 | } 196 | } 197 | auto nodes = ConvertVGNodeToNodes(graph.node(i)); 198 | breakpointsFw.push_back(graph.node(i).sequence().size()); 199 | breakpointsBw.push_back(graph.node(i).sequence().size()); 200 | result.AddNode(nodes.first.nodeId, nodes.first.sequence, nodes.first.name, !nodes.first.rightEnd, breakpointsFw); 201 | result.AddNode(nodes.second.nodeId, nodes.second.sequence, nodes.second.name, !nodes.second.rightEnd, breakpointsBw); 202 | breakpointsFw.erase(breakpointsFw.begin()+1, breakpointsFw.end()); 203 | breakpointsBw.erase(breakpointsBw.begin()+1, breakpointsBw.end()); 204 | } 205 | for (int i = 0; i < graph.edge_size(); i++) 206 | { 207 | auto edges = ConvertVGEdgeToEdges(graph.edge(i)); 208 | result.AddEdgeNodeId(edges.first.fromId, edges.first.toId, edges.first.overlap); 209 | result.AddEdgeNodeId(edges.second.fromId, edges.second.toId, edges.second.overlap); 210 | } 211 | result.Finalize(64); 212 | return result; 213 | } 214 | 215 | AlignmentGraph DirectedGraph::BuildFromGFA(const GfaGraph& graph) 216 | { 217 | AlignmentGraph result; 218 | result.DBGoverlap = graph.edgeOverlap; 219 | std::unordered_map> breakpoints; 220 | for (auto pair : graph.varyingOverlaps) 221 | { 222 | int to = pair.first.second.id * 2; 223 | if (!pair.first.second.end) to += 1; 224 | int from = pair.first.first.Reverse().id * 2; 225 | if (!pair.first.first.Reverse().end) from += 1; 226 | breakpoints[from].push_back(pair.second); 227 | breakpoints[to].push_back(pair.second); 228 | } 229 | for (auto node : graph.nodes) 230 | { 231 | for (size_t j = 0; j < node.second.size(); j++) 232 | { 233 | if (!allowed[node.second[j]]) 234 | { 235 | throw CommonUtils::InvalidGraphException("Invalid sequence character: " + node.second[j]); 236 | } 237 | } 238 | std::string name = graph.OriginalNodeName(node.first); 239 | auto nodes = ConvertGFANodeToNodes(node.first, node.second, name); 240 | std::vector breakpointsFw = breakpoints[node.first * 2]; 241 | 
std::vector breakpointsBw = breakpoints[node.first * 2 + 1]; 242 | breakpointsFw.push_back(0); 243 | breakpointsFw.push_back(node.second.size()); 244 | breakpointsBw.push_back(0); 245 | breakpointsBw.push_back(node.second.size()); 246 | std::sort(breakpointsFw.begin(), breakpointsFw.end()); 247 | std::sort(breakpointsBw.begin(), breakpointsBw.end()); 248 | result.AddNode(nodes.first.nodeId, nodes.first.sequence, nodes.first.name, !nodes.first.rightEnd, breakpointsFw); 249 | result.AddNode(nodes.second.nodeId, nodes.second.sequence, nodes.second.name, !nodes.second.rightEnd, breakpointsBw); 250 | } 251 | for (auto edge : graph.edges) 252 | { 253 | for (auto target : edge.second) 254 | { 255 | auto overlap = graph.edgeOverlap; 256 | if (graph.varyingOverlaps.count(std::make_pair(edge.first, target)) == 1) 257 | { 258 | overlap = graph.varyingOverlaps.at(std::make_pair(edge.first, target)); 259 | } 260 | auto pair = ConvertGFAEdgeToEdges(edge.first.id, edge.first.end ? "+" : "-", target.id, target.end ? 
"+" : "-", overlap); 261 | result.AddEdgeNodeId(pair.first.fromId, pair.first.toId, pair.first.overlap); 262 | result.AddEdgeNodeId(pair.second.fromId, pair.second.toId, pair.second.overlap); 263 | } 264 | } 265 | result.Finalize(64); 266 | return result; 267 | } 268 | -------------------------------------------------------------------------------- /src/BigraphToDigraph.h: -------------------------------------------------------------------------------- 1 | #ifndef BigraphToDigraph_H 2 | #define BigraphToDigraph_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "AlignmentGraph.h" 8 | #include "vg.pb.h" 9 | #include "GfaGraph.h" 10 | 11 | class DirectedGraph 12 | { 13 | public: 14 | struct Node 15 | { 16 | Node(int nodeId, int originalNodeId, bool rightEnd, std::string sequence, std::string name); 17 | int nodeId; 18 | int originalNodeId; 19 | bool rightEnd; 20 | std::string sequence; 21 | std::string name; 22 | }; 23 | struct Edge 24 | { 25 | Edge(size_t from, size_t to, size_t overlap); 26 | size_t fromId; 27 | size_t toId; 28 | size_t overlap; 29 | }; 30 | static std::pair ConvertVGNodeToNodes(const vg::Node& node); 31 | static std::pair ConvertVGEdgeToEdges(const vg::Edge& edge); 32 | static std::pair ConvertGFANodeToNodes(int id, const std::string& seq, const std::string& name); 33 | static std::pair ConvertGFAEdgeToEdges(int from, const std::string& fromStart, int to, const std::string& toEnd, size_t overlap); 34 | static AlignmentGraph BuildFromVG(const vg::Graph& graph); 35 | static AlignmentGraph BuildFromGFA(const GfaGraph& graph); 36 | static AlignmentGraph StreamVGGraphFromFile(std::string filename); 37 | private: 38 | }; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /src/BruteForceExactPrefixSeeds.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "vg.pb.h" 4 | #include "stream.hpp" 5 | #include "GfaGraph.h" 6 | 
#include "fastqloader.h" 7 | 8 | void bruteForceAddPaths(const GfaGraph& graph, std::vector>>& result, int node, size_t offset, size_t label, size_t length, size_t k) 9 | { 10 | label <<= 2; 11 | switch(graph.nodes.at(node)[offset]) 12 | { 13 | case 'A': 14 | case 'a': 15 | label += 0; 16 | break; 17 | case 'C': 18 | case 'c': 19 | label += 1; 20 | break; 21 | case 'G': 22 | case 'g': 23 | label += 2; 24 | break; 25 | case 'T': 26 | case 't': 27 | label += 3; 28 | break; 29 | default: 30 | assert(false); 31 | } 32 | if (length == k - 1) 33 | { 34 | assert(label < result.size()); 35 | result[label].emplace_back(node, offset); 36 | return; 37 | } 38 | if (offset < graph.nodes.at(node).size() - 1) 39 | { 40 | bruteForceAddPaths(graph, result, node, offset+1, label, length+1, k); 41 | } 42 | else 43 | { 44 | if (graph.edges.count(NodePos{node, true}) == 1) 45 | { 46 | for (auto edge : graph.edges.at(NodePos{node, true})) 47 | { 48 | bruteForceAddPaths(graph, result, edge.id, graph.edgeOverlap, label, length+1, k); 49 | } 50 | } 51 | } 52 | } 53 | 54 | std::vector>> buildBruteForcePathIndex(const GfaGraph& graph, const int k) 55 | { 56 | std::vector>> result; 57 | result.resize(pow(4, k)); 58 | for (auto node : graph.nodes) 59 | { 60 | for (size_t i = 0; i < node.second.size(); i++) 61 | { 62 | bruteForceAddPaths(graph, result, node.first, i, 0, 0, k); 63 | } 64 | } 65 | return result; 66 | } 67 | 68 | int main(int argc, char** argv) 69 | { 70 | std::string graphFile { argv[1] }; 71 | std::string readFile { argv[2] }; 72 | int k = std::stoi(argv[3]); 73 | std::string outputSeedFile { argv[4] }; 74 | 75 | std::cerr << "load graph" << std::endl; 76 | auto graph = GfaGraph::LoadFromFile(graphFile); 77 | std::cerr << "build index" << std::endl; 78 | auto index = buildBruteForcePathIndex(graph, k); 79 | 80 | std::cerr << "load reads" << std::endl; 81 | auto reads = loadFastqFromFile(readFile); 82 | std::vector> readLabels; 83 | readLabels.reserve(reads.size()); 84 | 
std::vector seeds; 85 | size_t numSeeds = 0; 86 | std::cerr << "count seeds" << std::endl; 87 | for (auto read : reads) 88 | { 89 | if (read.sequence.size() < k) continue; 90 | size_t label = 0; 91 | for (int i = 0; i < k; i++) 92 | { 93 | label <<= 2; 94 | switch(read.sequence[i]) 95 | { 96 | case 'A': 97 | case 'a': 98 | label += 0; 99 | break; 100 | case 'C': 101 | case 'c': 102 | label += 1; 103 | break; 104 | case 'T': 105 | case 't': 106 | label += 2; 107 | break; 108 | case 'G': 109 | case 'g': 110 | label += 3; 111 | break; 112 | default: 113 | break; 114 | } 115 | } 116 | readLabels.emplace_back(read.seq_id, label); 117 | numSeeds += index[label].size(); 118 | } 119 | std::cerr << numSeeds << " seeds" << std::endl; 120 | seeds.reserve(numSeeds); 121 | std::cerr << "get seeds" << std::endl; 122 | for (auto pair : readLabels) 123 | { 124 | for (auto pos : index[pair.second]) 125 | { 126 | vg::Alignment seed; 127 | seed.set_name(pair.first); 128 | auto mapping = seed.mutable_path()->add_mapping(); 129 | auto edit = mapping->add_edit(); 130 | edit->set_from_length(k); 131 | edit->set_to_length(k); 132 | mapping->mutable_position()->set_node_id(pos.first); 133 | mapping->mutable_position()->set_offset(pos.second); 134 | seeds.push_back(seed); 135 | seed.set_query_position(k-1); 136 | } 137 | } 138 | 139 | std::cerr << "write seeds" << std::endl; 140 | std::ofstream outFile { outputSeedFile, std::ios::binary }; 141 | stream::write_buffered(outFile, seeds, 0); 142 | } -------------------------------------------------------------------------------- /src/CommonUtils.cpp: -------------------------------------------------------------------------------- 1 | #include "CommonUtils.h" 2 | #include "stream.hpp" 3 | 4 | namespace CommonUtils 5 | { 6 | InvalidGraphException::InvalidGraphException(const char* c) : std::runtime_error(c) 7 | { 8 | } 9 | 10 | InvalidGraphException::InvalidGraphException(std::string c) : std::runtime_error(c) 11 | { 12 | } 13 | 14 | void 
mergeGraphs(vg::Graph& graph, const vg::Graph& part) 15 | { 16 | for (int i = 0; i < part.node_size(); i++) 17 | { 18 | auto node = graph.add_node(); 19 | node->set_id(part.node(i).id()); 20 | node->set_sequence(part.node(i).sequence()); 21 | node->set_name(part.node(i).name()); 22 | } 23 | for (int i = 0; i < part.edge_size(); i++) 24 | { 25 | auto edge = graph.add_edge(); 26 | edge->set_from(part.edge(i).from()); 27 | edge->set_to(part.edge(i).to()); 28 | edge->set_from_start(part.edge(i).from_start()); 29 | edge->set_to_end(part.edge(i).to_end()); 30 | edge->set_overlap(part.edge(i).overlap()); 31 | } 32 | } 33 | 34 | vg::Graph LoadVGGraph(std::string filename) 35 | { 36 | vg::Graph result; 37 | std::ifstream graphfile { filename, std::ios::in | std::ios::binary }; 38 | std::function lambda = [&result](vg::Graph& g) { 39 | mergeGraphs(result, g); 40 | }; 41 | stream::for_each(graphfile, lambda); 42 | return result; 43 | } 44 | 45 | std::vector LoadVGAlignments(std::string filename) 46 | { 47 | std::vector result; 48 | std::ifstream graphfile { filename, std::ios::in | std::ios::binary }; 49 | std::function lambda = [&result](vg::Alignment& g) { 50 | result.push_back(g); 51 | }; 52 | stream::for_each(graphfile, lambda); 53 | return result; 54 | } 55 | 56 | vg::Alignment LoadVGAlignment(std::string filename) 57 | { 58 | vg::Alignment result; 59 | std::ifstream graphfile { filename, std::ios::in | std::ios::binary }; 60 | std::function lambda = [&result](vg::Alignment& g) { 61 | result = g; 62 | }; 63 | stream::for_each(graphfile, lambda); 64 | return result; 65 | } 66 | 67 | std::string ReverseComplement(std::string str) 68 | { 69 | std::string result; 70 | result.reserve(str.size()); 71 | for (int i = str.size()-1; i >= 0; i--) 72 | { 73 | result += Complement(str[i]); 74 | } 75 | return result; 76 | } 77 | 78 | char Complement(char c) 79 | { 80 | switch (c) 81 | { 82 | case 'A': 83 | case 'a': 84 | return 'T'; 85 | case 'C': 86 | case 'c': 87 | return 'G'; 88 | 
namespace CommonUtils
{
	// Returns the upper-case complement of a nucleotide or IUPAC ambiguity code
	// (case-insensitive input). 'U' is complemented to 'A'. Any other character
	// trips an assertion and, with assertions disabled, yields 'N'.
	char Complement(char c)
	{
		switch (c)
		{
			case 'A':
			case 'a':
				return 'T';
			case 'C':
			case 'c':
				return 'G';
			case 'T':
			case 't':
				return 'A';
			case 'G':
			case 'g':
				return 'C';
			case 'N':
			case 'n':
				return 'N';
			case 'U':
			case 'u':
				return 'A';
			case 'R':
			case 'r':
				return 'Y';
			case 'Y':
			case 'y':
				return 'R';
			case 'K':
			case 'k':
				return 'M';
			case 'M':
			case 'm':
				return 'K';
			case 'S':
			case 's':
				return 'S';
			case 'W':
			case 'w':
				return 'W';
			case 'B':
			case 'b':
				return 'V';
			case 'V':
			case 'v':
				return 'B';
			case 'D':
			case 'd':
				return 'H';
			case 'H':
			case 'h':
				return 'D';
			default:
				assert(false);
				return 'N';
		}
	}

	// Returns the reverse complement of str: the characters in reverse order,
	// each replaced by Complement(). The result is upper case.
	std::string ReverseComplement(std::string str)
	{
		std::string result;
		result.reserve(str.size());
		// reverse iterators instead of an int index, which narrows for long strings
		for (auto it = str.rbegin(); it != str.rend(); ++it)
		{
			result += Complement(*it);
		}
		return result;
	}

}
// Accumulates formatted output in memory and hands it to the target stream one
// line at a time: nothing reaches the target until flush() or the Flush marker.
// A default-constructed writer has no target and silently discards everything.
// The writer stores a pointer to the target stream and does not own it.
class BufferedWriter
{
public:
	// Marker type: streaming a FlushClass value into the writer calls flush().
	class FlushClass {};
	BufferedWriter();
	BufferedWriter(std::ostream& stream);
	// The internal std::stringstream cannot be copied, so copying was never
	// available; the deletion is now spelled out.
	BufferedWriter(const BufferedWriter& other) = delete;
	BufferedWriter(BufferedWriter&& other) = default;
	BufferedWriter& operator=(const BufferedWriter& other) = delete;
	BufferedWriter& operator=(BufferedWriter&& other) = default;
	// Appends obj to the pending line (no-op without a target stream).
	template <typename T>
	BufferedWriter& operator<<(T obj)
	{
		if (stream == nullptr) return *this;
		stringstream << obj;
		return *this;
	}
	BufferedWriter& operator<<(FlushClass f);
	// Writes the pending text followed by a newline to the target stream and
	// empties the buffer. The target stream itself is not flushed.
	void flush();
	// True when the writer has no target stream.
	bool inputDiscarded() const;
	static FlushClass Flush;
private:
	std::ostream* stream;
	std::stringstream stringstream;
};

BufferedWriter::BufferedWriter() : stream(nullptr) {}
BufferedWriter::BufferedWriter(std::ostream& stream) : stream(&stream) {}
BufferedWriter& BufferedWriter::operator<<(FlushClass)
{
	// flush() already ignores a writer without a target
	flush();
	return *this;
}
void BufferedWriter::flush()
{
	if (stream == nullptr) return;
	stringstream << '\n';
	(*stream) << stringstream.str();
	stringstream.str("");
	// also reset any error state so later writes are not dropped
	stringstream.clear();
}
bool BufferedWriter::inputDiscarded() const
{
	return stream == nullptr;
}
maxNode) : 23 | activeQueues(), 24 | active(), 25 | extras() 26 | { 27 | initialize(maxNode); 28 | } 29 | ComponentPriorityQueue() : 30 | activeQueues(), 31 | active(), 32 | extras() 33 | { 34 | } 35 | template 36 | typename std::enable_if::type initialize(size_t maxNode) 37 | { 38 | active.resize(maxNode, false); 39 | } 40 | template 41 | typename std::enable_if::type initialize(size_t maxNode) 42 | { 43 | extras.resize(maxNode); 44 | active.resize(maxNode, false); 45 | } 46 | #ifdef NDEBUG 47 | __attribute__((always_inline)) 48 | #endif 49 | T& top() 50 | { 51 | assert(activeQueues.size() > 0); 52 | auto index = activeQueues.top().index; 53 | assert(active[index]); 54 | assert(extras[index].size() > 0); 55 | return extras[index][0]; 56 | } 57 | #ifdef NDEBUG 58 | __attribute__((always_inline)) 59 | #endif 60 | void pop() 61 | { 62 | assert(activeQueues.size() > 0); 63 | size_t index = activeQueues.top().index; 64 | assert(active[index]); 65 | assert(extras[index].size() > 0); 66 | extras[index].clear(); 67 | active[index] = false; 68 | activeQueues.pop(); 69 | } 70 | #ifdef NDEBUG 71 | __attribute__((always_inline)) 72 | #endif 73 | size_t size() const 74 | { 75 | return activeQueues.size(); 76 | } 77 | void insert(size_t component, const T& item) 78 | { 79 | assert(false); 80 | } 81 | #ifdef NDEBUG 82 | __attribute__((always_inline)) 83 | #endif 84 | void insert(size_t component, int score, const T& item) 85 | { 86 | size_t index = getId(item); 87 | assert(SparseStorage || index < extras.size()); 88 | if (!active[index]) 89 | { 90 | assert(extras[index].size() == 0); 91 | activeQueues.emplace(component, score, index); 92 | active[index] = true; 93 | } 94 | extras[index].push_back(item); 95 | } 96 | void clear() 97 | { 98 | while (activeQueues.size() > 0) 99 | { 100 | size_t index = activeQueues.top().index; 101 | assert(active[index]); 102 | removeExtras(index); 103 | active[index] = false; 104 | activeQueues.pop(); 105 | } 106 | sparsify(); 107 | } 108 | 
template 109 | typename std::enable_if::type sparsify() 110 | { 111 | decltype(extras) empty; 112 | std::swap(extras, empty); 113 | } 114 | template 115 | typename std::enable_if::type sparsify() 116 | { 117 | } 118 | const std::vector& getExtras(size_t index) const 119 | { 120 | assert(SparseStorage ||index < extras.size()); 121 | return getVec(extras, index); 122 | } 123 | void removeExtras(size_t index) 124 | { 125 | assert(SparseStorage ||index < extras.size()); 126 | extras[index].clear(); 127 | } 128 | size_t extraSize(size_t index) const 129 | { 130 | assert(SparseStorage ||index < extras.size()); 131 | return getVec(extras, index).size(); 132 | } 133 | bool valid() const 134 | { 135 | return active.size() > 0; 136 | } 137 | private: 138 | const std::vector& getVec(const std::vector>& list, size_t index) const 139 | { 140 | return list[index]; 141 | } 142 | const std::vector& getVec(const phmap::flat_hash_map>& list, size_t index) const 143 | { 144 | static std::vector empty; 145 | auto found = list.find(index); 146 | if (found == list.end()) return empty; 147 | return found->second; 148 | } 149 | size_t getId(const T& item) const 150 | { 151 | return item.target; 152 | } 153 | std::priority_queue, std::greater> activeQueues; 154 | std::vector active; 155 | typename std::conditional>, std::vector>>::type extras; 156 | }; 157 | 158 | #endif 159 | -------------------------------------------------------------------------------- /src/DijkstraQueue.h: -------------------------------------------------------------------------------- 1 | #ifndef DijkstraQueue_h 2 | #define DijkstraQueue_h 3 | 4 | #include 5 | #include 6 | #include "ThreadReadAssertion.h" 7 | 8 | namespace std 9 | { 10 | template<> 11 | struct hash> 12 | { 13 | size_t operator()(const std::pair pair) const 14 | { 15 | return std::hash{}(pair.first) ^ std::hash{}(pair.second); 16 | } 17 | }; 18 | } 19 | 20 | template 21 | class DijkstraPriorityQueue 22 | { 23 | public: 24 | constexpr bool 
IsComponentPriorityQueue() { return false; } 25 | DijkstraPriorityQueue() : 26 | activeQueues(), 27 | extras(), 28 | queues(), 29 | numItems(0), 30 | zeroScore(0) 31 | { 32 | initialize(129); 33 | } 34 | void initialize(size_t maxPriority) 35 | { 36 | queues.resize(maxPriority); 37 | } 38 | #ifdef NDEBUG 39 | __attribute__((always_inline)) 40 | #endif 41 | T& top() 42 | { 43 | assert(activeQueues.size() > 0); 44 | size_t queue = activeQueues.top(); 45 | assert(queues[queue].size() > 0); 46 | return queues[queue].back(); 47 | } 48 | #ifdef NDEBUG 49 | __attribute__((always_inline)) 50 | #endif 51 | void pop() 52 | { 53 | assert(numItems > 0); 54 | assert(activeQueues.size() > 0); 55 | size_t queue = activeQueues.top(); 56 | assert(queue < queues.size()); 57 | assert(queues[queue].size() > 0); 58 | queues[queue].pop_back(); 59 | if (queues[queue].size() == 0) activeQueues.pop(); 60 | numItems--; 61 | } 62 | #ifdef NDEBUG 63 | __attribute__((always_inline)) 64 | #endif 65 | size_t size() const 66 | { 67 | return numItems; 68 | } 69 | void insert(size_t component, int score, const T& item) 70 | { 71 | assert(false); 72 | } 73 | #ifdef NDEBUG 74 | __attribute__((always_inline)) 75 | #endif 76 | void insert(size_t priority, const T& item) 77 | { 78 | assert(priority >= zeroScore); 79 | priority -= zeroScore; 80 | assert(priority < queues.size()); 81 | queues[priority].push_back(item); 82 | extras[getId(item)].push_back(item); 83 | if (queues[priority].size() == 1) activeQueues.emplace(priority); 84 | numItems++; 85 | } 86 | void clear() 87 | { 88 | for (size_t i = 0; i < queues.size(); i++) 89 | { 90 | queues[i].clear(); 91 | } 92 | typename std::remove_reference::type tmp; 93 | std::swap(tmp, activeQueues); 94 | typename std::remove_reference::type tmp2; 95 | std::swap(tmp2, extras); 96 | numItems = 0; 97 | zeroScore = 0; 98 | sparsify(); 99 | } 100 | void increaseScore(size_t increase) 101 | { 102 | assert(increase > 0); 103 | assert(increase < queues.size()); 104 | 
assert(activeQueues.size() == 0 || activeQueues.top() >= increase); 105 | typename std::remove_reference::type tmp; 106 | std::swap(tmp, activeQueues); 107 | for (size_t i = 0; i < queues.size() - increase; i++) 108 | { 109 | assert(queues[i].size() == 0); 110 | std::swap(queues[i], queues[i + increase]); 111 | if (queues[i].size() > 0) activeQueues.emplace(i); 112 | } 113 | for (size_t i = queues.size() - increase; i < queues.size(); i++) 114 | { 115 | assert(queues[i].size() == 0); 116 | } 117 | zeroScore += increase; 118 | } 119 | void sparsify() 120 | { 121 | decltype(extras) empty; 122 | std::swap(extras, empty); 123 | } 124 | const std::vector& getExtras(size_t slice, size_t index) 125 | { 126 | return getExtras(std::make_pair(slice, index)); 127 | } 128 | const std::vector& getExtras(std::pair index) 129 | { 130 | return getVec(index); 131 | } 132 | void removeExtras(size_t slice, size_t index) 133 | { 134 | removeExtras(std::make_pair(slice, index)); 135 | } 136 | void removeExtras(std::pair index) 137 | { 138 | extras[index].clear(); 139 | } 140 | size_t extraSize(size_t slice, size_t index) const 141 | { 142 | return extraSize(std::make_pair(slice, index)); 143 | } 144 | size_t extraSize(std::pair index) const 145 | { 146 | return getVec(index).size(); 147 | } 148 | size_t zero() const 149 | { 150 | return zeroScore; 151 | } 152 | private: 153 | const std::vector& getVec(std::pair index) const 154 | { 155 | static std::vector empty; 156 | auto found = extras.find(index); 157 | if (found == extras.end()) return empty; 158 | return found->second; 159 | } 160 | std::pair getId(const T& item) const 161 | { 162 | return std::make_pair(item.slice, item.target); 163 | } 164 | std::priority_queue, std::greater> activeQueues; 165 | phmap::flat_hash_map, std::vector> extras; 166 | std::vector> queues; 167 | size_t numItems; 168 | size_t zeroScore; 169 | }; 170 | 171 | #endif 172 | -------------------------------------------------------------------------------- 
/src/EValue.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "ThreadReadAssertion.h" 4 | #include "EValue.h" 5 | 6 | // model the alignment as one sequence with the alphabet {match, mismatch} 7 | // with random alignments having P(match) = P(mismatch) = 0.5 8 | // and match having score +1, mismatch score <0 chosen such that an alignment at minIdentity has score 0 9 | // then use Karlin-Altschul equation to calculate E 10 | // get lambda and K numerically depending no the match & mismatch score 11 | 12 | // ...except the bitvector algorithm doesn't give the number of matches and mismatches 13 | // so approximate the alignment score as (length * matchScore + numEdits * (mismatchScore-matchScore)) 14 | // this is close enough hopefully 15 | 16 | constexpr double e = 2.71828182845904523536028747135266249775724709369995; 17 | 18 | EValueCalculator::EValueCalculator() : 19 | matchScore(-1), 20 | mismatchScore(-1), 21 | lambda(-1), 22 | K(-1) 23 | { 24 | } 25 | 26 | EValueCalculator::EValueCalculator(double minIdentity) : 27 | matchScore(1), 28 | mismatchScore(-minIdentity / (1.0 - minIdentity)), 29 | lambda(-1), 30 | K(-1) 31 | { 32 | initializeLambda(); 33 | initializeK(); 34 | } 35 | 36 | double EValueCalculator::getEValue(size_t databaseSize, size_t querySize, double alignmentScore) const 37 | { 38 | return K * databaseSize * querySize * pow(e, -lambda * alignmentScore); 39 | } 40 | 41 | double EValueCalculator::getEValue(size_t databaseSize, size_t querySize, size_t alignmentLength, size_t numEdits) const 42 | { 43 | return getEValue(databaseSize, querySize, getAlignmentScore(alignmentLength, numEdits)); 44 | } 45 | 46 | double EValueCalculator::getAlignmentScore(size_t alignmentLength, size_t numEdits) const 47 | { 48 | return alignmentLength * matchScore - numEdits * (mismatchScore - matchScore); 49 | } 50 | 51 | void EValueCalculator::initializeLambda() 52 | { 53 | // lambda is bounded by 
0 < lambda < ln(2) < 0.7 54 | double guessMin = 0; 55 | double guessMax = 0.7; 56 | // bisect, max error 2^-100 57 | for (int i = 0; i < 100; i++) 58 | { 59 | double guessMid = (guessMin + guessMax) * 0.5; 60 | double valueMid = pow(e, guessMid*matchScore) * .5 + pow(e, guessMid*mismatchScore) * 0.5 - 1; 61 | if (valueMid < 0) guessMin = guessMid; 62 | if (valueMid > 0) guessMax = guessMid; 63 | // due to floating point precision limits 64 | if (valueMid == 0) 65 | { 66 | guessMin = guessMid; 67 | guessMax = guessMid; 68 | break; 69 | } 70 | if (guessMin == guessMax) break; 71 | } 72 | lambda = (guessMin + guessMax) / 2; 73 | } 74 | 75 | void EValueCalculator::initializeK() 76 | { 77 | assert(lambda != -1); 78 | double seriesSum = 0; 79 | std::vector pascalsTriangle; 80 | pascalsTriangle.push_back(1); 81 | for (int k = 1; k < 10; k++) 82 | { 83 | std::vector newTriangle; 84 | newTriangle.resize(pascalsTriangle.size()+1, 0); 85 | for (size_t j = 0; j < pascalsTriangle.size(); j++) 86 | { 87 | newTriangle[j] += pascalsTriangle[j]; 88 | newTriangle[j+1] += pascalsTriangle[j]; 89 | } 90 | pascalsTriangle = newTriangle; 91 | assert(pascalsTriangle[0] == 1); 92 | assert(pascalsTriangle.back() == 1); 93 | assert(pascalsTriangle.size() == k+1); 94 | size_t triangleSum = 0; 95 | for (auto n : pascalsTriangle) triangleSum += n; 96 | double negativeExpectation = 0; 97 | double greaterProbability = 0; 98 | for (size_t j = 0; j < pascalsTriangle.size(); j++) 99 | { 100 | size_t matches = j; 101 | size_t mismatches = pascalsTriangle.size() - 1 - j; 102 | double score = (double)matches * matchScore + (double)mismatches * mismatchScore; 103 | double probability = (double)pascalsTriangle[j] / (double)triangleSum; 104 | if (score < 0) negativeExpectation += pow(e, lambda * score) * probability; 105 | if (score >= 0) greaterProbability += probability; 106 | } 107 | seriesSum += (negativeExpectation + greaterProbability) / (double)k; 108 | } 109 | double expectation = .5 * matchScore 
* pow(e, lambda * matchScore) + .5 * mismatchScore * pow(e, lambda * mismatchScore); 110 | double Cstar = pow(e, -2 * seriesSum) / (lambda * expectation); 111 | // assume delta is 1 even though its not really 112 | K = Cstar * lambda / (1.0 - pow(e, -lambda)); 113 | } 114 | -------------------------------------------------------------------------------- /src/EValue.h: -------------------------------------------------------------------------------- 1 | #ifndef EValue_h 2 | #define EValue_h 3 | 4 | class EValueCalculator 5 | { 6 | public: 7 | EValueCalculator(); 8 | EValueCalculator(double minIdentity); 9 | double getAlignmentScore(size_t alignmentLength, size_t numEdits) const; 10 | double getEValue(size_t databaseSize, size_t querySize, size_t alignmentLength, size_t numEdits) const; 11 | double getEValue(size_t databaseSize, size_t querySize, double alignmentScore) const; 12 | private: 13 | void initializeLambda(); 14 | void initializeK(); 15 | double matchScore; 16 | double mismatchScore; 17 | double lambda; 18 | double K; 19 | }; 20 | 21 | #endif -------------------------------------------------------------------------------- /src/EstimateRepeatCount.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "CommonUtils.h" 8 | #include "GfaGraph.h" 9 | 10 | int main(int argc, char** argv) 11 | { 12 | std::string ingraphfilename {argv[1]}; 13 | std::string inalignmentfilename {argv[2]}; 14 | std::string outfilename {argv[3]}; 15 | 16 | std::cerr << "load graph" << std::endl; 17 | auto graph = GfaGraph::LoadFromFile(ingraphfilename); 18 | std::cerr << "process graph" << std::endl; 19 | std::unordered_map> baseCounts; 20 | std::unordered_map> outNeighbors; 21 | std::unordered_map> leftInneighbors; 22 | std::unordered_map> rightInneighbors; 23 | std::unordered_map counts; 24 | { 25 | std::unordered_map> edges; 26 | for (auto edge : graph.edges) 27 | 
{ 28 | for (auto target : edge.second) 29 | { 30 | edges[edge.first].insert(target); 31 | edges[NodePos{target.id, !target.end}].insert(NodePos{edge.first.id, !edge.first.end}); 32 | } 33 | } 34 | for (auto node : graph.nodes) 35 | { 36 | NodePos end {node.first, true}; 37 | if (edges.count(end) == 1 && edges[end].size() == 1) 38 | { 39 | outNeighbors[node.first].push_back(edges[end].begin()->id); 40 | if (edges[end].begin()->end) 41 | { 42 | rightInneighbors[edges[end].begin()->id].push_back(node.first); 43 | } 44 | else 45 | { 46 | leftInneighbors[edges[end].begin()->id].push_back(node.first); 47 | } 48 | } 49 | NodePos start {node.first, false}; 50 | if (edges.count(start) == 1 && edges[start].size() == 1) 51 | { 52 | outNeighbors[node.first].push_back(edges[start].begin()->id); 53 | if (edges[start].begin()->end) 54 | { 55 | rightInneighbors[edges[start].begin()->id].push_back(node.first); 56 | } 57 | else 58 | { 59 | leftInneighbors[edges[start].begin()->id].push_back(node.first); 60 | } 61 | } 62 | if (edges.count(start) == 1) counts[node.first] = std::max(counts[node.first], edges[start].size()); 63 | if (edges.count(end) == 1) counts[node.first] = std::max(counts[node.first], edges[end].size()); 64 | } 65 | } 66 | 67 | std::cerr << "load alignment" << std::endl; 68 | auto alignments = CommonUtils::LoadVGAlignments(inalignmentfilename); 69 | for (auto aln : alignments) 70 | { 71 | for (size_t i = 0; i < aln.path().mapping_size(); i++) 72 | { 73 | baseCounts[aln.path().mapping(i).position().node_id()][aln.name()] += 1; 74 | } 75 | } 76 | std::cerr << "init counts" << std::endl; 77 | for (auto pair : baseCounts) 78 | { 79 | for (auto count : pair.second) 80 | { 81 | counts[pair.first] = std::max(counts[pair.first], count.second); 82 | } 83 | } 84 | 85 | std::cerr << "iterate" << std::endl; 86 | std::vector updateQueue; 87 | updateQueue.reserve(graph.nodes.size()); 88 | for (const auto& node : graph.nodes) 89 | { 90 | updateQueue.push_back(node.first); 91 | } 
92 | std::cerr << "numnodes " << updateQueue.size() << std::endl; 93 | size_t iterated = 0; 94 | int maxcount = 0; 95 | while (updateQueue.size() > 0) 96 | { 97 | auto node = updateQueue.back(); 98 | updateQueue.pop_back(); 99 | iterated++; 100 | if (iterated % 1000000 == 0) std::cerr << "iterated " << iterated << std::endl; 101 | int leftCountShouldBe = 0; 102 | if (leftInneighbors.count(node) == 1) 103 | { 104 | for (auto neighbor : leftInneighbors.at(node)) 105 | { 106 | leftCountShouldBe += counts[neighbor]; 107 | } 108 | } 109 | int rightCountShouldBe = 0; 110 | if (rightInneighbors.count(node) == 1) 111 | { 112 | for (auto neighbor : rightInneighbors.at(node)) 113 | { 114 | rightCountShouldBe += counts[neighbor]; 115 | } 116 | } 117 | if (counts[node] >= leftCountShouldBe && counts[node] >= rightCountShouldBe) continue; 118 | counts[node] = std::max(leftCountShouldBe, rightCountShouldBe); 119 | if (counts[node] > maxcount) 120 | { 121 | maxcount = counts[node]; 122 | std::cerr << "node " << node << " iter " << iterated << " maxcount " << maxcount << std::endl; 123 | } 124 | if (outNeighbors.count(node) == 1) 125 | { 126 | for (auto neighbor : outNeighbors.at(node)) 127 | { 128 | updateQueue.push_back(neighbor); 129 | } 130 | } 131 | } 132 | std::cerr << "iteration done with " << iterated << std::endl; 133 | 134 | std::cerr << "write result" << std::endl; 135 | std::ofstream out {outfilename}; 136 | out << "node,_minalntoporepeatcount"; 137 | out << std::endl; 138 | std::vector nodevec; 139 | for (const auto& node : graph.nodes) 140 | { 141 | nodevec.push_back(node.first); 142 | } 143 | std::sort(nodevec.begin(), nodevec.end()); 144 | for (auto node : nodevec) 145 | { 146 | out << node; 147 | out << "," << counts[node]; 148 | out << std::endl; 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/ExtractCorrectedReads.cpp: -------------------------------------------------------------------------------- 1 | 
#include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "GfaGraph.h" 8 | #include "vg.pb.h" 9 | #include "stream.hpp" 10 | #include "CommonUtils.h" 11 | #include "fastqloader.h" 12 | #include "ReadCorrection.h" 13 | 14 | std::string toLower(std::string seq); 15 | 16 | void addPartial(const std::unordered_map& ids, std::unordered_map>& partials, std::function seqGetter, const vg::Alignment& v) 17 | { 18 | Correction result; 19 | result.startIndex = v.query_position(); 20 | result.endIndex = v.query_position() + v.sequence().size(); 21 | result.corrected = ""; 22 | for (int i = 0; i < v.path().mapping_size(); i++) 23 | { 24 | auto nodeid = v.path().mapping(i).position().node_id(); 25 | auto sequence = seqGetter(ids.at(nodeid)); 26 | int len = 0; 27 | for (int j = 0; j < v.path().mapping(i).edit_size(); j++) 28 | { 29 | len += v.path().mapping(i).edit(j).from_length(); 30 | } 31 | if (v.path().mapping(i).position().is_reverse()) 32 | { 33 | sequence = CommonUtils::ReverseComplement(sequence); 34 | } 35 | if (v.path().mapping(i).position().offset() > 0) 36 | { 37 | sequence = sequence.substr(v.path().mapping(i).position().offset()); 38 | } 39 | sequence = sequence.substr(0, len); 40 | result.corrected += sequence; 41 | } 42 | partials[v.name()].push_back(result); 43 | } 44 | 45 | void addPartial(const vg::Graph& g, const std::unordered_map& ids, const vg::Alignment& v, std::unordered_map>& partials) 46 | { 47 | addPartial(ids, partials, [&g](int id) {return g.node(id).sequence();}, v); 48 | } 49 | 50 | void addPartial(const GfaGraph& g, const std::unordered_map& ids, const vg::Alignment& v, std::unordered_map>& partials) 51 | { 52 | addPartial(ids, partials, [&g](int id) {return g.nodes.at(id);}, v); 53 | } 54 | 55 | void mergePartials(const std::unordered_map>& partials, const std::vector& reads, size_t maxOverlap) 56 | { 57 | for (auto read : reads) 58 | { 59 | if (partials.count(read.seq_id) == 0) 60 | { 61 | std::cout << ">" << 
read.seq_id << std::endl << toLower(read.sequence) << std::endl; 62 | continue; 63 | } 64 | auto p = partials.at(read.seq_id); 65 | std::sort(p.begin(), p.end(), [](const Correction& left, const Correction& right) { return left.startIndex < right.startIndex; }); 66 | auto corrected = getCorrected(read.sequence, p, maxOverlap); 67 | std::cout << ">" << read.seq_id << std::endl << corrected << std::endl; 68 | } 69 | } 70 | 71 | int main(int argc, char** argv) 72 | { 73 | std::string graphfilename {argv[1]}; 74 | std::string alnfilename { argv[2] }; 75 | std::string readfilename { argv[3] }; 76 | //output in stdout 77 | auto reads = loadFastqFromFile(readfilename); 78 | for (int i = 4; i < argc; i++) 79 | { 80 | auto extrareads = loadFastqFromFile(argv[i]); 81 | reads.insert(reads.end(), extrareads.begin(), extrareads.end()); 82 | } 83 | 84 | size_t maxOverlap = 0; 85 | 86 | std::unordered_map> partials; 87 | if (graphfilename.substr(graphfilename.size()-3) == ".vg") 88 | { 89 | vg::Graph graph = CommonUtils::LoadVGGraph(argv[1]); 90 | std::unordered_map ids; 91 | for (int i = 0; i < graph.node_size(); i++) 92 | { 93 | ids[graph.node(i).id()] = i; 94 | } 95 | { 96 | std::ifstream alnfile { argv[2], std::ios::in | std::ios::binary }; 97 | std::function lambda = [&graph, &ids, &partials](vg::Alignment& aln) { 98 | addPartial(graph, ids, aln, partials); 99 | }; 100 | stream::for_each(alnfile, lambda); 101 | } 102 | } 103 | else if (graphfilename.substr(graphfilename.size() - 4) == ".gfa") 104 | { 105 | GfaGraph graph = GfaGraph::LoadFromFile(argv[1]); 106 | maxOverlap = graph.edgeOverlap; 107 | std::unordered_map ids; 108 | for (auto node : graph.nodes) 109 | { 110 | ids[node.first] = node.first; 111 | } 112 | { 113 | std::ifstream alnfile { argv[2], std::ios::in | std::ios::binary }; 114 | std::function lambda = [&graph, &ids, &partials](vg::Alignment& aln) { 115 | addPartial(graph, ids, aln, partials); 116 | }; 117 | stream::for_each(alnfile, lambda); 118 | } 119 | } 
120 | 121 | 122 | mergePartials(partials, reads, maxOverlap); 123 | } -------------------------------------------------------------------------------- /src/ExtractExactPathSubgraph.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "GfaGraph.h" 5 | #include "CommonUtils.h" 6 | 7 | int main(int argc, char** argv) 8 | { 9 | std::string infile {argv[1]}; 10 | std::string outfile {argv[2]}; 11 | std::string alignmentfile {argv[3]}; 12 | auto alignments = CommonUtils::LoadVGAlignments(alignmentfile); 13 | auto graph = GfaGraph::LoadFromFile(infile); 14 | std::unordered_set pickedNodes; 15 | std::unordered_set> pickedEdges; 16 | for (const auto& alignment : alignments) 17 | { 18 | pickedNodes.insert(alignment.path().mapping(0).position().node_id()); 19 | for (int i = 1; i < alignment.path().mapping_size(); i++) 20 | { 21 | pickedNodes.insert(alignment.path().mapping(i).position().node_id()); 22 | NodePos from {alignment.path().mapping(i-1).position().node_id(), alignment.path().mapping(i-1).position().is_reverse()}; 23 | NodePos to {alignment.path().mapping(i).position().node_id(), alignment.path().mapping(i).position().is_reverse()}; 24 | pickedEdges.emplace(from, to); 25 | } 26 | } 27 | std::cerr << pickedNodes.size() << " nodes, ~" << pickedEdges.size() << " edges" << std::endl; 28 | auto result = graph.GetSubgraph(pickedNodes, pickedEdges); 29 | result.SaveToFile(outfile); 30 | } -------------------------------------------------------------------------------- /src/ExtractPathSequence.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "GfaGraph.h" 8 | #include "vg.pb.h" 9 | #include "stream.hpp" 10 | #include "CommonUtils.h" 11 | 12 | void printPath(const std::unordered_map& ids, std::function seqGetter, const vg::Alignment& v) 13 | { 14 | std::cout << ">" << 
v.name() << "_" << v.query_position() << "_" << (v.query_position() + v.sequence().size()) << std::endl; 15 | for (int i = 0; i < v.path().mapping_size(); i++) 16 | { 17 | auto nodeid = v.path().mapping(i).position().node_id(); 18 | auto sequence = seqGetter(ids.at(nodeid)); 19 | int len = 0; 20 | for (int j = 0; j < v.path().mapping(i).edit_size(); j++) 21 | { 22 | len += v.path().mapping(i).edit(j).from_length(); 23 | } 24 | if (v.path().mapping(i).position().is_reverse()) 25 | { 26 | sequence = CommonUtils::ReverseComplement(sequence); 27 | } 28 | if (v.path().mapping(i).position().offset() > 0) 29 | { 30 | sequence = sequence.substr(v.path().mapping(i).position().offset()); 31 | } 32 | sequence = sequence.substr(0, len); 33 | std::cout << sequence; 34 | } 35 | std::cout << std::endl; 36 | } 37 | 38 | void printPath(const vg::Graph& g, const std::unordered_map& ids, const vg::Alignment& v) 39 | { 40 | printPath(ids, [&g](int id) {return g.node(id).sequence();}, v); 41 | } 42 | 43 | void printPath(const GfaGraph& g, const std::unordered_map& ids, const vg::Alignment& v) 44 | { 45 | printPath(ids, [&g](int id) {return g.nodes.at(id);}, v); 46 | } 47 | 48 | int main(int argc, char** argv) 49 | { 50 | std::string graphfilename {argv[1]}; 51 | std::string alnfilename { argv[2] }; 52 | std::unordered_map ids; 53 | if (graphfilename.substr(graphfilename.size()-3) == ".vg") 54 | { 55 | vg::Graph graph = CommonUtils::LoadVGGraph(argv[1]); 56 | for (int i = 0; i < graph.node_size(); i++) 57 | { 58 | ids[graph.node(i).id()] = i; 59 | } 60 | { 61 | std::ifstream graphfile { argv[2], std::ios::in | std::ios::binary }; 62 | std::function lambda = [&graph, &ids](vg::Alignment& g) { 63 | std::cerr << g.name() << std::endl; 64 | printPath(graph, ids, g); 65 | }; 66 | stream::for_each(graphfile, lambda); 67 | } 68 | } 69 | else if (graphfilename.substr(graphfilename.size() - 4) == ".gfa") 70 | { 71 | GfaGraph graph = GfaGraph::LoadFromFile(argv[1]); 72 | for (auto node : 
graph.nodes) 73 | { 74 | ids[node.first] = node.first; 75 | } 76 | { 77 | std::ifstream graphfile { argv[2], std::ios::in | std::ios::binary }; 78 | std::function lambda = [&graph, &ids](vg::Alignment& g) { 79 | std::cerr << g.name() << std::endl; 80 | printPath(graph, ids, g); 81 | }; 82 | stream::for_each(graphfile, lambda); 83 | } 84 | } 85 | 86 | } -------------------------------------------------------------------------------- /src/ExtractPathSubgraphNeighbourhood.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "GfaGraph.h" 5 | #include "CommonUtils.h" 6 | 7 | class PriorityNode 8 | { 9 | public: 10 | PriorityNode(NodePos pos, int priority) : 11 | pos(pos), 12 | priority(priority) 13 | {} 14 | NodePos pos; 15 | int priority; 16 | bool operator>(const PriorityNode& other) const 17 | { 18 | return priority > other.priority; 19 | } 20 | }; 21 | 22 | int main(int argc, char** argv) 23 | { 24 | std::string infile {argv[1]}; 25 | std::string outfile {argv[2]}; 26 | std::string alignmentfile {argv[3]}; 27 | int length = std::stoi(argv[4]); 28 | std::cerr << "length: " << length << std::endl; 29 | auto alignments = CommonUtils::LoadVGAlignments(alignmentfile); 30 | auto graph = GfaGraph::LoadFromFile(infile); 31 | std::priority_queue, std::greater> queue; 32 | for (const auto& alignment : alignments) 33 | { 34 | for (const auto& pos : alignment.path().mapping()) 35 | { 36 | queue.emplace(NodePos {pos.position().node_id(), pos.position().is_reverse()}, 0); 37 | } 38 | } 39 | std::unordered_map distance; 40 | while (queue.size() != 0) 41 | { 42 | auto top = queue.top(); 43 | queue.pop(); 44 | if (top.priority > length) break; 45 | if (distance.count(top.pos) == 1 && distance[top.pos] <= top.priority) continue; 46 | distance[top.pos] = top.priority; 47 | if (graph.edges.count(top.pos) == 1) 48 | { 49 | for (auto edge : graph.edges.at(top.pos)) 50 | { 51 | 
assert(graph.nodes.at(top.pos.id).size() > graph.edgeOverlap); 52 | queue.emplace(edge, top.priority + graph.nodes.at(top.pos.id).size() - graph.edgeOverlap); 53 | } 54 | } 55 | } 56 | std::unordered_set picked; 57 | for (auto pair : distance) 58 | { 59 | picked.insert(pair.first.id); 60 | } 61 | std::cerr << picked.size() << std::endl; 62 | auto result = graph.GetSubgraph(picked); 63 | result.SaveToFile(outfile); 64 | } -------------------------------------------------------------------------------- /src/GfaGraph.h: -------------------------------------------------------------------------------- 1 | #ifndef GfaGraph_h 2 | #define GfaGraph_h 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | class NodePos 12 | { 13 | public: 14 | NodePos(); 15 | NodePos(int id, bool end); 16 | int id; 17 | bool end; 18 | NodePos Reverse() const; 19 | bool operator==(const NodePos& other) const; 20 | bool operator!=(const NodePos& other) const; 21 | }; 22 | 23 | namespace std 24 | { 25 | template <> 26 | struct hash 27 | { 28 | size_t operator()(const NodePos& x) const 29 | { 30 | return hash()(x.id) ^ hash()(x.end); 31 | } 32 | }; 33 | template <> 34 | struct hash> 35 | { 36 | size_t operator()(const std::pair& x) const 37 | { 38 | // simple hashing with hash()(x.first) ^ hash()(x.second) collides each edge formed like (x -> x+1) 39 | // instead: 40 | // https://stackoverflow.com/questions/682438/hash-function-providing-unique-uint-from-an-integer-coordinate-pair 41 | // https://en.wikipedia.org/wiki/Pairing_function#Cantor_pairing_function 42 | // and arbitrarily ignore directionality 43 | size_t pairing = .5 * (x.first.id + x.second.id) * (x.first.id + x.second.id + 1) + x.second.id; 44 | return hash()(pairing); 45 | } 46 | }; 47 | } 48 | 49 | class GfaGraph 50 | { 51 | public: 52 | GfaGraph(); 53 | static GfaGraph LoadFromFile(std::string filename, bool allowVaryingOverlaps=false, bool warnAboutMissingNodes=false); 54 | static GfaGraph 
LoadFromStream(std::istream& stream, bool allowVaryingOverlaps=false, bool warnAboutMissingNodes=false); 55 | void SaveToFile(std::string filename) const; 56 | void SaveToStream(std::ostream& stream) const; 57 | void AddSubgraph(const GfaGraph& subgraph); 58 | GfaGraph GetSubgraph(const std::unordered_set& ids) const; 59 | GfaGraph GetSubgraph(const std::unordered_set& nodes, const std::unordered_set>& edges) const; 60 | std::string OriginalNodeName(int nodeId) const; 61 | void confirmDoublesidedEdges(); 62 | std::unordered_map nodes; 63 | std::unordered_map> edges; 64 | std::unordered_map, size_t> varyingOverlaps; 65 | size_t edgeOverlap; 66 | std::unordered_map tags; 67 | std::unordered_map originalNodeName; 68 | private: 69 | void numberBackToIntegers(); 70 | }; 71 | 72 | #endif -------------------------------------------------------------------------------- /src/GraphAlignerGAFAlignment.h: -------------------------------------------------------------------------------- 1 | #ifndef GraphAlignerGAFAlignment_h 2 | #define GraphAlignerGAFAlignment_h 3 | 4 | #include 5 | #include 6 | #include "AlignmentGraph.h" 7 | #include "NodeSlice.h" 8 | #include "CommonUtils.h" 9 | #include "ThreadReadAssertion.h" 10 | #include "GraphAlignerCommon.h" 11 | 12 | template 13 | class GraphAlignerGAFAlignment 14 | { 15 | using Common = GraphAlignerCommon; 16 | using Params = typename Common::Params; 17 | using MatrixPosition = typename Common::MatrixPosition; 18 | using TraceItem = typename Common::TraceItem; 19 | struct MergedNodePos 20 | { 21 | int nodeId; 22 | bool reverse; 23 | size_t nodeOffset; 24 | size_t seqPos; 25 | }; 26 | enum EditType 27 | { 28 | Match, 29 | Mismatch, 30 | MatchOrMismatch, 31 | Insertion, 32 | Deletion, 33 | Empty 34 | }; 35 | public: 36 | 37 | static std::string traceToAlignment(const std::string& seq_id, const std::string& sequence, const GraphAlignerCommon::OnewayTrace& tracePair, const Params& params, bool cigarMatchMismatchMerge) 38 | { 39 | auto& 
trace = tracePair.trace; 40 | if (trace.size() == 0) return nullptr; 41 | std::stringstream cigar; 42 | std::string readName = seq_id; 43 | size_t readLen = sequence.size(); 44 | size_t readStart = trace[0].DPposition.seqPos; 45 | size_t readEnd = trace.back().DPposition.seqPos+1; 46 | bool strand = true; 47 | std::stringstream nodePath; 48 | size_t nodePathLen = 0; 49 | size_t nodePathStart = trace[0].DPposition.nodeOffset; 50 | size_t nodePathEnd = 0; 51 | size_t matches = 0; 52 | size_t blockLength = trace.size(); 53 | int mappingQuality = 255; 54 | 55 | MergedNodePos currentPos; 56 | currentPos.nodeId = trace[0].DPposition.node; 57 | currentPos.reverse = (trace[0].DPposition.node % 2) == 1; 58 | currentPos.nodeOffset = trace[0].DPposition.nodeOffset; 59 | currentPos.seqPos = trace[0].DPposition.seqPos; 60 | EditType currentEdit = Empty; 61 | size_t mismatches = 0; 62 | size_t deletions = 0; 63 | size_t insertions = 0; 64 | size_t editLength = 0; 65 | if (cigarMatchMismatchMerge) 66 | { 67 | currentEdit = MatchOrMismatch; 68 | editLength = 1; 69 | if (Common::characterMatch(trace[0].sequenceCharacter, trace[0].graphCharacter)) 70 | { 71 | matches += 1; 72 | } 73 | else 74 | { 75 | mismatches += 1; 76 | } 77 | } 78 | else if (Common::characterMatch(trace[0].sequenceCharacter, trace[0].graphCharacter)) 79 | { 80 | currentEdit = Match; 81 | editLength = 1; 82 | matches += 1; 83 | } 84 | else 85 | { 86 | currentEdit = Mismatch; 87 | editLength = 1; 88 | mismatches += 1; 89 | } 90 | addPosToString(nodePath, currentPos, params); 91 | nodePathLen += params.graph.originalNodeSize.at(currentPos.nodeId); 92 | for (size_t pos = 1; pos < trace.size(); pos++) 93 | { 94 | assert(trace[pos].DPposition.seqPos < sequence.size()); 95 | MergedNodePos newPos; 96 | newPos.nodeId = trace[pos].DPposition.node; 97 | newPos.reverse = (trace[pos].DPposition.node % 2) == 1; 98 | newPos.nodeOffset = trace[pos].DPposition.nodeOffset; 99 | newPos.seqPos = trace[pos].DPposition.seqPos; 100 | 
bool insideNode = !trace[pos-1].nodeSwitch || (newPos.nodeId == currentPos.nodeId && newPos.reverse == currentPos.reverse && newPos.nodeOffset > currentPos.nodeOffset); 101 | 102 | assert(newPos.seqPos >= currentPos.seqPos); 103 | 104 | if (!insideNode) 105 | { 106 | size_t skippedBefore = params.graph.originalNodeSize.at(currentPos.nodeId) - 1 - trace[pos-1].DPposition.nodeOffset; 107 | currentPos = newPos; 108 | addPosToString(nodePath, currentPos, params); 109 | assert(trace[pos].DPposition.nodeOffset < params.graph.originalNodeSize.at(currentPos.nodeId)); 110 | size_t skippedAfter = trace[pos].DPposition.nodeOffset; 111 | nodePathLen += params.graph.originalNodeSize.at(currentPos.nodeId) - (skippedBefore + skippedAfter); 112 | } 113 | 114 | if (trace[pos-1].DPposition.seqPos == trace[pos].DPposition.seqPos) 115 | { 116 | if (currentEdit == Empty) currentEdit = Deletion; 117 | if (currentEdit != Deletion) 118 | { 119 | addCigarItem(cigar, editLength, currentEdit); 120 | currentEdit = Deletion; 121 | editLength = 0; 122 | } 123 | editLength += 1; 124 | deletions += 1; 125 | } 126 | else if (insideNode && trace[pos-1].DPposition.nodeOffset == trace[pos].DPposition.nodeOffset) 127 | { 128 | if (currentEdit == Empty) currentEdit = Insertion; 129 | if (currentEdit != Insertion) 130 | { 131 | addCigarItem(cigar, editLength, currentEdit); 132 | currentEdit = Insertion; 133 | editLength = 0; 134 | } 135 | editLength += 1; 136 | insertions += 1; 137 | } 138 | else if (cigarMatchMismatchMerge) 139 | { 140 | if (currentEdit == Empty) currentEdit = MatchOrMismatch; 141 | if (currentEdit != MatchOrMismatch) 142 | { 143 | addCigarItem(cigar, editLength, currentEdit); 144 | currentEdit = MatchOrMismatch; 145 | editLength = 0; 146 | } 147 | editLength += 1; 148 | if (Common::characterMatch(trace[pos].sequenceCharacter, trace[pos].graphCharacter)) 149 | { 150 | matches += 1; 151 | } 152 | else 153 | { 154 | mismatches += 1; 155 | } 156 | } 157 | else if 
(Common::characterMatch(trace[pos].sequenceCharacter, trace[pos].graphCharacter)) 158 | { 159 | if (currentEdit == Empty) currentEdit = Match; 160 | if (currentEdit != Match) 161 | { 162 | addCigarItem(cigar, editLength, currentEdit); 163 | currentEdit = Match; 164 | editLength = 0; 165 | } 166 | editLength += 1; 167 | matches += 1; 168 | } 169 | else 170 | { 171 | if (currentEdit == Empty) currentEdit = Mismatch; 172 | if (currentEdit != Mismatch) 173 | { 174 | addCigarItem(cigar, editLength, currentEdit); 175 | currentEdit = Mismatch; 176 | editLength = 0; 177 | } 178 | editLength += 1; 179 | mismatches += 1; 180 | } 181 | if (insideNode) 182 | { 183 | assert(trace[pos-1].nodeSwitch || newPos.nodeId == currentPos.nodeId); 184 | assert(trace[pos-1].nodeSwitch || newPos.reverse == currentPos.reverse); 185 | } 186 | } 187 | 188 | assert(matches + mismatches + deletions + insertions == trace.size()); 189 | addCigarItem(cigar, editLength, currentEdit); 190 | 191 | nodePathEnd = nodePathLen - (params.graph.originalNodeSize.at(trace.back().DPposition.node) - 1 - trace.back().DPposition.nodeOffset); 192 | 193 | std::stringstream sstr; 194 | sstr << readName << "\t" << readLen << "\t" << readStart << "\t" << readEnd << "\t" << (strand ? 
"+" : "-") << "\t" << nodePath.str() << "\t" << nodePathLen << "\t" << nodePathStart << "\t" << nodePathEnd << "\t" << matches << "\t" << blockLength << "\t" << mappingQuality; 195 | sstr << "\t" << "NM:i:" << (mismatches + deletions + insertions); 196 | sstr << "\t" << "dv:f:" << 1.0-((double)matches / (double)(matches + mismatches + deletions + insertions)); 197 | sstr << "\t" << "id:f:" << ((double)matches / (double)(matches + mismatches + deletions + insertions)); 198 | sstr << "\t" << "cg:Z:" << cigar.str(); 199 | return sstr.str(); 200 | } 201 | 202 | private: 203 | 204 | static void addPosToString(std::stringstream& str, MergedNodePos pos, const Params& params) 205 | { 206 | if (pos.reverse) 207 | { 208 | str << "<"; 209 | } 210 | else 211 | { 212 | str << ">"; 213 | } 214 | std::string nodeName = params.graph.originalNodeName.at(pos.nodeId); 215 | if (nodeName == "") 216 | { 217 | str << pos.nodeId/2; 218 | } 219 | else 220 | { 221 | str << nodeName; 222 | } 223 | } 224 | 225 | static void addCigarItem(std::stringstream& str, size_t editLength, EditType type) 226 | { 227 | if (editLength == 0) return; 228 | str << editLength; 229 | switch(type) 230 | { 231 | case MatchOrMismatch: 232 | str << "M"; 233 | break; 234 | case Match: 235 | str << "="; 236 | break; 237 | case Mismatch: 238 | str << "X"; 239 | break; 240 | case Insertion: 241 | str << "I"; 242 | break; 243 | case Deletion: 244 | str << "D"; 245 | break; 246 | case Empty: 247 | default: 248 | return; 249 | } 250 | } 251 | }; 252 | 253 | #endif -------------------------------------------------------------------------------- /src/GraphAlignerVGAlignment.h: -------------------------------------------------------------------------------- 1 | #ifndef GraphAlignerVGAlignment_h 2 | #define GraphAlignerVGAlignment_h 3 | 4 | #include 5 | #include 6 | #include "AlignmentGraph.h" 7 | #include "vg.pb.h" 8 | #include "NodeSlice.h" 9 | #include "CommonUtils.h" 10 | #include "ThreadReadAssertion.h" 11 | #include 
"GraphAlignerCommon.h" 12 | 13 | template 14 | class GraphAlignerVGAlignment 15 | { 16 | using Common = GraphAlignerCommon; 17 | using Params = typename Common::Params; 18 | using MatrixPosition = typename Common::MatrixPosition; 19 | using TraceItem = typename Common::TraceItem; 20 | struct MergedNodePos 21 | { 22 | int nodeId; 23 | bool reverse; 24 | size_t nodeOffset; 25 | size_t seqPos; 26 | }; 27 | enum EditType 28 | { 29 | Match, 30 | Mismatch, 31 | Insertion, 32 | Deletion, 33 | Empty 34 | }; 35 | public: 36 | 37 | static std::shared_ptr traceToAlignment(const std::string& seq_id, const std::string& sequence, ScoreType score, const std::vector& trace, size_t cellsProcessed, bool reverse) 38 | { 39 | if (trace.size() == 0) return nullptr; 40 | vg::Alignment* aln = new vg::Alignment; 41 | std::shared_ptr result { aln }; 42 | result->set_name(seq_id); 43 | result->set_score(score); 44 | result->set_sequence(sequence); 45 | auto path = new vg::Path; 46 | result->set_allocated_path(path); 47 | MergedNodePos currentPos; 48 | currentPos.nodeId = trace[0].DPposition.node; 49 | currentPos.reverse = (trace[0].DPposition.node % 2) == 1; 50 | currentPos.nodeOffset = trace[0].DPposition.nodeOffset; 51 | currentPos.seqPos = trace[0].DPposition.seqPos; 52 | int rank = 0; 53 | auto vgmapping = path->add_mapping(); 54 | auto position = new vg::Position; 55 | vgmapping->set_allocated_position(position); 56 | vgmapping->set_rank(rank); 57 | auto edit = vgmapping->add_edit(); 58 | EditType currentEdit = Empty; 59 | size_t mismatches = 0; 60 | size_t deletions = 0; 61 | size_t insertions = 0; 62 | size_t matches = 0; 63 | if (Common::characterMatch(trace[0].sequenceCharacter, trace[0].graphCharacter)) 64 | { 65 | currentEdit = Match; 66 | edit->set_from_length(edit->from_length()+1); 67 | edit->set_to_length(edit->to_length()+1); 68 | matches += 1; 69 | } 70 | else 71 | { 72 | currentEdit = Mismatch; 73 | edit->set_from_length(edit->from_length()+1); 74 | 
edit->set_to_length(edit->to_length()+1); 75 | edit->set_sequence(std::string { sequence[0] }); 76 | mismatches += 1; 77 | } 78 | position->set_node_id(currentPos.nodeId); 79 | position->set_is_reverse(currentPos.reverse); 80 | position->set_offset(currentPos.nodeOffset); 81 | for (size_t pos = 1; pos < trace.size(); pos++) 82 | { 83 | assert(trace[pos].DPposition.seqPos < sequence.size()); 84 | MergedNodePos newPos; 85 | newPos.nodeId = trace[pos].DPposition.node; 86 | newPos.reverse = (trace[pos].DPposition.node % 2) == 1; 87 | newPos.nodeOffset = trace[pos].DPposition.nodeOffset; 88 | newPos.seqPos = trace[pos].DPposition.seqPos; 89 | bool insideNode = !trace[pos-1].nodeSwitch || (newPos.nodeId == currentPos.nodeId && newPos.reverse == currentPos.reverse && newPos.nodeOffset > currentPos.nodeOffset); 90 | 91 | assert(newPos.seqPos >= currentPos.seqPos); 92 | 93 | if (!insideNode) 94 | { 95 | rank++; 96 | currentPos = newPos; 97 | vgmapping = path->add_mapping(); 98 | position = new vg::Position; 99 | vgmapping->set_allocated_position(position); 100 | vgmapping->set_rank(rank); 101 | position->set_offset(currentPos.nodeOffset); 102 | position->set_node_id(currentPos.nodeId); 103 | position->set_is_reverse(currentPos.reverse); 104 | edit = vgmapping->add_edit(); 105 | currentEdit = Empty; 106 | } 107 | 108 | if (trace[pos-1].DPposition.seqPos == trace[pos].DPposition.seqPos) 109 | { 110 | if (currentEdit == Empty) currentEdit = Deletion; 111 | if (currentEdit != Deletion) 112 | { 113 | edit = vgmapping->add_edit(); 114 | currentEdit = Deletion; 115 | } 116 | edit->set_from_length(edit->from_length()+1); 117 | deletions += 1; 118 | } 119 | else if (insideNode && trace[pos-1].DPposition.nodeOffset == trace[pos].DPposition.nodeOffset) 120 | { 121 | if (currentEdit == Empty) currentEdit = Insertion; 122 | if (currentEdit != Insertion) 123 | { 124 | edit = vgmapping->add_edit(); 125 | currentEdit = Insertion; 126 | } 127 | edit->set_to_length(edit->to_length()+1); 128 
| edit->set_sequence(edit->sequence() + trace[pos].sequenceCharacter); 129 | insertions += 1; 130 | } 131 | else if (Common::characterMatch(trace[pos].sequenceCharacter, trace[pos].graphCharacter)) 132 | { 133 | if (currentEdit == Empty) currentEdit = Match; 134 | if (currentEdit != Match) 135 | { 136 | edit = vgmapping->add_edit(); 137 | currentEdit = Match; 138 | } 139 | edit->set_from_length(edit->from_length()+1); 140 | edit->set_to_length(edit->to_length()+1); 141 | matches += 1; 142 | } 143 | else 144 | { 145 | if (currentEdit == Empty) currentEdit = Mismatch; 146 | if (currentEdit != Mismatch) 147 | { 148 | edit = vgmapping->add_edit(); 149 | currentEdit = Mismatch; 150 | } 151 | edit->set_from_length(edit->from_length()+1); 152 | edit->set_to_length(edit->to_length()+1); 153 | edit->set_sequence(edit->sequence() + trace[pos].sequenceCharacter); 154 | mismatches += 1; 155 | } 156 | if (insideNode) 157 | { 158 | assert(trace[pos-1].nodeSwitch || newPos.nodeId == currentPos.nodeId); 159 | assert(trace[pos-1].nodeSwitch || newPos.reverse == currentPos.reverse); 160 | } 161 | } 162 | result->set_identity((double)matches / (double)(matches + mismatches + insertions + deletions)); 163 | assert(currentEdit != Empty); 164 | return result; 165 | } 166 | 167 | static bool posEqual(const vg::Position& pos1, const vg::Position& pos2) 168 | { 169 | return pos1.node_id() == pos2.node_id() && pos1.is_reverse() == pos2.is_reverse(); 170 | } 171 | }; 172 | 173 | #endif -------------------------------------------------------------------------------- /src/GraphAlignerWrapper.cpp: -------------------------------------------------------------------------------- 1 | //split this here so modifying GraphAligner.h doesn't require recompiling every cpp file 2 | 3 | #include 4 | #include "GraphAlignerWrapper.h" 5 | #include "GraphAligner.h" 6 | #include "ThreadReadAssertion.h" 7 | 8 | AlignmentResult AlignOneWay(const AlignmentGraph& graph, const std::string& seq_id, const 
std::string& sequence, size_t initialBandwidth, size_t rampBandwidth, bool quietMode, GraphAlignerCommon::AlignerGraphsizedState& reusableState, bool lowMemory, bool forceGlobal, bool preciseClipping, bool nondeterministicOptimizations, double preciseClippingIdentityCutoff, int Xdropcutoff, size_t DPRestartStride) 9 | { 10 | GraphAlignerCommon::Params params {initialBandwidth, rampBandwidth, graph, std::numeric_limits::max(), quietMode, false, lowMemory, forceGlobal, preciseClipping, 1, 0, nondeterministicOptimizations, preciseClippingIdentityCutoff, Xdropcutoff}; 11 | GraphAligner aligner {params}; 12 | return aligner.AlignOneWay(seq_id, sequence, reusableState, DPRestartStride); 13 | } 14 | 15 | AlignmentResult AlignOneWayDijkstra(const AlignmentGraph& graph, const std::string& seq_id, const std::string& sequence, bool quietMode, GraphAlignerCommon::AlignerGraphsizedState& reusableState, bool forceGlobal, bool preciseClipping) 16 | { 17 | GraphAlignerCommon::Params params {1, 1, graph, std::numeric_limits::max(), quietMode, false, true, forceGlobal, preciseClipping, 1, 0, false, .5, 0}; 18 | GraphAligner aligner {params}; 19 | return aligner.AlignOneWayDijkstra(seq_id, sequence, reusableState); 20 | } 21 | 22 | AlignmentResult AlignOneWay(const AlignmentGraph& graph, const std::string& seq_id, const std::string& sequence, size_t initialBandwidth, size_t rampBandwidth, size_t maxCellsPerSlice, bool quietMode, bool sloppyOptimizations, const std::vector& seedHits, GraphAlignerCommon::AlignerGraphsizedState& reusableState, bool lowMemory, bool forceGlobal, bool preciseClipping, size_t minClusterSize, double seedExtendDensity, bool nondeterministicOptimizations, double preciseClippingIdentityCutoff, int Xdropcutoff, long long l, long long r, long long offset) 23 | { 24 | GraphAlignerCommon::Params params {initialBandwidth, rampBandwidth, graph, maxCellsPerSlice, quietMode, sloppyOptimizations, lowMemory, forceGlobal, preciseClipping, minClusterSize, 
seedExtendDensity, nondeterministicOptimizations, preciseClippingIdentityCutoff, Xdropcutoff}; 25 | GraphAligner aligner {params}; 26 | if (l == -1) l = 0; 27 | if (r == -1) r = seedHits.size(); 28 | return aligner.AlignOneWay(seq_id, sequence, seedHits, reusableState, l, r, offset); 29 | } 30 | 31 | void AddAlignment(const std::string& seq_id, const std::string& sequence, AlignmentResult::AlignmentItem& alignment) 32 | { 33 | GraphAlignerCommon::Params params {1, 1, AlignmentGraph::DummyGraph(), 1, true, true, true, false, false, 1, 0, false, .5, 0}; 34 | GraphAligner aligner {params}; 35 | aligner.AddAlignment(seq_id, sequence, alignment); 36 | } 37 | 38 | void AddGAFLine(const AlignmentGraph& graph, const std::string& seq_id, const std::string& sequence, AlignmentResult::AlignmentItem& alignment, bool cigarMatchMismatchMerge) 39 | { 40 | GraphAlignerCommon::Params params {1, 1, graph, 1, true, true, true, false, false, 1, 0, false, .5, 0}; 41 | GraphAligner aligner {params}; 42 | aligner.AddGAFLine(seq_id, sequence, alignment, cigarMatchMismatchMerge); 43 | } 44 | 45 | void AddCorrected(AlignmentResult::AlignmentItem& alignment) 46 | { 47 | GraphAlignerCommon::Params params {1, 1, AlignmentGraph::DummyGraph(), 1, true, true, true, false, false, 1, 0, false, .5, 0}; 48 | GraphAligner aligner {params}; 49 | aligner.AddCorrected(alignment); 50 | } 51 | 52 | void OrderSeeds(const AlignmentGraph& graph, std::vector& seedHits) 53 | { 54 | GraphAlignerCommon::Params params {1, 1, graph, 1, true, true, true, false, false, 1, 0, false, .5, 0}; 55 | GraphAligner aligner {params}; 56 | aligner.orderSeedsByChaining(seedHits); 57 | } 58 | 59 | 60 | void OrderSeedsCLC(const AlignmentGraph& graph, std::vector& seedHits) 61 | { 62 | GraphAlignerCommon::Params params {1, 1, graph, 1, true, true, true, false, false, 1, 0, false, .5, 0}; 63 | GraphAligner aligner {params}; 64 | aligner.orderSeedsByChainingCLC(seedHits); 65 | } 66 | 
/// One exact-match seed between a read and a graph node.
/// The alignment-graph coordinates start out as "unset"
/// (std::numeric_limits<size_t>::max()); the derived goodness and cluster
/// size start at 0.
class SeedHit
{
public:
	/// @param nodeID           node the seed lies on
	/// @param nodeOffset       start offset of the match inside the node
	/// @param seqPos           start position of the match in the read
	/// @param matchLen         length of the match
	/// @param rawSeedGoodness  seed score as reported by the seeder
	/// @param reverse          whether the seed is on the reverse orientation
	SeedHit(int nodeID, size_t nodeOffset, size_t seqPos, size_t matchLen, size_t rawSeedGoodness, bool reverse) :
	nodeID(nodeID),
	nodeOffset(nodeOffset),
	seqPos(seqPos),
	matchLen(matchLen),
	reverse(reverse),
	alignmentGraphNodeId(std::numeric_limits<size_t>::max()),
	alignmentGraphNodeOffset(std::numeric_limits<size_t>::max()),
	rawSeedGoodness(rawSeedGoodness),
	seedGoodness(0),
	seedClusterSize(0)
	{
	}
	int nodeID;
	size_t nodeOffset;
	size_t seqPos;
	size_t matchLen;
	bool reverse;
	size_t alignmentGraphNodeId;
	size_t alignmentGraphNodeOffset;
	size_t rawSeedGoodness;
	size_t seedGoodness;
	size_t seedClusterSize;
};
/// One read parsed from a MAF file.
/// Every field has a defined default so that an entry whose fields were only
/// partly filled in by the parser never exposes indeterminate values.
struct MafEntry {
	std::string readname;
	std::string realsequence;
	// start position and length on the reference; indexes into posToNode
	int startpos = 0;
	int length = 0;
	// true when the node path has to be reversed for this read
	bool backward = false;
};
std::reverse(nodeIds.begin(), nodeIds.end()); 33 | } 34 | vg::Alignment mafResult; 35 | mafResult.set_name(mafs[i].readname); 36 | auto path = new vg::Path; 37 | mafResult.set_allocated_path(path); 38 | for (size_t j = 0; j < nodeIds.size(); j++) 39 | { 40 | auto vgmapping = path->add_mapping(); 41 | auto position = new vg::Position; 42 | vgmapping->set_allocated_position(position); 43 | vgmapping->set_rank(j); 44 | position->set_node_id(nodeIds[j]); 45 | position->set_is_reverse(nodeIsReverse.at(nodeIds[j]) ^ mafs[i].backward); 46 | auto edit = vgmapping->add_edit(); 47 | edit->set_from_length(nodeSize.at(nodeIds[j])); 48 | } 49 | result.push_back(mafResult); 50 | } 51 | return result; 52 | } 53 | 54 | std::vector getMafEntries(std::string filename) 55 | { 56 | std::vector result; 57 | 58 | std::ifstream mafFile { filename }; 59 | while (mafFile.good()) 60 | { 61 | std::string line; 62 | std::getline(mafFile, line); 63 | std::string a, b, direction; 64 | if (line.size() == 0 || line[0] != 'a') continue; 65 | MafEntry maf; 66 | std::string checks, checkref; 67 | mafFile >> checks >> checkref; 68 | assert(checkref == "ref"); 69 | assert(checks == "s"); 70 | mafFile >> maf.startpos >> maf.length; 71 | mafFile >> a >> b; 72 | mafFile >> maf.realsequence; 73 | //https://stackoverflow.com/questions/20406744/how-to-find-and-replace-all-occurrences-of-a-substring-in-a-string 74 | std::string::size_type n = 0; 75 | while ((n = maf.realsequence.find("-", n)) != std::string::npos) 76 | { 77 | maf.realsequence.replace(n, 1, ""); 78 | } 79 | mafFile >> checks >> maf.readname; 80 | assert(checks == "s"); 81 | mafFile >> a >> b >> direction; 82 | if (direction == "-") 83 | { 84 | maf.realsequence = CommonUtils::ReverseComplement(maf.realsequence); 85 | } 86 | result.push_back(maf); 87 | } 88 | 89 | return result; 90 | } 91 | 92 | int main(int argc, char** argv) 93 | { 94 | vg::Graph graph = CommonUtils::LoadVGGraph(argv[1]); 95 | 96 | vg::Alignment referenceAlignment; 97 | { 98 
| std::ifstream referenceFile { argv[2], std::ios::in | std::ios::binary }; 99 | std::function lambda = [&referenceAlignment](vg::Alignment& g) { 100 | referenceAlignment = g; 101 | }; 102 | stream::for_each(referenceFile, lambda); 103 | } 104 | 105 | std::vector posToNode; 106 | std::map nodeIsReverse; 107 | std::map nodeSizes; 108 | 109 | for (int i = 0; i < referenceAlignment.path().mapping_size(); i++) 110 | { 111 | auto mapping = referenceAlignment.path().mapping(i); 112 | int currentNodeSize = mapping.edit(0).to_length(); 113 | for (int j = 0; j < currentNodeSize; j++) 114 | { 115 | posToNode.push_back(mapping.position().node_id()); 116 | } 117 | nodeIsReverse[mapping.position().node_id()] = mapping.position().is_reverse(); 118 | } 119 | 120 | for (int i = 0; i < graph.node_size(); i++) 121 | { 122 | nodeSizes[graph.node(i).id()] = graph.node(i).sequence().size(); 123 | } 124 | 125 | auto mafs = getMafEntries(argv[3]); 126 | auto alignments = mafsToAlignments(mafs, posToNode, nodeSizes, nodeIsReverse); 127 | 128 | std::ofstream alignmentOut { argv[4], std::ios::out | std::ios::binary }; 129 | stream::write_buffered(alignmentOut, alignments, 0); 130 | 131 | std::ofstream readsOut { argv[5], std::ios::out }; 132 | for (size_t i = 0; i < mafs.size(); i++) 133 | { 134 | readsOut << ">" << mafs[i].readname << std::endl << mafs[i].realsequence << std::endl; 135 | } 136 | 137 | } -------------------------------------------------------------------------------- /src/MinimizerSeeder.h: -------------------------------------------------------------------------------- 1 | #ifndef MinimizerSeeder_h 2 | #define MinimizerSeeder_h 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "AlignmentGraph.h" 11 | #include "GraphAlignerWrapper.h" 12 | #include "BooPHF.h" 13 | 14 | class MinimizerSeeder 15 | { 16 | struct KmerBucket 17 | { 18 | KmerBucket(); 19 | KmerBucket(const KmerBucket& other) = delete; 20 | KmerBucket(KmerBucket&& 
/// Normalizes one character of the indexed reference text.
/// a/c/g/t are returned in lower case, 'u'/'U' is folded into 't', and every
/// other character (including the '`' separator itself) becomes '`'.
char lowercaseRef(char c)
{
	switch(c)
	{
		case 'a':
		case 'A':
			return 'a';
		case 'c':
		case 'C':
			return 'c';
		case 'g':
		case 'G':
			return 'g';
		case 'u':
		case 'U':
		case 't':
		case 'T':
			return 't';
		default:
			// every path returns inside the switch, so nothing follows it
			return '`';
	}
}
case 'A': 40 | return 'a'; 41 | case 'c': 42 | case 'C': 43 | return 'c'; 44 | case 'g': 45 | case 'G': 46 | return 'g'; 47 | case 'u': 48 | case 'U': 49 | case 't': 50 | case 'T': 51 | return 't'; 52 | default: 53 | return 'x'; 54 | } 55 | assert(false); 56 | return std::numeric_limits::max(); 57 | } 58 | 59 | bool fileExists(const std::string& fileName) 60 | { 61 | std::ifstream file { fileName }; 62 | return file.good(); 63 | } 64 | 65 | MummerSeeder::MummerSeeder(const GfaGraph& graph, const std::string& cachePrefix) 66 | { 67 | if (cachePrefix.size() > 0 && fileExists(cachePrefix + ".aux")) 68 | { 69 | loadFrom(cachePrefix); 70 | } 71 | else 72 | { 73 | initTree(graph); 74 | if (cachePrefix.size() > 0) saveTo(cachePrefix); 75 | } 76 | } 77 | 78 | MummerSeeder::MummerSeeder(const vg::Graph& graph, const std::string& cachePrefix) 79 | { 80 | if (cachePrefix.size() > 0 && fileExists(cachePrefix + ".aux")) 81 | { 82 | loadFrom(cachePrefix); 83 | } 84 | else 85 | { 86 | initTree(graph); 87 | if (cachePrefix.size() > 0) saveTo(cachePrefix); 88 | } 89 | } 90 | 91 | void MummerSeeder::initTree(const GfaGraph& graph) 92 | { 93 | for (auto node : graph.nodes) 94 | { 95 | nodePositions.push_back(seq.size()); 96 | nodeIDs.push_back(node.first); 97 | seq += node.second; 98 | seq += '`'; 99 | } 100 | nodePositions.push_back(seq.size()); 101 | for (size_t i = 0; i < seq.size(); i++) 102 | { 103 | seq[i] = lowercaseRef(seq[i]); 104 | } 105 | seq.shrink_to_fit(); 106 | matcher = std::make_unique(mummer::mummer::sparseSA::create_auto(seq.c_str(), seq.size(), 0, true)); 107 | } 108 | 109 | void MummerSeeder::initTree(const vg::Graph& graph) 110 | { 111 | for (int i = 0; i < graph.node_size(); i++) 112 | { 113 | nodePositions.push_back(seq.size()); 114 | nodeIDs.push_back(graph.node(i).id()); 115 | seq += graph.node(i).sequence(); 116 | seq += '`'; 117 | } 118 | nodePositions.push_back(seq.size()); 119 | for (size_t i = 0; i < seq.size(); i++) 120 | { 121 | seq[i] = 
lowercaseRef(seq[i]); 122 | } 123 | seq.shrink_to_fit(); 124 | matcher = std::make_unique(mummer::mummer::sparseSA::create_auto(seq.c_str(), seq.size(), 0, true)); 125 | } 126 | 127 | size_t MummerSeeder::getNodeIndex(size_t indexPos) const 128 | { 129 | assert(indexPos < nodePositions.back()); 130 | auto next = std::upper_bound(nodePositions.begin(), nodePositions.end(), indexPos); 131 | assert(next != nodePositions.begin()); 132 | size_t index = (next - nodePositions.begin()) - 1; 133 | assert(index < nodePositions.size()-1); 134 | return index; 135 | } 136 | 137 | void MummerSeeder::saveTo(const std::string& prefix) const 138 | { 139 | std::ofstream file { prefix + ".aux", std::ios::binary }; 140 | { 141 | boost::archive::text_oarchive oa(file); 142 | oa << seq; 143 | oa << nodePositions; 144 | oa << nodeIDs; 145 | } 146 | matcher->save(prefix + "_index"); 147 | } 148 | 149 | void MummerSeeder::loadFrom(const std::string& prefix) 150 | { 151 | std::ifstream file { prefix + ".aux", std::ios::binary }; 152 | { 153 | boost::archive::text_iarchive ia(file); 154 | ia >> seq; 155 | ia >> nodePositions; 156 | ia >> nodeIDs; 157 | } 158 | // same params that create_auto with minlen=0 passes 159 | matcher = std::make_unique(seq, false, 1, true, false, false, 1, 0, true); 160 | matcher->load(prefix + "_index"); 161 | } 162 | 163 | struct MatchWithOrientation 164 | { 165 | MatchWithOrientation(const mummer::mummer::match_t& match, bool reverse) : 166 | match(match), 167 | reverse(reverse) 168 | { 169 | } 170 | mummer::mummer::match_t match; 171 | bool reverse; 172 | bool operator>(const MatchWithOrientation& other) const 173 | { 174 | return match.len > other.match.len; 175 | } 176 | }; 177 | 178 | std::vector MummerSeeder::getMumSeeds(std::string sequence, size_t maxCount, size_t minLen) const 179 | { 180 | for (size_t i = 0; i < sequence.size(); i++) 181 | { 182 | sequence[i] = lowercaseSeq(sequence[i]); 183 | } 184 | assert(matcher != nullptr); 185 | 
std::priority_queue, std::greater> matches; 186 | matcher->findMAM_each(sequence, minLen, false, [&matches, maxCount](const mummer::mummer::match_t& match) 187 | { 188 | if (matches.size() < maxCount) 189 | { 190 | matches.emplace(match, false); 191 | return; 192 | } 193 | if (matches.top().match.len < match.len) 194 | { 195 | matches.pop(); 196 | matches.emplace(match, false); 197 | } 198 | }); 199 | revcompInPlace(sequence); 200 | matcher->findMAM_each(sequence, minLen, false, [&matches, maxCount](const mummer::mummer::match_t& match) 201 | { 202 | if (matches.size() < maxCount) 203 | { 204 | matches.emplace(match, true); 205 | return; 206 | } 207 | if (matches.top().match.len < match.len) 208 | { 209 | matches.pop(); 210 | matches.emplace(match, true); 211 | } 212 | }); 213 | std::vector MAMs; 214 | std::vector bwMAMs; 215 | while (matches.size() > 0) 216 | { 217 | if (matches.top().reverse) 218 | { 219 | bwMAMs.push_back(matches.top().match); 220 | } 221 | else 222 | { 223 | MAMs.push_back(matches.top().match); 224 | } 225 | matches.pop(); 226 | } 227 | auto seeds = matchesToSeeds(sequence.size(), MAMs, bwMAMs); 228 | assert(seeds.size() <= maxCount); 229 | std::sort(seeds.begin(), seeds.end(), [](const SeedHit& left, const SeedHit& right) { return left.matchLen > right.matchLen; }); 230 | return seeds; 231 | } 232 | 233 | std::vector MummerSeeder::getMemSeeds(std::string sequence, size_t maxCount, size_t minLen) const 234 | { 235 | for (size_t i = 0; i < sequence.size(); i++) 236 | { 237 | sequence[i] = lowercaseSeq(sequence[i]); 238 | } 239 | assert(matcher != nullptr); 240 | std::priority_queue, std::greater> matches; 241 | matcher->findMEM_each(sequence, minLen, false, [&matches, maxCount](const mummer::mummer::match_t& match) 242 | { 243 | if (matches.size() < maxCount) 244 | { 245 | matches.emplace(match, false); 246 | return; 247 | } 248 | if (matches.top().match.len < match.len) 249 | { 250 | matches.pop(); 251 | matches.emplace(match, false); 252 | } 
253 | }); 254 | revcompInPlace(sequence); 255 | matcher->findMEM_each(sequence, minLen, false, [&matches, maxCount](const mummer::mummer::match_t& match) 256 | { 257 | if (matches.size() < maxCount) 258 | { 259 | matches.emplace(match, true); 260 | return; 261 | } 262 | if (matches.top().match.len < match.len) 263 | { 264 | matches.pop(); 265 | matches.emplace(match, true); 266 | } 267 | }); 268 | std::vector MAMs; 269 | std::vector bwMAMs; 270 | while (matches.size() > 0) 271 | { 272 | if (matches.top().reverse) 273 | { 274 | bwMAMs.push_back(matches.top().match); 275 | } 276 | else 277 | { 278 | MAMs.push_back(matches.top().match); 279 | } 280 | matches.pop(); 281 | } 282 | auto seeds = matchesToSeeds(sequence.size(), MAMs, bwMAMs); 283 | assert(seeds.size() <= maxCount); 284 | std::sort(seeds.begin(), seeds.end(), [](const SeedHit& left, const SeedHit& right) { return left.matchLen > right.matchLen; }); 285 | return seeds; 286 | } 287 | 288 | std::vector MummerSeeder::matchesToSeeds(size_t seqLen, const std::vector& fwmatches, const std::vector& bwmatches) const 289 | { 290 | std::vector result; 291 | result.reserve(fwmatches.size() + bwmatches.size()); 292 | for (auto match : fwmatches) 293 | { 294 | assert(match.ref + match.len <= nodePositions.back()); 295 | auto index = getNodeIndex(match.ref); 296 | int nodeID = nodeIDs[index]; 297 | size_t nodeOffset = match.ref - nodePositions[index]; 298 | size_t seqPos = match.query; 299 | size_t matchLen = match.len; 300 | result.emplace_back(nodeID, nodeOffset, seqPos, matchLen, matchLen, false); 301 | } 302 | for (auto match : bwmatches) 303 | { 304 | assert(match.ref + match.len <= nodePositions.back()); 305 | auto index = getNodeIndex(match.ref); 306 | int nodeID = nodeIDs[index]; 307 | size_t nodeOffset = match.ref - nodePositions[index]; 308 | size_t seqPos = match.query; 309 | size_t matchLen = match.len; 310 | assert(match.len > 0); 311 | assert(nodeOffset + matchLen <= nodeLength(index)); 312 | assert(seqPos + 
matchLen <= seqLen); 313 | nodeOffset = nodeLength(index) - nodeOffset - matchLen; 314 | seqPos = seqLen - seqPos - matchLen; 315 | assert(nodeOffset < nodeLength(index)); 316 | assert(seqPos < seqLen); 317 | result.emplace_back(nodeID, nodeOffset, seqPos, matchLen, matchLen, true); 318 | } 319 | return result; 320 | } 321 | 322 | size_t MummerSeeder::nodeLength(size_t indexPos) const 323 | { 324 | //-1 for separator 325 | return nodePositions[indexPos+1] - nodePositions[indexPos] - 1; 326 | } 327 | 328 | void MummerSeeder::revcompInPlace(std::string& seq) const 329 | { 330 | std::reverse(seq.begin(), seq.end()); 331 | for (size_t i = 0; i < seq.size(); i++) 332 | { 333 | switch(seq[i]) 334 | { 335 | case 'a': 336 | seq[i] = 't'; 337 | break; 338 | case 'u': 339 | case 't': 340 | seq[i] = 'a'; 341 | break; 342 | case 'c': 343 | seq[i] = 'g'; 344 | break; 345 | case 'g': 346 | seq[i] = 'c'; 347 | break; 348 | default: 349 | seq[i] = 'x'; 350 | break; 351 | } 352 | } 353 | } 354 | -------------------------------------------------------------------------------- /src/MummerSeeder.h: -------------------------------------------------------------------------------- 1 | #ifndef MummerSeeder_h 2 | #define MummerSeeder_h 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "GfaGraph.h" 9 | #include "GraphAlignerWrapper.h" 10 | #include "vg.pb.h" 11 | 12 | class MummerSeeder 13 | { 14 | public: 15 | MummerSeeder(const GfaGraph& graph, const std::string& cachePrefix); 16 | MummerSeeder(const vg::Graph& graph, const std::string& cachePrefix); 17 | std::vector getMemSeeds(std::string sequence, size_t maxCount, size_t minLen) const; 18 | std::vector getMumSeeds(std::string sequence, size_t maxCount, size_t minLen) const; 19 | private: 20 | std::vector matchesToSeeds(size_t seqLen, const std::vector& fwmatches, const std::vector& bwmatches) const; 21 | void revcompInPlace(std::string& seq) const; 22 | size_t getNodeIndex(size_t indexPos) const; 23 | size_t 
nodeLength(size_t indexPos) const; 24 | void initTree(const GfaGraph& graph); 25 | void initTree(const vg::Graph& graph); 26 | void saveTo(const std::string& cachePrefix) const; 27 | void loadFrom(const std::string& cachePrefix); 28 | std::string seq; 29 | std::unique_ptr matcher; 30 | std::vector nodePositions; 31 | std::vector nodeIDs; 32 | }; 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/NodePosCsv.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "CommonUtils.h" 7 | 8 | int main(int argc, char** argv) 9 | { 10 | std::string infilename {argv[1]}; 11 | std::string outfilename {argv[2]}; 12 | 13 | std::unordered_map>>> positions; 14 | std::unordered_map> minRepeatCounts; 15 | auto alignments = CommonUtils::LoadVGAlignments(infilename); 16 | std::unordered_set alignmentNames; 17 | for (auto aln : alignments) 18 | { 19 | alignmentNames.insert(aln.name()); 20 | int pos = aln.query_position(); 21 | for (size_t i = 0; i < aln.path().mapping_size(); i++) 22 | { 23 | auto mapping = aln.path().mapping(i); 24 | positions[mapping.position().node_id()][aln.name()].emplace_back(pos, pos+mapping.edit(0).to_length()); 25 | pos += mapping.edit(0).to_length(); 26 | minRepeatCounts[mapping.position().node_id()][aln.name()] += 1; 27 | } 28 | } 29 | std::vector readnames { alignmentNames.begin(), alignmentNames.end() }; 30 | std::sort(readnames.begin(), readnames.end()); 31 | std::ofstream out {outfilename}; 32 | out << "node,_numreads,_minalnrepeatcount,_traversingreads"; 33 | for (auto read : readnames) 34 | { 35 | out << "," << read; 36 | } 37 | out << std::endl; 38 | std::vector nodevec; 39 | for (auto node : positions) 40 | { 41 | nodevec.push_back(node.first); 42 | } 43 | std::sort(nodevec.begin(), nodevec.end()); 44 | for (auto node : nodevec) 45 | { 46 | out << node; 47 | out << "," << 
positions[node].size(); 48 | int minRepeatCount = 0; 49 | for (auto pair : minRepeatCounts[node]) 50 | { 51 | minRepeatCount = std::max(minRepeatCount, pair.second); 52 | } 53 | out << "," << minRepeatCount; 54 | out << ","; 55 | bool first = true; 56 | for (auto read : positions[node]) 57 | { 58 | if (read.second.size() > 0) 59 | { 60 | if (!first) out << ";"; 61 | out << read.first; 62 | first = false; 63 | } 64 | } 65 | for (auto read : readnames) 66 | { 67 | out << ","; 68 | if (positions[node].count(read) == 1) 69 | { 70 | for (size_t i = 0; i < positions[node][read].size(); i++) 71 | { 72 | if (i > 0) out << ";"; 73 | out << positions[node][read][i].first << "-" << positions[node][read][i].second; 74 | } 75 | } 76 | } 77 | out << std::endl; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/PickAdjacentAlnPairs.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "vg.pb.h" 4 | #include "stream.hpp" 5 | #include "CommonUtils.h" 6 | #include "fastqloader.h" 7 | 8 | void outputPairs(std::ofstream& alignmentOut, const std::string& readname, const std::unordered_map& readLens, const std::vector& starts, const std::vector& ends, const int maxSplitDist, const int minPartialLen) 9 | { 10 | std::vector pairs; 11 | size_t currentPairNum = 0; 12 | for (auto& start : starts) 13 | { 14 | assert(start.query_position() == 0); 15 | int startEnd = 0; 16 | for (int i = 0; i < start.path().mapping_size(); i++) 17 | { 18 | startEnd += start.path().mapping(i).edit(0).to_length(); 19 | } 20 | assert(startEnd >= minPartialLen); 21 | for (auto& end : ends) 22 | { 23 | int endStart = end.query_position(); 24 | if (abs(startEnd-endStart) > maxSplitDist) continue; 25 | vg::Alignment left { start }; 26 | vg::Alignment right { end }; 27 | left.set_name(readname + "_pair" + std::to_string(currentPairNum) + "_1"); 28 | right.set_name(readname + "_pair" + 
std::to_string(currentPairNum) + "_2"); 29 | pairs.push_back(std::move(left)); 30 | pairs.push_back(std::move(right)); 31 | currentPairNum++; 32 | } 33 | } 34 | if (pairs.size() > 0) stream::write_buffered(alignmentOut, pairs, 0); 35 | } 36 | 37 | void pickAndWritePairs(std::string inputFile, std::string outputFile, const std::unordered_map& readLens, const int maxSplitDist, const int minPartialLen) 38 | { 39 | std::ofstream alignmentOut { outputFile, std::ios::out | std::ios::binary }; 40 | std::string currentRead; 41 | std::vector starts; 42 | std::vector ends; 43 | 44 | std::ifstream alignmentIn { inputFile, std::ios::in | std::ios::binary }; 45 | std::function lambda = [&alignmentOut, &readLens, ¤tRead, &starts, &ends, maxSplitDist, minPartialLen](vg::Alignment& aln) { 46 | if (aln.name() != currentRead) 47 | { 48 | outputPairs(alignmentOut, currentRead, readLens, starts, ends, maxSplitDist, minPartialLen); 49 | starts.clear(); 50 | ends.clear(); 51 | currentRead = aln.name(); 52 | } 53 | assert(readLens.count(aln.name()) == 1); 54 | size_t alnlen = 0; 55 | for (int i = 0; i < aln.path().mapping_size(); i++) 56 | { 57 | alnlen += aln.path().mapping(i).edit(0).to_length(); 58 | } 59 | if (alnlen < minPartialLen) return; 60 | if (aln.query_position() == 0) 61 | { 62 | starts.push_back(aln); 63 | } 64 | if (aln.query_position() + alnlen == readLens.at(aln.name())) 65 | { 66 | ends.push_back(aln); 67 | } 68 | }; 69 | stream::for_each(alignmentIn, lambda); 70 | outputPairs(alignmentOut, currentRead, readLens, starts, ends, maxSplitDist, minPartialLen); 71 | } 72 | 73 | std::unordered_map getReadLens(std::string filename) 74 | { 75 | std::unordered_map result; 76 | FastQ::streamFastqFromFile(filename, false, [&result](const FastQ& read) 77 | { 78 | result[read.seq_id] = read.sequence.size(); 79 | }); 80 | return result; 81 | } 82 | 83 | int main(int argc, char** argv) 84 | { 85 | std::string inputAlns { argv[1] }; 86 | int maxSplitDist = std::stoi(argv[2]); 87 | 
std::string readFile { argv[3] }; 88 | std::string outputAlns { argv[4] }; 89 | int minPartialLen = std::stoi(argv[5]); 90 | 91 | auto readLens = getReadLens(readFile); 92 | pickAndWritePairs(inputAlns, outputAlns, readLens, maxSplitDist, minPartialLen); 93 | } -------------------------------------------------------------------------------- /src/PickMummerSeeds.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "vg.pb.h" 8 | #include "stream.hpp" 9 | #include "CommonUtils.h" 10 | #include "fastqloader.h" 11 | 12 | struct MummerSeed 13 | { 14 | size_t readpos; 15 | size_t len; 16 | int nodeId; 17 | size_t nodepos; 18 | bool reverse; 19 | }; 20 | 21 | class AlignmentLengthCompare 22 | { 23 | public: 24 | bool operator()(const MummerSeed& left, const MummerSeed& right) const 25 | { 26 | return left.len > right.len; 27 | } 28 | }; 29 | 30 | vg::Alignment createAlignment(const std::string& readname, const MummerSeed& seed) 31 | { 32 | vg::Alignment result; 33 | result.set_name(readname); 34 | result.set_query_position(seed.readpos); 35 | auto path = new vg::Path; 36 | result.set_allocated_path(path); 37 | auto vgmapping = path->add_mapping(); 38 | auto position = new vg::Position; 39 | vgmapping->set_allocated_position(position); 40 | position->set_node_id(seed.nodeId); 41 | position->set_is_reverse(seed.reverse); 42 | position->set_offset(seed.nodepos); 43 | auto edit = vgmapping->add_edit(); 44 | edit->set_from_length(seed.len); 45 | edit->set_to_length(seed.len); 46 | return result; 47 | } 48 | 49 | int getNodeIndex(size_t pos, const std::vector& nodeMappingPositions) 50 | { 51 | auto iter = std::upper_bound(nodeMappingPositions.begin(), nodeMappingPositions.end(), pos); 52 | int index = iter - nodeMappingPositions.begin(); 53 | assert(index > 0); 54 | assert(index <= nodeMappingPositions.size()); 55 | return index-1; 56 | } 57 | 58 | int 
main(int argc, char** argv) 59 | { 60 | std::string outputFileName { argv[1] }; 61 | std::string gfaReferenceFilename { argv[2] }; 62 | int maxSeeds = std::stoi(argv[3]); 63 | std::string readFile { argv[4] }; 64 | std::unordered_map readLengths; 65 | std::unordered_map nodeLengths; 66 | 67 | { 68 | auto reads = loadFastqFromFile(readFile); 69 | for (auto read : reads) 70 | { 71 | readLengths[read.seq_id] = read.sequence.size(); 72 | } 73 | } 74 | { 75 | auto reads = loadFastqFromFile(gfaReferenceFilename); 76 | for (size_t i = 0; i < reads.size(); i++) 77 | { 78 | nodeLengths[std::stoi(reads[i].seq_id)] = reads[i].sequence.size(); 79 | } 80 | } 81 | 82 | std::unordered_map, AlignmentLengthCompare>> alignments; 83 | size_t numElems = 0; 84 | std::string currentRead; 85 | std::string line; 86 | bool currentReverse = false; 87 | size_t currentReadLength; 88 | std::priority_queue, AlignmentLengthCompare>* currentQueue; 89 | while (std::getline(std::cin, line)) 90 | { 91 | if (line[0] == '>') 92 | { 93 | if (line.size() > 8 && (std::string{line.end()-8, line.end()} == " Reverse" || std::string{line.end()-8, line.end()} == "_Reverse")) 94 | { 95 | currentReverse = true; 96 | currentRead = std::string { line.begin()+2, line.end()-8 }; 97 | } 98 | else 99 | { 100 | currentReverse = false; 101 | currentRead = std::string { line.begin()+2, line.end() }; 102 | } 103 | currentReadLength = readLengths[currentRead]; 104 | currentQueue = &alignments[currentRead]; 105 | } 106 | else 107 | { 108 | std::stringstream str { line }; 109 | MummerSeed newSeed; 110 | str >> newSeed.nodeId >> newSeed.nodepos >> newSeed.readpos >> newSeed.len; 111 | newSeed.reverse = currentReverse; 112 | assert(newSeed.nodepos >= 1); 113 | assert(newSeed.readpos >= 1); 114 | newSeed.nodepos -= 1; 115 | newSeed.readpos -= 1; 116 | if (currentReverse) 117 | { 118 | //there's some weird bug, possibly even in mummer 119 | //ignore it until we figure out what's going on 120 | if (newSeed.nodepos > 
nodeLengths[newSeed.nodeId] - 1) continue; 121 | if (newSeed.readpos > currentReadLength - 1) continue; 122 | assert(newSeed.nodepos <= nodeLengths[newSeed.nodeId] - 1); 123 | assert(newSeed.readpos <= currentReadLength - 1); 124 | newSeed.nodepos = nodeLengths[newSeed.nodeId] - 1 - newSeed.nodepos; 125 | newSeed.readpos = currentReadLength - 1 - newSeed.readpos; 126 | } 127 | //there's some weird bug, possibly even in mummer 128 | //ignore it until we figure out what's going on 129 | if (newSeed.readpos >= currentReadLength) continue; 130 | if (newSeed.nodepos >= nodeLengths[newSeed.nodeId]) continue; 131 | assert(newSeed.readpos < currentReadLength); 132 | assert(newSeed.nodepos < nodeLengths[newSeed.nodeId]); 133 | assert(newSeed.readpos >= 0); 134 | assert(newSeed.nodepos >= 0); 135 | if (currentQueue->size() < maxSeeds) 136 | { 137 | currentQueue->emplace(newSeed); 138 | numElems++; 139 | } 140 | else if (AlignmentLengthCompare{}(newSeed, currentQueue->top())) 141 | { 142 | currentQueue->pop(); 143 | currentQueue->emplace(newSeed); 144 | } 145 | } 146 | } 147 | std::vector writeAlignments; 148 | writeAlignments.reserve(numElems); 149 | std::vector insertAlns; 150 | insertAlns.reserve(maxSeeds); 151 | for (auto& pair : alignments) 152 | { 153 | insertAlns.clear(); 154 | while (pair.second.size() > 0) 155 | { 156 | auto aln = createAlignment(pair.first, pair.second.top()); 157 | insertAlns.push_back(aln); 158 | pair.second.pop(); 159 | } 160 | std::reverse(insertAlns.begin(), insertAlns.end()); 161 | writeAlignments.insert(writeAlignments.end(), insertAlns.begin(), insertAlns.end()); 162 | } 163 | assert(writeAlignments.size() == numElems); 164 | std::ofstream alignmentOut { outputFileName, std::ios::out | std::ios::binary }; 165 | stream::write_buffered(alignmentOut, writeAlignments, 0); 166 | } -------------------------------------------------------------------------------- /src/Postprocess.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include //https://github.com/cameron314/concurrentqueue 7 | #include "vg.pb.h" 8 | #include "stream.hpp" 9 | #include "fastqloader.h" 10 | #include "CommonUtils.h" 11 | #include "AlignmentSelection.h" 12 | 13 | std::atomic readingDone; 14 | std::atomic splittingDone; 15 | std::vector cleanup; 16 | 17 | size_t allAlnsCount = 0; 18 | size_t selectedAlnCount = 0; 19 | size_t fullLengthAlnCount = 0; 20 | size_t readsWithAnAlnCount = 0; 21 | size_t bpInReads = 0; 22 | size_t bpInSelected = 0; 23 | size_t bpInFull = 0; 24 | 25 | std::unordered_map getReadLengths(std::string readFile) 26 | { 27 | std::unordered_map result; 28 | auto reads = loadFastqFromFile(readFile); 29 | for (auto read : reads) 30 | { 31 | result[read.seq_id] = read.sequence.size(); 32 | } 33 | return result; 34 | } 35 | 36 | void loadAlignments(std::string filename, moodycamel::ConcurrentQueue& output) 37 | { 38 | vg::Alignment* current[100]; 39 | size_t countCurrent = 0; 40 | std::ifstream alnFile { filename, std::ios::in | std::ios::binary }; 41 | std::function lambda = [&cleanup, &output, ¤t, &countCurrent](vg::Alignment& g) { 42 | vg::Alignment* ptr = new vg::Alignment; 43 | *ptr = g; 44 | cleanup.push_back(ptr); 45 | current[countCurrent] = ptr; 46 | countCurrent++; 47 | if (countCurrent == 100) 48 | { 49 | output.enqueue_bulk(current, 100); 50 | countCurrent = 0; 51 | } 52 | }; 53 | stream::for_each(alnFile, lambda); 54 | if (countCurrent > 0) 55 | { 56 | output.enqueue_bulk(current, countCurrent); 57 | } 58 | 59 | readingDone = true; 60 | } 61 | 62 | void splitAlignmentsIntoSelectedAndFullLength(const std::unordered_map& readLengths, moodycamel::ConcurrentQueue& inputAlns, moodycamel::ConcurrentQueue& outputSelected, moodycamel::ConcurrentQueue& outputFullLength) 63 | { 64 | vg::Alignment* alns[100] {}; 65 | 66 | std::unordered_map> alnsPerRead; 
67 | while (true) 68 | { 69 | size_t gotAlns = inputAlns.try_dequeue_bulk(alns, 100); 70 | if (gotAlns == 0) 71 | { 72 | if (readingDone) break; 73 | std::this_thread::sleep_for(std::chrono::milliseconds(10)); 74 | continue; 75 | } 76 | for (size_t i = 0; i < gotAlns; i++) 77 | { 78 | alnsPerRead[alns[i]->name()].push_back(alns[i]); 79 | } 80 | } 81 | 82 | AlignmentSelection::SelectionOptions options; 83 | options.method = AlignmentSelection::SelectionMethod::GreedyLength; 84 | for (auto pair : alnsPerRead) 85 | { 86 | auto selected = AlignmentSelection::SelectAlignments(pair.second, options); 87 | outputSelected.enqueue_bulk(selected.data(), selected.size()); 88 | allAlnsCount += pair.second.size(); 89 | selectedAlnCount += selected.size(); 90 | for (auto ptr : selected) 91 | { 92 | bpInSelected += ptr->sequence().size(); 93 | } 94 | if (selected[0]->sequence().size() >= readLengths.at(pair.first) - 1) 95 | { 96 | outputFullLength.enqueue(selected[0]); 97 | bpInFull += selected[0]->sequence().size(); 98 | fullLengthAlnCount += 1; 99 | } 100 | } 101 | 102 | readsWithAnAlnCount = alnsPerRead.size(); 103 | 104 | splittingDone = true; 105 | } 106 | 107 | void writeAlignments(std::string filename, moodycamel::ConcurrentQueue& inputAlns) 108 | { 109 | std::ofstream outfile { filename, std::ios::out | std::ios::binary }; 110 | 111 | std::vector alns; 112 | alns.resize(1000, nullptr); 113 | 114 | while (true) 115 | { 116 | alns.resize(1000); 117 | size_t gotAlns = inputAlns.try_dequeue_bulk(alns.data(), 1000); 118 | if (gotAlns == 0) 119 | { 120 | if (splittingDone) break; 121 | std::this_thread::sleep_for(std::chrono::milliseconds(10)); 122 | continue; 123 | } 124 | alns.resize(gotAlns); 125 | stream::write_buffered_ptr(outfile, alns, 0); 126 | } 127 | } 128 | 129 | int main(int argc, char** argv) 130 | { 131 | std::string rawAlnFile { argv[1] }; 132 | std::string readsFile { argv[2] }; 133 | std::string outputSelectedAlnFile { argv[3] }; 134 | std::string 
outputFullLengthAlnFile { argv[4] }; 135 | std::string outputSummaryFile { argv[5] }; 136 | 137 | readingDone = false; 138 | splittingDone = false; 139 | 140 | auto readLengths = getReadLengths(readsFile); 141 | 142 | moodycamel::ConcurrentQueue readToSplitting; 143 | moodycamel::ConcurrentQueue splitToSelected; 144 | moodycamel::ConcurrentQueue splitToFullLength; 145 | 146 | std::thread readThread {[&rawAlnFile, &readToSplitting](){loadAlignments(rawAlnFile, readToSplitting);}}; 147 | std::thread splitter {[&readToSplitting, &splitToSelected, &splitToFullLength, &readLengths](){splitAlignmentsIntoSelectedAndFullLength(readLengths, readToSplitting, splitToSelected, splitToFullLength);}}; 148 | std::thread selectedWriter {[&splitToSelected, &outputSelectedAlnFile](){writeAlignments(outputSelectedAlnFile, splitToSelected);}}; 149 | std::thread fullLengthWriter {[&splitToFullLength, &outputFullLengthAlnFile](){writeAlignments(outputFullLengthAlnFile, splitToFullLength);}}; 150 | 151 | readThread.join(); 152 | splitter.join(); 153 | selectedWriter.join(); 154 | fullLengthWriter.join(); 155 | 156 | for (auto aln : cleanup) 157 | { 158 | delete aln; 159 | } 160 | 161 | for (auto pair : readLengths) 162 | { 163 | bpInReads += pair.second; 164 | } 165 | 166 | std::ofstream summary {outputSummaryFile}; 167 | summary << readLengths.size() << "\tnumber of reads" << std::endl; 168 | summary << selectedAlnCount << "\tnumber of selected alignments" << std::endl; 169 | summary << fullLengthAlnCount << "\tnumber of full length alignments" << std::endl; 170 | summary << readsWithAnAlnCount << "\treads with an alignment" << std::endl; 171 | summary << bpInReads << "\tbp in reads" << std::endl; 172 | summary << bpInSelected << "\tbp in selected alignments" << std::endl; 173 | summary << bpInFull << "\tbp in full length alignments" << std::endl; 174 | } -------------------------------------------------------------------------------- /src/ReadCorrection.cpp: 
// Returns the length of the longest suffix of `left` that is also a prefix of
// `right`, considering at most `maxOverlap` characters.
//
//   left, right - the two strings to overlap
//   maxOverlap  - upper bound on the overlap length; it is additionally
//                 clamped to the length of the shorter string
// Returns 0 when no non-empty overlap exists (including when either string is
// empty or maxOverlap is 0).
size_t getLongestOverlap(const std::string& left, const std::string& right, size_t maxOverlap)
{
	if (left.size() < maxOverlap) maxOverlap = left.size();
	if (right.size() < maxOverlap) maxOverlap = right.size();
	// try the longest candidate first so the first hit is the answer
	for (size_t len = maxOverlap; len > 0; len--)
	{
		// the candidate window is the LAST `len` characters of `left`; it must
		// move with `len`, not stay anchored at the maximum overlap
		if (left.compare(left.size() - len, len, right, 0, len) == 0) return len;
	}
	return 0;
}
-------------------------------------------------------------------------------- /src/ReadCorrection.h: -------------------------------------------------------------------------------- 1 | #ifndef ReadCorrection_h 2 | #define ReadCorrection_h 3 | 4 | #include 5 | #include 6 | 7 | struct Correction 8 | { 9 | size_t startIndex; 10 | size_t endIndex; 11 | std::string corrected; 12 | }; 13 | 14 | std::string getCorrected(const std::string& raw, const std::vector& corrections, size_t maxOverlap); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/ReverseReads.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "fastqloader.h" 3 | 4 | int main(int argc, char** argv) 5 | { 6 | auto reads = loadFastqFromFile(argv[1]); 7 | std::ofstream output {argv[2]}; 8 | for (size_t i = 0; i < reads.size(); i++) 9 | { 10 | auto reverse = reads[i].reverseComplement(); 11 | output << ">" << reverse.seq_id << "_Reverse" << "\n"; 12 | output << reverse.sequence << "\n"; 13 | } 14 | } -------------------------------------------------------------------------------- /src/SelectLongestAlignment.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "CommonUtils.h" 5 | #include "fastqloader.h" 6 | #include "stream.hpp" 7 | 8 | int main(int argc, char** argv) 9 | { 10 | std::string alnfile { argv[1] }; 11 | std::string outfile { argv[2] }; 12 | 13 | auto alns = CommonUtils::LoadVGAlignments(alnfile); 14 | 15 | std::unordered_map result; 16 | for (auto aln : alns) 17 | { 18 | if (result.count(aln.name()) == 0) result[aln.name()] = aln; 19 | else if (aln.sequence().size() > result[aln.name()].sequence().size()) result[aln.name()] = aln; 20 | else if (aln.sequence().size() == result[aln.name()].sequence().size() && aln.score() < result[aln.name()].score()) result[aln.name()] = aln; 21 | } 22 | 23 | 
std::vector writeAlns; 24 | writeAlns.reserve(result.size()); 25 | for (auto pair : result) 26 | { 27 | writeAlns.push_back(pair.second); 28 | } 29 | 30 | std::ofstream resultFile { outfile, std::ios::out | std::ios::binary }; 31 | stream::write_buffered(resultFile, writeAlns, 0); 32 | } 33 | -------------------------------------------------------------------------------- /src/SupportedSubgraph.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "CommonUtils.h" 5 | #include "vg.pb.h" 6 | #include "stream.hpp" 7 | 8 | 9 | int main(int argc, char** argv) 10 | { 11 | std::string graphFile { argv[1] }; 12 | std::string alnFile { argv[2] }; 13 | std::string outputGraph { argv[3] }; 14 | 15 | vg::Graph graph = CommonUtils::LoadVGGraph(graphFile); 16 | 17 | std::vector alignments; 18 | { 19 | std::ifstream alignmentfile {alnFile, std::ios::in | std::ios::binary}; 20 | std::function lambda = [&alignments](vg::Alignment& g) { 21 | alignments.push_back(g); 22 | }; 23 | stream::for_each(alignmentfile, lambda); 24 | } 25 | 26 | std::map> existingEdges; 27 | for (size_t i = 0; i < graph.edge_size(); i++) 28 | { 29 | existingEdges[graph.edge(i).from()].insert(graph.edge(i).to()); 30 | } 31 | 32 | std::map> supportedEdges; 33 | std::unordered_set supportedNodes; 34 | 35 | for (size_t i = 0; i < alignments.size(); i++) 36 | { 37 | std::cout << "alignment " << alignments[i].name() << std::endl; 38 | for (size_t j = 0; j < alignments[i].path().mapping_size()-1; j++) 39 | { 40 | auto from = alignments[i].path().mapping(j).position().node_id(); 41 | auto to = alignments[i].path().mapping(j+1).position().node_id(); 42 | supportedNodes.insert(from); 43 | supportedNodes.insert(to); 44 | if (existingEdges[from].count(to) == 0 && existingEdges[to].count(from) == 0) 45 | { 46 | std::cout << "nonexistant alignment from " << from << " to " << to << std::endl; 47 | } 48 | supportedEdges[from].insert(to); 49 | 
} 50 | } 51 | 52 | vg::Graph resultGraph; 53 | for (int i = 0 ; i < graph.node_size(); i++) 54 | { 55 | if (supportedNodes.count(graph.node(i).id()) == 0) continue; 56 | auto* node = resultGraph.add_node(); 57 | node->set_sequence(graph.node(i).sequence()); 58 | node->set_id(graph.node(i).id()); 59 | node->set_name(graph.node(i).name()); 60 | } 61 | for (int i = 0; i < graph.edge_size(); i++) 62 | { 63 | auto from = graph.edge(i).from(); 64 | auto to = graph.edge(i).to(); 65 | bool foundForward = supportedEdges[from].count(to) == 1; 66 | auto foundBackward = supportedEdges[to].count(from) == 1; 67 | if (!foundForward && !foundBackward) 68 | { 69 | continue; 70 | } 71 | auto* edge = resultGraph.add_edge(); 72 | edge->set_from(graph.edge(i).from()); 73 | edge->set_to(graph.edge(i).to()); 74 | edge->set_from_start(graph.edge(i).from_start()); 75 | edge->set_to_end(graph.edge(i).to_end()); 76 | edge->set_overlap(graph.edge(i).overlap()); 77 | } 78 | 79 | std::ofstream graphOut { outputGraph, std::ios::out | std::ios::binary }; 80 | std::vector writeVector {resultGraph}; 81 | stream::write_buffered(graphOut, writeVector, 0); 82 | } -------------------------------------------------------------------------------- /src/ThreadReadAssertion.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "ThreadReadAssertion.h" 5 | 6 | namespace ThreadReadAssertion 7 | { 8 | thread_local int currentnodeID; 9 | thread_local bool currentreverse; 10 | thread_local size_t currentseqPos; 11 | thread_local size_t currentmatchLen; 12 | thread_local size_t currentnodeOffset; 13 | thread_local std::string_view currentRead; 14 | void signal(int signal) 15 | { 16 | std::stringstream msg; 17 | msg << "Signal " << signal << ". Read: " << currentRead << ". 
Seed: " << assertGetSeedInfo(); 18 | std::cerr << msg.str() << std::endl; 19 | std::abort(); 20 | } 21 | void setRead(const std::string& readName) 22 | { 23 | currentRead = std::string_view(readName.data(), readName.size()); 24 | } 25 | void setSeed(int nodeID, bool reverse, size_t seqPos, size_t matchLen, size_t nodeOffset) 26 | { 27 | currentnodeID = nodeID; 28 | currentreverse = reverse; 29 | currentseqPos = seqPos; 30 | currentmatchLen = matchLen; 31 | currentnodeOffset = nodeOffset; 32 | } 33 | void assertFailed(const char* expression, const char* file, int line) 34 | { 35 | std::stringstream msg; 36 | msg << file << ":" << line << ": Assertion '" << expression << "' failed. Read: " << currentRead << ". Seed: " << assertGetSeedInfo(); 37 | std::cerr << msg.str() << std::endl; 38 | throw AssertionFailure {}; 39 | } 40 | std::string assertGetSeedInfo() 41 | { 42 | return std::to_string(currentnodeID) + (currentreverse ? "-" : "+") + "," + std::to_string(currentseqPos) + "," + std::to_string(currentmatchLen) + "," + std::to_string(currentnodeOffset); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/ThreadReadAssertion.h: -------------------------------------------------------------------------------- 1 | #ifndef ThreadReadAssertion_h 2 | #define ThreadReadAssertion_h 3 | 4 | #include 5 | 6 | namespace ThreadReadAssertion 7 | { 8 | class AssertionFailure 9 | { 10 | }; 11 | void setRead(const std::string& readName); 12 | void setSeed(int nodeID, bool reverse, size_t seqPos, size_t matchLen, size_t nodeOffset); 13 | void assertFailed(const char* expression, const char* file, int line); 14 | void signal(int signal); 15 | std::string assertGetSeedInfo(); 16 | } 17 | 18 | #endif 19 | 20 | #ifdef assert 21 | #undef assert 22 | #endif 23 | 24 | #ifndef NDEBUG 25 | 26 | //https://stackoverflow.com/questions/9701229/c-assert-implementation-in-assert-h 27 | #define assert(expression) (void)((expression) || 
(ThreadReadAssertion::assertFailed(#expression, __FILE__, __LINE__),0)) 28 | #define assertSetRead(name, nodeid, reverse, seqpos, matchlen, nodeoffset) { ThreadReadAssertion::setRead(name); ThreadReadAssertion::setSeed(nodeid, reverse, seqpos, matchlen, nodeoffset); } 29 | #define assertSetNoRead(name) { ThreadReadAssertion::setRead(name); ThreadReadAssertion::setSeed(0, 0, 0, 0, 0); } 30 | 31 | #else 32 | 33 | #define assert(ignore) ((void)0) 34 | #define assertSetRead(name, nodeid, reverse, seqpos, matchlen, nodeoffset) ((void)0) 35 | #define assertSetNoRead(name) ((void)0) 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/UnitigifyDBG.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "CommonUtils.h" 5 | #include "GfaGraph.h" 6 | 7 | GfaGraph unitigify(const GfaGraph& graph) 8 | { 9 | GfaGraph result; 10 | result.edgeOverlap = graph.edgeOverlap; 11 | std::unordered_map belongsInUnitig; 12 | std::unordered_set nodesHandled; 13 | std::unordered_map unitigLeft; 14 | std::unordered_map unitigRight; 15 | std::vector> nodesInUnitig; 16 | 17 | for (auto node : graph.nodes) 18 | { 19 | if (nodesHandled.count(node.first) == 1) continue; 20 | NodePos left { node.first, false }; 21 | NodePos right { node.first, true }; 22 | bool leftBreaks = true; 23 | bool rightBreaks = true; 24 | if (graph.edges.count(left) == 1 && graph.edges.at(left).size() == 1) 25 | { 26 | auto neighbor = graph.edges.at(left)[0]; 27 | assert(graph.edges.count(neighbor.Reverse()) == 1); 28 | if (graph.edges.at(neighbor.Reverse()).size() == 1) leftBreaks = false; 29 | } 30 | if (graph.edges.count(right) == 1 && graph.edges.at(right).size() == 1) 31 | { 32 | auto neighbor = graph.edges.at(right)[0]; 33 | assert(graph.edges.count(neighbor.Reverse()) == 1); 34 | if (graph.edges.at(neighbor.Reverse()).size() == 1) rightBreaks = false; 35 | } 36 | if (leftBreaks && 
rightBreaks) 37 | { 38 | assert(nodesHandled.count(node.first) == 0); 39 | nodesHandled.insert(node.first); 40 | unitigLeft[nodesInUnitig.size()] = NodePos { node.first, true }; 41 | unitigRight[nodesInUnitig.size()] = NodePos { node.first, true }; 42 | assert(belongsInUnitig.count(right) == 0); 43 | belongsInUnitig[right] = nodesInUnitig.size(); 44 | nodesInUnitig.emplace_back(); 45 | nodesInUnitig.back().emplace_back(node.first, true); 46 | continue; 47 | } 48 | if (!leftBreaks && !rightBreaks) 49 | { 50 | continue; 51 | } 52 | assert((leftBreaks && !rightBreaks) || (rightBreaks && !leftBreaks)); 53 | NodePos start; 54 | int id = nodesInUnitig.size(); 55 | nodesInUnitig.emplace_back(); 56 | start.id = node.first; 57 | start.end = leftBreaks; 58 | assert(belongsInUnitig.count(start) == 0); 59 | assert(belongsInUnitig.count(start.Reverse()) == 0); 60 | assert(nodesHandled.count(start.id) == 0); 61 | unitigLeft[id] = start; 62 | unitigRight[id] = start; 63 | nodesHandled.insert(start.id); 64 | belongsInUnitig[start] = id; 65 | nodesInUnitig.back().push_back(start); 66 | assert(graph.edges.count(start) == 1 && graph.edges.at(start).size() == 1); 67 | while (graph.edges.count(start) == 1 && graph.edges.at(start).size() == 1) 68 | { 69 | start = graph.edges.at(start)[0]; 70 | assert(graph.edges.count(start.Reverse()) == 1); 71 | if (graph.edges.at(start.Reverse()).size() != 1) break; 72 | assert(belongsInUnitig.count(start) == 0); 73 | assert(belongsInUnitig.count(start.Reverse()) == 0); 74 | assert(nodesHandled.count(start.id) == 0); 75 | unitigRight[id] = start; 76 | nodesHandled.insert(start.id); 77 | belongsInUnitig[start] = id; 78 | nodesInUnitig.back().push_back(start); 79 | } 80 | } 81 | //circular separate components 82 | for (auto node : graph.nodes) 83 | { 84 | if (nodesHandled.count(node.first) == 1) continue; 85 | NodePos left { node.first, false }; 86 | NodePos right { node.first, true }; 87 | assert(graph.edges.count(left) == 1 && 
graph.edges.at(left).size() == 1); 88 | assert(graph.edges.count(right) == 1 && graph.edges.at(right).size() == 1); 89 | NodePos start = right; 90 | int id = nodesInUnitig.size(); 91 | nodesInUnitig.emplace_back(); 92 | unitigLeft[id] = start; 93 | unitigRight[id] = start; 94 | do 95 | { 96 | nodesHandled.insert(node.first); 97 | belongsInUnitig[start] = id; 98 | nodesInUnitig.back().push_back(start); 99 | assert(graph.edges.count(start) == 1 && graph.edges.at(start).size() == 1); 100 | assert(graph.edges.count(start.Reverse()) == 1 && graph.edges.at(start.Reverse()).size() == 1); 101 | start = graph.edges.at(start)[0]; 102 | } while (start.id != node.first); 103 | result.edges[NodePos { id, true }].emplace_back(id, true); 104 | } 105 | assert(nodesHandled.size() == graph.nodes.size()); 106 | assert(belongsInUnitig.size() == graph.nodes.size()); 107 | for (size_t i = 0; i < nodesInUnitig.size(); i++) 108 | { 109 | std::string seq; 110 | assert(nodesInUnitig[i].size() > 0); 111 | seq = graph.nodes.at(nodesInUnitig[i][0].id); 112 | if (!nodesInUnitig[i][0].end) seq = CommonUtils::ReverseComplement(seq); 113 | seq = seq.substr(0, graph.edgeOverlap); 114 | for (auto node : nodesInUnitig[i]) 115 | { 116 | std::string add; 117 | add = graph.nodes.at(node.id); 118 | if (!node.end) add = CommonUtils::ReverseComplement(add); 119 | add = add.substr(graph.edgeOverlap); 120 | seq += add; 121 | } 122 | result.nodes[i] = seq; 123 | } 124 | for (auto edge : graph.edges) 125 | { 126 | NodePos src = edge.first; 127 | NodePos from; 128 | assert(belongsInUnitig.count(src) == 1 || belongsInUnitig.count(src.Reverse()) == 1); 129 | if (belongsInUnitig.count(src) == 1) 130 | { 131 | assert(belongsInUnitig.count(src) == 1); 132 | assert(belongsInUnitig.count(src.Reverse()) == 0); 133 | if (unitigRight[belongsInUnitig[src]] != src) continue; 134 | from = NodePos { belongsInUnitig[src], true }; 135 | } 136 | else 137 | { 138 | assert(belongsInUnitig.count(src) == 0); 139 | 
assert(belongsInUnitig.count(src.Reverse()) == 1); 140 | if (unitigLeft[belongsInUnitig[src.Reverse()]] != src.Reverse()) continue; 141 | from = NodePos { belongsInUnitig[src.Reverse()], false }; 142 | } 143 | for (auto dst : edge.second) 144 | { 145 | NodePos to; 146 | assert(belongsInUnitig.count(dst) == 1 || belongsInUnitig.count(dst.Reverse()) == 1); 147 | if (belongsInUnitig.count(dst) == 1) 148 | { 149 | assert(belongsInUnitig.count(dst) == 1); 150 | assert(belongsInUnitig.count(dst.Reverse()) == 0); 151 | if (unitigLeft[belongsInUnitig[dst]] != dst) continue; 152 | to = NodePos { belongsInUnitig[dst], true }; 153 | } 154 | else 155 | { 156 | assert(belongsInUnitig.count(dst) == 0); 157 | assert(belongsInUnitig.count(dst.Reverse()) == 1); 158 | if (unitigRight[belongsInUnitig[dst.Reverse()]] != dst.Reverse()) continue; 159 | to = NodePos { belongsInUnitig[dst.Reverse()], false }; 160 | } 161 | result.edges[from].push_back(to); 162 | } 163 | } 164 | return result; 165 | } 166 | 167 | int main(int argc, char** argv) 168 | { 169 | std::string inputGraph { argv[1] }; 170 | std::string outputGraph { argv[2] }; 171 | 172 | auto graph = GfaGraph::LoadFromFile(inputGraph); 173 | graph.confirmDoublesidedEdges(); 174 | auto result = unitigify(graph); 175 | result.SaveToFile(outputGraph); 176 | } -------------------------------------------------------------------------------- /src/UntipRelative.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "GfaGraph.h" 8 | 9 | std::unordered_map getNodeMapping(const GfaGraph& graph) 10 | { 11 | std::unordered_map result; 12 | for (auto node : graph.nodes) 13 | { 14 | size_t id = result.size(); 15 | result[NodePos { node.first, true }] = id; 16 | id = result.size(); 17 | result[NodePos { node.first, false }] = id; 18 | } 19 | return result; 20 | } 21 | 22 | std::vector getLengths(const std::unordered_map& 
nodeMapping, const GfaGraph& graph) 23 | { 24 | std::vector result; 25 | result.resize(nodeMapping.size(), 0); 26 | for (auto node : graph.nodes) 27 | { 28 | result[nodeMapping.at(NodePos{ node.first, true })] = node.second.size() - graph.edgeOverlap; 29 | result[nodeMapping.at(NodePos{ node.first, false })] = node.second.size() - graph.edgeOverlap; 30 | } 31 | return result; 32 | } 33 | 34 | std::vector> getOutEdges(const std::unordered_map& nodeMapping, const GfaGraph& graph) 35 | { 36 | std::vector> result; 37 | result.resize(nodeMapping.size()); 38 | for (auto edge : graph.edges) 39 | { 40 | NodePos source = edge.first; 41 | NodePos revSource = source.Reverse(); 42 | for (auto target : edge.second) 43 | { 44 | NodePos revTarget = target.Reverse(); 45 | assert(nodeMapping.at(source) < result.size()); 46 | assert(nodeMapping.at(target) < result.size()); 47 | assert(nodeMapping.at(revSource) < result.size()); 48 | assert(nodeMapping.at(revTarget) < result.size()); 49 | result[nodeMapping.at(source)].push_back(nodeMapping.at(target)); 50 | result[nodeMapping.at(revTarget)].push_back(nodeMapping.at(revSource)); 51 | } 52 | } 53 | return result; 54 | } 55 | 56 | std::vector getNodeDepths(const std::vector>& componentNodes, const std::vector& nodeLengths, const std::vector>& edges) 57 | { 58 | std::vector result; 59 | result.resize(nodeLengths.size(), 0); 60 | for (size_t i = componentNodes.size()-1; i < componentNodes.size(); i--) 61 | { 62 | if (componentNodes[i].size() > 1) 63 | { 64 | for (auto node : componentNodes[i]) 65 | { 66 | result[node] = std::numeric_limits::max(); 67 | } 68 | } 69 | else 70 | { 71 | auto node = componentNodes[i][0]; 72 | result[node] = nodeLengths[node]; 73 | for (auto neighbor : edges[node]) 74 | { 75 | if (result[neighbor] == std::numeric_limits::max()) 76 | { 77 | result[node] = std::numeric_limits::max(); 78 | break; 79 | } 80 | if (neighbor == node) 81 | { 82 | result[node] = std::numeric_limits::max(); 83 | break; 84 | } 85 | 
result[node] = std::max(result[node], result[neighbor] + nodeLengths[node]); 86 | } 87 | } 88 | } 89 | return result; 90 | } 91 | 92 | void removeRec(std::vector& keepers, size_t pos, const std::vector>& edges) 93 | { 94 | if (!keepers[pos]) return; 95 | keepers[pos] = false; 96 | for (auto neighbor : edges[pos]) 97 | { 98 | removeRec(keepers, neighbor, edges); 99 | } 100 | } 101 | 102 | std::vector getKeepers(const std::vector& depths, const std::vector>& edges, const size_t maxRemovableLen, const size_t minSafeLen, const double fraction) 103 | { 104 | std::vector result; 105 | result.resize(depths.size(), true); 106 | for (size_t i = 0; i < depths.size(); i++) 107 | { 108 | if (!result[i]) continue; 109 | size_t bigLength = 0; 110 | for (auto neighbor : edges[i]) 111 | { 112 | bigLength = std::max(bigLength, depths[neighbor]); 113 | } 114 | if (bigLength < minSafeLen) continue; 115 | size_t removableLen = bigLength * fraction; 116 | removableLen = std::min(removableLen, maxRemovableLen); 117 | for (auto neighbor : edges[i]) 118 | { 119 | if (depths[neighbor] <= removableLen) 120 | { 121 | removeRec(result, neighbor, edges); 122 | } 123 | } 124 | } 125 | return result; 126 | } 127 | 128 | void strongConnectIterative(size_t node, size_t& i, std::vector& index, std::vector& lowlink, std::vector& onStack, std::vector& S, std::vector>& result, const std::vector>& edges) 129 | { 130 | std::vector> stack; 131 | stack.emplace_back(0, node, 0); 132 | while (stack.size() > 0) 133 | { 134 | auto top = stack.back(); 135 | size_t node = std::get<1>(top); 136 | size_t neighborI = std::get<2>(top); 137 | stack.pop_back(); 138 | switch(std::get<0>(top)) 139 | { 140 | case 0: 141 | assert(!onStack[node]); 142 | assert(index[node] == -1); 143 | assert(lowlink[node] == -1); 144 | index[node] = i; 145 | lowlink[node] = i; 146 | i++; 147 | S.push_back(node); 148 | onStack[node] = true; 149 | START_LOOP: 150 | case 1: 151 | if (neighborI < edges[node].size()) 152 | { 153 | auto 
neighbor = edges[node][neighborI]; 154 | if (index[neighbor] == -1) 155 | { 156 | stack.emplace_back(2, node, neighborI); 157 | stack.emplace_back(0, edges[node][neighborI], 0); 158 | continue; 159 | } 160 | else if (onStack[neighbor]) 161 | { 162 | assert(index[neighbor] != -1); 163 | lowlink[node] = std::min(lowlink[node], index[neighbor]); 164 | } 165 | neighborI++; 166 | } 167 | if (neighborI < edges[node].size()) goto START_LOOP; 168 | goto END_LOOP; 169 | case 2: 170 | { 171 | auto neighbor = edges[node][neighborI]; 172 | assert(lowlink[neighbor] != -1); 173 | lowlink[node] = std::min(lowlink[node], lowlink[neighbor]); 174 | neighborI++; 175 | goto START_LOOP; 176 | } 177 | END_LOOP: 178 | case 3: 179 | assert(lowlink[node] != -1); 180 | assert(index[node] != -1); 181 | if (lowlink[node] == index[node]) 182 | { 183 | result.emplace_back(); 184 | size_t stacknode; 185 | do 186 | { 187 | assert(S.size() > 0); 188 | stacknode = S.back(); 189 | S.pop_back(); 190 | assert(onStack[stacknode]); 191 | onStack[stacknode] = false; 192 | result.back().push_back(stacknode); 193 | } while (stacknode != node); 194 | } 195 | } 196 | } 197 | } 198 | 199 | std::vector> topologicalSort(const std::vector>& edges) 200 | { 201 | std::vector index; 202 | std::vector lowlink; 203 | std::vector onStack; 204 | index.resize(edges.size(), -1); 205 | lowlink.resize(edges.size(), -1); 206 | onStack.resize(edges.size(), false); 207 | std::vector S; 208 | std::vector> result; 209 | size_t i = 0; 210 | for (size_t node = 0; node < edges.size(); node++) 211 | { 212 | if (index[node] == -1) strongConnectIterative(node, i, index, lowlink, onStack, S, result, edges); 213 | assert(S.size() == 0); 214 | } 215 | assert(i == edges.size()); 216 | std::reverse(result.begin(), result.end()); 217 | std::vector belongsToComponent; 218 | belongsToComponent.resize(edges.size(), -1); 219 | for (size_t i = 0; i < result.size(); i++) 220 | { 221 | for (auto node : result[i]) 222 | { 223 | 
belongsToComponent[node] = i; 224 | } 225 | } 226 | for (size_t i = 0; i < edges.size(); i++) 227 | { 228 | assert(belongsToComponent[i] != -1); 229 | for (auto edge : edges[i]) 230 | { 231 | assert(belongsToComponent[edge] != -1); 232 | assert(belongsToComponent[edge] >= belongsToComponent[i]); 233 | } 234 | } 235 | return result; 236 | } 237 | 238 | std::unordered_set filterNodes(const GfaGraph& graph, const int maxRemovableLen, const int minSafeLen, const double fraction) 239 | { 240 | auto nodeMapping = getNodeMapping(graph); 241 | auto lengths = getLengths(nodeMapping, graph); 242 | auto edges = getOutEdges(nodeMapping, graph); 243 | auto order = topologicalSort(edges); 244 | auto depths = getNodeDepths(order, lengths, edges); 245 | auto keepers = getKeepers(depths, edges, maxRemovableLen, minSafeLen, fraction); 246 | std::unordered_set result; 247 | for (auto node : graph.nodes) 248 | { 249 | if (keepers[nodeMapping[NodePos { node.first, true }]] && keepers[nodeMapping[NodePos { node.first, false }]]) 250 | { 251 | result.emplace(node.first); 252 | } 253 | } 254 | return result; 255 | } 256 | 257 | int main(int argc, char** argv) 258 | { 259 | int maxRemovableLen = std::stoi(argv[1]); 260 | int minSafeLen = std::stoi(argv[2]); 261 | double fraction = std::stod(argv[3]); 262 | auto graph = GfaGraph::LoadFromStream(std::cin); 263 | //write to cout 264 | 265 | auto keptNodes = filterNodes(graph, maxRemovableLen, minSafeLen, fraction); 266 | auto filteredGraph = graph.GetSubgraph(keptNodes); 267 | filteredGraph.SaveToStream(std::cout); 268 | } -------------------------------------------------------------------------------- /src/VisualizeAlignment.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "GfaGraph.h" 6 | #include "AlignmentCorrectnessEstimation.h" 7 | #include "CommonUtils.h" 8 | #include "GraphAlignerWrapper.h" 9 | 10 | void pad(std::string& str, size_t 
size) 11 | { 12 | assert(str.size() <= size); 13 | while (str.size() < size) 14 | { 15 | str += " "; 16 | } 17 | } 18 | 19 | std::vector loadTrace(std::string filename) 20 | { 21 | std::ifstream file { filename }; 22 | std::vector result; 23 | while (file.good()) 24 | { 25 | int nodeid, offset, reverse, readpos, type; 26 | char graphChar, readChar; 27 | file >> nodeid >> offset >> reverse >> readpos >> type >> graphChar >> readChar; 28 | if (!file.good()) break; 29 | result.emplace_back(); 30 | result.back().nodeID = nodeid; 31 | result.back().offset = offset; 32 | result.back().reverse = reverse == 1; 33 | result.back().readpos = readpos; 34 | result.back().type = (AlignmentResult::TraceMatchType)type; 35 | result.back().graphChar = graphChar; 36 | result.back().readChar = readChar; 37 | } 38 | return result; 39 | } 40 | 41 | std::string getCharwiseCorrectInfo(const std::vector& trace) 42 | { 43 | std::string result; 44 | AlignmentCorrectnessEstimationState charwiseCorrect; 45 | std::vector charwiseCorrectCorrectTrace; 46 | std::vector charwiseCorrectFalseTrace; 47 | for (size_t i = 0; i < trace.size(); i++) 48 | { 49 | auto type = trace[i].type; 50 | if (type == AlignmentResult::TraceMatchType::MATCH) 51 | { 52 | charwiseCorrect = charwiseCorrect.NextState(0, 1); 53 | charwiseCorrectCorrectTrace.push_back(charwiseCorrect.CorrectFromCorrect()); 54 | charwiseCorrectFalseTrace.push_back(charwiseCorrect.FalseFromCorrect()); 55 | } 56 | else if (type == AlignmentResult::TraceMatchType::FORWARDBACKWARDSPLIT) 57 | { 58 | bool oldCorrect = charwiseCorrect.CurrentlyCorrect(); 59 | charwiseCorrect = AlignmentCorrectnessEstimationState {}; 60 | charwiseCorrectCorrectTrace.push_back(oldCorrect); 61 | charwiseCorrectFalseTrace.push_back(oldCorrect); 62 | } 63 | else 64 | { 65 | charwiseCorrect = charwiseCorrect.NextState(1, 1); 66 | charwiseCorrectCorrectTrace.push_back(charwiseCorrect.CorrectFromCorrect()); 67 | 
charwiseCorrectFalseTrace.push_back(charwiseCorrect.FalseFromCorrect()); 68 | } 69 | } 70 | bool charwiseCurrentlyCorrect = charwiseCorrect.CurrentlyCorrect(); 71 | std::string charwiseCorrectInfo = ""; 72 | for (size_t i = charwiseCorrectCorrectTrace.size()-1; i < charwiseCorrectCorrectTrace.size(); i--) 73 | { 74 | if (charwiseCurrentlyCorrect) 75 | { 76 | charwiseCorrectInfo += "#"; 77 | charwiseCurrentlyCorrect = charwiseCorrectCorrectTrace[i]; 78 | } 79 | else 80 | { 81 | charwiseCorrectInfo += " "; 82 | charwiseCurrentlyCorrect = charwiseCorrectFalseTrace[i]; 83 | } 84 | } 85 | std::reverse(charwiseCorrectInfo.begin(), charwiseCorrectInfo.end()); 86 | return charwiseCorrectInfo; 87 | } 88 | 89 | std::string getSlicewiseCorrectInfo(const std::vector& trace) 90 | { 91 | int readcharsUntilSlicewiseCheck = 64; 92 | int mismatches = 0; 93 | AlignmentCorrectnessEstimationState slicewiseCorrect; 94 | std::string slicewiseCorrectInfo; 95 | for (size_t i = 0; i < trace.size(); i++) 96 | { 97 | switch(trace[i].type) 98 | { 99 | case AlignmentResult::TraceMatchType::MATCH: 100 | readcharsUntilSlicewiseCheck--; 101 | break; 102 | case AlignmentResult::TraceMatchType::MISMATCH: 103 | mismatches++; 104 | readcharsUntilSlicewiseCheck--; 105 | break; 106 | case AlignmentResult::TraceMatchType::INSERTION: 107 | mismatches++; 108 | readcharsUntilSlicewiseCheck--; 109 | break; 110 | case AlignmentResult::TraceMatchType::DELETION: 111 | mismatches++; 112 | break; 113 | case AlignmentResult::TraceMatchType::FORWARDBACKWARDSPLIT: 114 | break; 115 | } 116 | if (readcharsUntilSlicewiseCheck == 0) 117 | { 118 | slicewiseCorrect = slicewiseCorrect.NextState(mismatches, 64); 119 | char addchar = slicewiseCorrect.CurrentlyCorrect() ? 
'#' : ' '; 120 | for (int i = 0; i < 64; i++) 121 | { 122 | slicewiseCorrectInfo += addchar; 123 | } 124 | mismatches = 0; 125 | readcharsUntilSlicewiseCheck = 64; 126 | } 127 | } 128 | pad(slicewiseCorrectInfo, trace.size()); 129 | return slicewiseCorrectInfo; 130 | } 131 | 132 | int main(int argc, char** argv) 133 | { 134 | std::string tracefile { argv[1] }; 135 | 136 | std::vector trace = loadTrace(tracefile); 137 | 138 | std::string graphinfo; 139 | std::string graphpath; 140 | std::string alignmentinfo; 141 | std::string readinfo; 142 | std::string readpath; 143 | int oldNodeId = trace[0].nodeID; 144 | bool oldReverse = trace[0].reverse; 145 | int oldReadPos = trace[0].readpos; 146 | size_t splitIndex = 0; 147 | for (int i = 0; i < trace.size(); i++) 148 | { 149 | auto type = trace[i].type; 150 | char readChar = trace[i].readChar; 151 | char graphChar = trace[i].graphChar; 152 | if (i == 0) 153 | { 154 | graphinfo += "v"; 155 | readinfo += "^"; 156 | } 157 | if ((i > 0 && (trace[i].nodeID != trace[i-1].nodeID)) || type == AlignmentResult::TraceMatchType::FORWARDBACKWARDSPLIT) 158 | { 159 | int nodeidInfoLength = std::to_string(oldNodeId).size() + 1; 160 | if (i > graphinfo.size() + nodeidInfoLength) 161 | { 162 | graphinfo += std::to_string(oldNodeId); 163 | if (oldReverse) graphinfo += "-"; else graphinfo += "+"; 164 | } 165 | int readSizeInfoLength = std::to_string(oldReadPos).size(); 166 | if (i > readinfo.size() + readSizeInfoLength) 167 | { 168 | readinfo += std::to_string(oldReadPos); 169 | } 170 | pad(graphinfo, i); 171 | pad(readinfo, i); 172 | graphinfo += "v"; 173 | readinfo += "^"; 174 | oldNodeId = trace[i].nodeID; 175 | oldReverse = trace[i].reverse; 176 | oldReadPos = trace[i].readpos; 177 | } 178 | 179 | switch(type) 180 | { 181 | case AlignmentResult::TraceMatchType::MATCH: 182 | graphpath += graphChar; 183 | readpath += readChar; 184 | alignmentinfo += "|"; 185 | assert(graphChar == readChar); 186 | break; 187 | case 
AlignmentResult::TraceMatchType::MISMATCH: 188 | graphpath += graphChar; 189 | readpath += readChar; 190 | alignmentinfo += " "; 191 | assert(graphChar != readChar); 192 | break; 193 | case AlignmentResult::TraceMatchType::INSERTION: 194 | graphpath += ' '; 195 | readpath += readChar; 196 | alignmentinfo += " "; 197 | break; 198 | case AlignmentResult::TraceMatchType::DELETION: 199 | graphpath += graphChar; 200 | alignmentinfo += " "; 201 | readpath += ' '; 202 | break; 203 | case AlignmentResult::TraceMatchType::FORWARDBACKWARDSPLIT: 204 | graphpath += graphChar; 205 | readpath += readChar; 206 | alignmentinfo += graphChar == readChar ? '|' : ' '; 207 | splitIndex = i; 208 | break; 209 | } 210 | } 211 | 212 | std::cerr << "splitIndex " << splitIndex << std::endl; 213 | 214 | std::string charwiseCorrectInfo; 215 | std::string slicewiseCorrectInfo; 216 | { 217 | std::vector backwards; 218 | backwards.insert(backwards.end(), trace.begin(), trace.begin()+splitIndex); 219 | std::reverse(backwards.begin(), backwards.end()); 220 | auto bw = getCharwiseCorrectInfo(backwards); 221 | auto slicewise = getSlicewiseCorrectInfo(backwards); 222 | std::reverse(bw.begin(), bw.end()); 223 | std::reverse(slicewise.begin(), slicewise.end()); 224 | charwiseCorrectInfo += bw; 225 | slicewiseCorrectInfo += slicewise; 226 | } 227 | { 228 | std::vector forwards; 229 | forwards.insert(forwards.end(), trace.begin()+splitIndex, trace.end()); 230 | slicewiseCorrectInfo += getSlicewiseCorrectInfo(forwards); 231 | charwiseCorrectInfo += getCharwiseCorrectInfo(forwards); 232 | } 233 | std::cout << " " << graphinfo << std::endl; 234 | std::cout << "GRAPH: " << graphpath << std::endl; 235 | std::cout << " " << alignmentinfo << std::endl; 236 | std::cout << "READ: " << readpath << std::endl; 237 | std::cout << " " << readinfo << std::endl; 238 | std::cout << " " << charwiseCorrectInfo << std::endl; 239 | std::cout << " " << slicewiseCorrectInfo << std::endl; 240 | } 
-------------------------------------------------------------------------------- /src/fastqloader.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "fastqloader.h" 4 | #include "CommonUtils.h" 5 | 6 | std::vector loadFastqFromFile(std::string filename, bool includeQuality) 7 | { 8 | std::vector result; 9 | FastQ::streamFastqFromFile(filename, includeQuality, [&result](FastQ& fq) { 10 | result.emplace_back(std::move(fq)); 11 | }); 12 | return result; 13 | } 14 | 15 | FastQ FastQ::reverseComplement() const 16 | { 17 | FastQ result; 18 | result.sequence = CommonUtils::ReverseComplement(sequence); 19 | result.seq_id = seq_id; 20 | result.quality = quality; 21 | std::reverse(result.quality.begin(), result.quality.end()); 22 | return result; 23 | } 24 | -------------------------------------------------------------------------------- /src/fastqloader.h: -------------------------------------------------------------------------------- 1 | #ifndef FastqLoader_H 2 | #define FastqLoader_H 3 | 4 | #include 5 | #include 6 | #include //https://github.com/mateidavid/zstr 7 | 8 | class FastQ { 9 | public: 10 | template 11 | static void streamFastqFastqFromStream(std::istream& file, bool includeQuality, F f) 12 | { 13 | do 14 | { 15 | std::string line; 16 | std::getline(file, line); 17 | if (!file.good()) break; 18 | if (line.size() == 0) continue; 19 | if (line[0] != '@') continue; 20 | FastQ newread; 21 | if (line.back() == '\r') line.pop_back(); 22 | newread.seq_id = line.substr(1); 23 | std::getline(file, line); 24 | if (line.back() == '\r') line.pop_back(); 25 | newread.sequence = line; 26 | std::getline(file, line); 27 | std::getline(file, line); 28 | if (line.back() == '\r') line.pop_back(); 29 | if (includeQuality) newread.quality = line; 30 | f(newread); 31 | } while (file.good()); 32 | } 33 | template 34 | static void streamFastqFastaFromStream(std::istream& file, bool includeQuality, F f) 35 | { 36 
| std::string line; 37 | std::getline(file, line); 38 | do 39 | { 40 | if (line.size() == 0) 41 | { 42 | std::getline(file, line); 43 | continue; 44 | } 45 | if (line[0] != '>') 46 | { 47 | std::getline(file, line); 48 | continue; 49 | } 50 | FastQ newread; 51 | if (line.back() == '\r') line.pop_back(); 52 | newread.seq_id = line.substr(1); 53 | newread.sequence = ""; 54 | do 55 | { 56 | std::getline(file, line); 57 | if (!file.good()) break; 58 | if (line.size() == 0) continue; 59 | if (line[0] == '>') break; 60 | if (line.back() == '\r') line.pop_back(); 61 | newread.sequence += line; 62 | } while (file.good()); 63 | if (includeQuality) 64 | { 65 | for (size_t i = 0; i < newread.sequence.size(); i++) 66 | { 67 | newread.quality += '!'; 68 | } 69 | } 70 | f(newread); 71 | } while (file.good()); 72 | } 73 | template 74 | static void streamFastqFastqFromFile(std::string filename, bool includeQuality, F f) 75 | { 76 | std::ifstream file {filename}; 77 | streamFastqFastqFromStream(file, includeQuality, f); 78 | } 79 | template 80 | static void streamFastqFastaFromFile(std::string filename, bool includeQuality, F f) 81 | { 82 | std::ifstream file {filename}; 83 | streamFastqFastaFromStream(file, includeQuality, f); 84 | } 85 | template 86 | static void streamFastqFastqFromGzippedFile(std::string filename, bool includeQuality, F f) 87 | { 88 | zstr::ifstream file { filename }; 89 | streamFastqFastqFromStream(file, includeQuality, f); 90 | } 91 | template 92 | static void streamFastqFastaFromGzippedFile(std::string filename, bool includeQuality, F f) 93 | { 94 | zstr::ifstream file { filename }; 95 | streamFastqFastaFromStream(file, includeQuality, f); 96 | } 97 | template 98 | static void streamFastqFromFile(std::string filename, bool includeQuality, F f) 99 | { 100 | bool gzipped = false; 101 | std::string originalFilename = filename; 102 | if (filename.size() > 3 && filename.substr(filename.size()-3) == ".gz") 103 | { 104 | gzipped = true; 105 | filename = 
filename.substr(0, filename.size()-3); 106 | } 107 | bool fastq = false; 108 | bool fasta = false; 109 | if (filename.size() > 6 && filename.substr(filename.size()-6) == ".fastq") fastq = true; 110 | if (filename.size() > 3 && filename.substr(filename.size()-3) == ".fq") fastq = true; 111 | if (filename.size() > 6 && filename.substr(filename.size()-6) == ".fasta") fasta = true; 112 | if (filename.size() > 3 && filename.substr(filename.size()-3) == ".fa") fasta = true; 113 | if (fasta) 114 | { 115 | if (gzipped) 116 | { 117 | streamFastqFastaFromGzippedFile(originalFilename, includeQuality, f); 118 | return; 119 | } 120 | else 121 | { 122 | streamFastqFastaFromFile(originalFilename, includeQuality, f); 123 | return; 124 | } 125 | } 126 | if (fastq) 127 | { 128 | if (gzipped) 129 | { 130 | streamFastqFastqFromGzippedFile(originalFilename, includeQuality, f); 131 | return; 132 | } 133 | else 134 | { 135 | streamFastqFastqFromFile(originalFilename, includeQuality, f); 136 | return; 137 | } 138 | } 139 | } 140 | FastQ reverseComplement() const; 141 | std::string seq_id; 142 | std::string sequence; 143 | std::string quality; 144 | }; 145 | 146 | std::vector loadFastqFromFile(std::string filename, bool includeQuality = true); 147 | 148 | #endif 149 | -------------------------------------------------------------------------------- /src/stream.hpp: -------------------------------------------------------------------------------- 1 | #ifndef STREAM_H 2 | #define STREAM_H 3 | 4 | // from http://www.mail-archive.com/protobuf@googlegroups.com/msg03417.html 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "google/protobuf/stubs/common.h" 13 | #include "google/protobuf/io/zero_copy_stream.h" 14 | #include "google/protobuf/io/zero_copy_stream_impl.h" 15 | #include "google/protobuf/io/gzip_stream.h" 16 | #include "google/protobuf/io/coded_stream.h" 17 | 18 | namespace stream { 19 | 20 | // write objects 21 | // count should be equal 
to the number of objects to write 22 | // but if it is 0, it is not written 23 | // if not all objects are written, return false, otherwise true 24 | template 25 | bool write(std::ostream& out, uint64_t count, std::function& lambda) { 26 | 27 | ::google::protobuf::io::ZeroCopyOutputStream *raw_out = 28 | new ::google::protobuf::io::OstreamOutputStream(&out); 29 | ::google::protobuf::io::GzipOutputStream *gzip_out = 30 | new ::google::protobuf::io::GzipOutputStream(raw_out); 31 | ::google::protobuf::io::CodedOutputStream *coded_out = 32 | new ::google::protobuf::io::CodedOutputStream(gzip_out); 33 | 34 | // prefix the chunk with the number of objects 35 | coded_out->WriteVarint64(count); 36 | 37 | std::string s; 38 | uint64_t written = 0; 39 | for (uint64_t n = 0; n < count; ++n, ++written) { 40 | lambda(n).SerializeToString(&s); 41 | // and prefix each object with its size 42 | coded_out->WriteVarint32(s.size()); 43 | coded_out->WriteRaw(s.data(), s.size()); 44 | } 45 | 46 | delete coded_out; 47 | delete gzip_out; 48 | delete raw_out; 49 | 50 | return !count || written == count; 51 | } 52 | 53 | template 54 | bool write_buffered(std::ostream& out, std::vector& buffer, uint64_t buffer_limit) { 55 | bool wrote = false; 56 | if (buffer.size() >= buffer_limit) { 57 | std::function lambda = [&buffer](uint64_t n) { return buffer.at(n); }; 58 | #pragma omp critical (stream_out) 59 | wrote = write(out, buffer.size(), lambda); 60 | buffer.clear(); 61 | } 62 | return wrote; 63 | } 64 | 65 | template 66 | bool write_buffered_ptr(std::ostream& out, std::vector& buffer, uint64_t buffer_limit) { 67 | bool wrote = false; 68 | if (buffer.size() >= buffer_limit) { 69 | std::function lambda = [&buffer](uint64_t n) { return *buffer.at(n); }; 70 | #pragma omp critical (stream_out) 71 | wrote = write(out, buffer.size(), lambda); 72 | buffer.clear(); 73 | } 74 | return wrote; 75 | } 76 | 77 | // deserialize the input stream into the objects 78 | // count containts the count read 79 | // 
takes a callback function to be called on the objects 80 | 81 | template 82 | bool for_each(std::istream& in, 83 | std::function& lambda, 84 | std::function& handle_count) { 85 | 86 | ::google::protobuf::io::ZeroCopyInputStream *raw_in = 87 | new ::google::protobuf::io::IstreamInputStream(&in); 88 | ::google::protobuf::io::GzipInputStream *gzip_in = 89 | new ::google::protobuf::io::GzipInputStream(raw_in); 90 | ::google::protobuf::io::CodedInputStream *coded_in = 91 | new ::google::protobuf::io::CodedInputStream(gzip_in); 92 | 93 | uint64_t count; 94 | coded_in->ReadVarint64((::google::protobuf::uint64*) &count); 95 | // this loop handles a chunked file with many pieces 96 | // such as we might write in a multithreaded process 97 | if (!count) return !count; 98 | do { 99 | 100 | handle_count(count); 101 | 102 | std::string s; 103 | for (uint64_t i = 0; i < count; ++i) { 104 | uint32_t msgSize = 0; 105 | delete coded_in; 106 | coded_in = new ::google::protobuf::io::CodedInputStream(gzip_in); 107 | // the messages are prefixed by their size 108 | coded_in->ReadVarint32(&msgSize); 109 | if ((msgSize > 0) && 110 | (coded_in->ReadString(&s, msgSize))) { 111 | T object; 112 | object.ParseFromString(s); 113 | lambda(object); 114 | } 115 | } 116 | } while (coded_in->ReadVarint64((::google::protobuf::uint64*) &count)); 117 | 118 | delete coded_in; 119 | delete gzip_in; 120 | delete raw_in; 121 | 122 | return !count; 123 | } 124 | 125 | template 126 | bool for_each(std::istream& in, 127 | std::function& lambda) { 128 | std::function noop = [](uint64_t) { }; 129 | return for_each(in, lambda, noop); 130 | } 131 | 132 | template 133 | bool for_each_parallel(std::istream& in, 134 | std::function& lambda, 135 | std::function& handle_count) { 136 | 137 | ::google::protobuf::io::ZeroCopyInputStream *raw_in = 138 | new ::google::protobuf::io::IstreamInputStream(&in); 139 | ::google::protobuf::io::GzipInputStream *gzip_in = 140 | new ::google::protobuf::io::GzipInputStream(raw_in); 
141 | ::google::protobuf::io::CodedInputStream *coded_in = 142 | new ::google::protobuf::io::CodedInputStream(gzip_in); 143 | 144 | uint64_t count; 145 | bool more_input = coded_in->ReadVarint64((::google::protobuf::uint64*) &count); 146 | bool more_objects = false; 147 | // this loop handles a chunked file with many pieces 148 | // such as we might write in a multithreaded process 149 | std::list objects; 150 | int64_t object_count = 0; 151 | int64_t read_threshold = 5000; 152 | if (!count) return !count; 153 | #pragma omp parallel shared(more_input, more_objects, objects, count, in, lambda, handle_count, raw_in, gzip_in, coded_in) 154 | do { 155 | 156 | bool has_object = false; 157 | T object; 158 | #pragma omp critical (objects) 159 | { 160 | if (!objects.empty()) { 161 | object = objects.back(); 162 | objects.pop_back(); 163 | --object_count; 164 | has_object = true; 165 | } 166 | } 167 | if (has_object) { 168 | lambda(object); 169 | } 170 | 171 | #pragma omp master 172 | { 173 | while (more_input && object_count < read_threshold) { 174 | handle_count(count); 175 | std::string s; 176 | for (uint64_t i = 0; i < count; ++i) { 177 | uint32_t msgSize = 0; 178 | // the messages are prefixed by their size 179 | delete coded_in; 180 | coded_in = new ::google::protobuf::io::CodedInputStream(gzip_in); 181 | coded_in->ReadVarint32(&msgSize); 182 | if ((msgSize > 0) && 183 | (coded_in->ReadString(&s, msgSize))) { 184 | T object; 185 | object.ParseFromString(s); 186 | #pragma omp critical (objects) 187 | { 188 | objects.push_front(object); 189 | ++object_count; 190 | } 191 | } 192 | } 193 | more_input = coded_in->ReadVarint64((::google::protobuf::uint64*) &count); 194 | } 195 | } 196 | #pragma omp critical (objects) 197 | more_objects = (object_count > 0); 198 | 199 | } while (more_input || more_objects); 200 | 201 | delete coded_in; 202 | delete gzip_in; 203 | delete raw_in; 204 | 205 | return !count; 206 | } 207 | 208 | template 209 | bool for_each_parallel(std::istream& 
in, 210 | std::function& lambda) { 211 | std::function noop = [](uint64_t) { }; 212 | return for_each_parallel(in, lambda, noop); 213 | } 214 | 215 | } 216 | 217 | #endif 218 | -------------------------------------------------------------------------------- /test/graph.gfa: -------------------------------------------------------------------------------- 1 | S 1 ACGTCATGCAGTCGTAACGTAGTCGTCACAGTCAGTCGTAGCTA 2 | S 2 A 3 | S 3 T 4 | S 4 GTAGCGTCAGTCAGTCAGTCGTAGCGTAACGTCGTAGTCAGT 5 | L 1 + 2 + 0M 6 | L 1 + 3 + 0M 7 | L 2 + 4 + 0M 8 | L 3 + 4 + 0M 9 | -------------------------------------------------------------------------------- /test/read.fa: -------------------------------------------------------------------------------- 1 | >read 2 | TCATCCACGTCGTAACGTAGTCGTCACAGTCAGTCGTAGCTAAGTACGTCAAGTCAGACAGTCGTAGCGTA 3 | --------------------------------------------------------------------------------