├── .gitignore
├── .gitmodules
├── .vscode
    └── settings.json
├── CondaEnvironment.yml
├── LICENSE.md
├── README.md
├── Snakemakes
    └── ErrorCorrect
    │   ├── README.md
    │   ├── Snakefile
    │   └── config.yaml
├── edlib
    ├── include
    │   └── edlib.h
    └── src
    │   └── edlib.cpp
├── makefile
├── scripts
    ├── 10fold_test
    │   ├── gen_test.py
    │   ├── sos.py
    │   └── vg_pb2.py
    ├── summary.py
    ├── test.sh
    └── vg_pb2.py
├── src
    ├── Aligner.cpp
    ├── Aligner.h
    ├── AlignerMain.cpp
    ├── AlignmentCorrectnessEstimation.cpp
    ├── AlignmentCorrectnessEstimation.h
    ├── AlignmentGraph.cpp
    ├── AlignmentGraph.h
    ├── AlignmentSelection.cpp
    ├── AlignmentSelection.h
    ├── AlignmentSubsequenceIdentity.cpp
    ├── ArrayPriorityQueue.h
    ├── BigraphToDigraph.cpp
    ├── BigraphToDigraph.h
    ├── BruteForceExactPrefixSeeds.cpp
    ├── CommonUtils.cpp
    ├── CommonUtils.h
    ├── ComponentPriorityQueue.h
    ├── DijkstraQueue.h
    ├── EValue.cpp
    ├── EValue.h
    ├── EstimateRepeatCount.cpp
    ├── ExtractCorrectedReads.cpp
    ├── ExtractExactPathSubgraph.cpp
    ├── ExtractPathSequence.cpp
    ├── ExtractPathSubgraphNeighbourhood.cpp
    ├── FusionFinder.cpp
    ├── GfaGraph.cpp
    ├── GfaGraph.h
    ├── GraphAligner.h
    ├── GraphAlignerBitvectorBanded.h
    ├── GraphAlignerBitvectorCommon.h
    ├── GraphAlignerBitvectorDijkstra.h
    ├── GraphAlignerCommon.h
    ├── GraphAlignerGAFAlignment.h
    ├── GraphAlignerVGAlignment.h
    ├── GraphAlignerWrapper.cpp
    ├── GraphAlignerWrapper.h
    ├── MafToAlignment.cpp
    ├── MinimizerSeeder.cpp
    ├── MinimizerSeeder.h
    ├── MummerSeeder.cpp
    ├── MummerSeeder.h
    ├── NodePosCsv.cpp
    ├── NodeSlice.h
    ├── PickAdjacentAlnPairs.cpp
    ├── PickMummerSeeds.cpp
    ├── Postprocess.cpp
    ├── ReadCorrection.cpp
    ├── ReadCorrection.h
    ├── ReverseReads.cpp
    ├── SelectLongestAlignment.cpp
    ├── SimulateReads.cpp
    ├── SupportedSubgraph.cpp
    ├── ThreadReadAssertion.cpp
    ├── ThreadReadAssertion.h
    ├── UnitigifyDBG.cpp
    ├── UntipRelative.cpp
    ├── VisualizeAlignment.cpp
    ├── WordSlice.h
    ├── fastqloader.cpp
    ├── fastqloader.h
    ├── stream.hpp
    └── vg.proto
└── test
    ├── graph.gfa
    └── read.fa


/.gitignore:
--------------------------------------------------------------------------------
1 | obj/*
2 | bin/*
3 | src/vg.pb.cc
4 | src/vg.pb.h


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "zstr"]
 2 | 	path = zstr
 3 | 	url = https://github.com/mateidavid/zstr.git
 4 | [submodule "concurrentqueue"]
 5 | 	path = concurrentqueue
 6 | 	url = https://github.com/cameron314/concurrentqueue.git
 7 | [submodule "parallel-hashmap"]
 8 | 	path = parallel-hashmap
 9 | 	url = https://github.com/greg7mdp/parallel-hashmap.git
10 | [submodule "BBHash"]
11 | 	path = BBHash
12 | 	url = https://github.com/maickrau/BBHash.git
13 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "files.associations": {
 3 |         "*.geojson": "json",
 4 |         "vector": "cpp",
 5 |         "__split_buffer": "cpp",
 6 |         "__tree": "cpp",
 7 |         "algorithm": "cpp",
 8 |         "array": "cpp",
 9 |         "deque": "cpp",
10 |         "list": "cpp",
11 |         "locale": "cpp",
12 |         "queue": "cpp",
13 |         "random": "cpp",
14 |         "regex": "cpp",
15 |         "stack": "cpp",
16 |         "string": "cpp",
17 |         "string_view": "cpp",
18 |         "type_traits": "cpp",
19 |         "typeinfo": "cpp",
20 |         "__bit_reference": "cpp",
21 |         "__config": "cpp",
22 |         "__debug": "cpp",
23 |         "__errc": "cpp",
24 |         "__functional_base": "cpp",
25 |         "__hash_table": "cpp",
26 |         "__locale": "cpp",
27 |         "__mutex_base": "cpp",
28 |         "__node_handle": "cpp",
29 |         "__nullptr": "cpp",
30 |         "__string": "cpp",
31 |         "__threading_support": "cpp",
32 |         "__tuple": "cpp",
33 |         "any": "cpp",
34 |         "atomic": "cpp",
35 |         "bit": "cpp",
36 |         "bitset": "cpp",
37 |         "cctype": "cpp",
38 |         "chrono": "cpp",
39 |         "cinttypes": "cpp",
40 |         "cmath": "cpp",
41 |         "complex": "cpp",
42 |         "condition_variable": "cpp",
43 |         "csignal": "cpp",
44 |         "cstdarg": "cpp",
45 |         "cstddef": "cpp",
46 |         "cstdint": "cpp",
47 |         "cstdio": "cpp",
48 |         "cstdlib": "cpp",
49 |         "cstring": "cpp",
50 |         "ctime": "cpp",
51 |         "cwchar": "cpp",
52 |         "cwctype": "cpp",
53 |         "exception": "cpp",
54 |         "forward_list": "cpp",
55 |         "fstream": "cpp",
56 |         "functional": "cpp",
57 |         "future": "cpp",
58 |         "initializer_list": "cpp",
59 |         "iomanip": "cpp",
60 |         "ios": "cpp",
61 |         "iosfwd": "cpp",
62 |         "iostream": "cpp",
63 |         "istream": "cpp",
64 |         "iterator": "cpp",
65 |         "limits": "cpp",
66 |         "map": "cpp",
67 |         "memory": "cpp",
68 |         "mutex": "cpp",
69 |         "new": "cpp",
70 |         "numeric": "cpp",
71 |         "optional": "cpp",
72 |         "ostream": "cpp",
73 |         "ratio": "cpp",
74 |         "scoped_allocator": "cpp",
75 |         "set": "cpp",
76 |         "shared_mutex": "cpp",
77 |         "sstream": "cpp",
78 |         "stdexcept": "cpp",
79 |         "streambuf": "cpp",
80 |         "system_error": "cpp",
81 |         "thread": "cpp",
82 |         "tuple": "cpp",
83 |         "unordered_map": "cpp",
84 |         "unordered_set": "cpp",
85 |         "utility": "cpp",
86 |         "filesystem": "cpp"
87 |     }
88 | }


--------------------------------------------------------------------------------
/CondaEnvironment.yml:
--------------------------------------------------------------------------------
 1 | name: GraphChainer
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - _libgcc_mutex=0.1=main
 8 |   - asn1crypto=0.24.0=py36_0
 9 |   - bcrypt=3.1.4=py36h14c3975_0
10 |   - binutils=2.31=0
11 |   - binutils_impl_linux-64=2.28.1=had2808c_3
12 |   - binutils_linux-64=7.2.0=had2808c_27
13 |   - blas=1.0=mkl
14 |   - boost=1.67.0=py36_4
15 |   - bzip2=1.0.6=h14c3975_5
16 |   - ca-certificates=2019.9.11=hecc5488_0
17 |   - certifi=2019.9.11=py36_0
18 |   - cffi=1.11.5=py36h9745a5d_0
19 |   - chardet=3.0.4=py36_1
20 |   - cloog=0.18.0=0
21 |   - cryptography=2.2.2=py36h14c3975_0
22 |   - docutils=0.14=py36_0
23 |   - dropbox=8.9.0=py36_0
24 |   - filechunkio=1.6=py36_0
25 |   - ftputil=3.2=py36_0
26 |   - gcc_impl_linux-64=7.2.0=habb00fd_3
27 |   - gcc_linux-64=7.2.0=h550dcbe_27
28 |   - gmp=6.1.2=h6c8ec71_1
29 |   - gxx_impl_linux-64=7.2.0=hdf63c60_3
30 |   - gxx_linux-64=7.2.0=h550dcbe_27
31 |   - icu=58.2=h9c2bf20_1
32 |   - idna=2.7=py36_0
33 |   - intel-openmp=2018.0.3=0
34 |   - isl=0.17.1=0
35 |   - jemalloc=5.0.1=hf484d3e_1
36 |   - libboost=1.67.0=h46d08c1_4
37 |   - libdivsufsort=2.0.2=h470a237_2
38 |   - libedit=3.1.20170329=h6b74fdf_2
39 |   - libffi=3.2.1=hd88cf55_4
40 |   - libgcc-ng=8.2.0=hdf63c60_1
41 |   - libgfortran-ng=7.2.0=hdf63c60_3
42 |   - libprotobuf=3.6.0=hdbcaa40_0
43 |   - libsodium=1.0.16=h1bed415_0
44 |   - libstdcxx-ng=8.2.0=hdf63c60_1
45 |   - mkl=2018.0.3=1
46 |   - mkl_fft=1.0.4=py36h4414c95_1
47 |   - mkl_random=1.0.1=py36h4414c95_1
48 |   - mpc=1.1.0=h10f8cd9_1
49 |   - mpfr=4.0.1=hdf1c602_3
50 |   - mummer4=4.0.0beta2=pl526hf484d3e_4
51 |   - ncurses=6.1=hf484d3e_0
52 |   - numpy=1.15.0=py36h1b885b7_0
53 |   - numpy-base=1.15.0=py36h3dfced4_0
54 |   - openssl=1.0.2r=h14c3975_0
55 |   - pandas=0.23.3=py36h04863e7_0
56 |   - paramiko=2.4.1=py36_0
57 |   - perl=5.26.2=h14c3975_0
58 |   - pip=10.0.1=py36_0
59 |   - pkg-config=0.29.2=h1bed415_8
60 |   - protobuf=3.6.0=py36hf484d3e_0
61 |   - psutil=5.4.6=py36h14c3975_0
62 |   - py-boost=1.67.0=py36h04863e7_4
63 |   - pyasn1=0.4.4=py36_0
64 |   - pycparser=2.18=py36_1
65 |   - pynacl=1.2.1=py36h14c3975_0
66 |   - pyopenssl=18.0.0=py36_0
67 |   - pysftp=0.2.9=py36_0
68 |   - pysocks=1.6.8=py36_0
69 |   - python=3.6.6=hc3d631a_0
70 |   - python-dateutil=2.7.3=py36_0
71 |   - pytz=2018.5=py36_0
72 |   - pyyaml=3.13=py36h14c3975_0
73 |   - readline=7.0=ha6073c6_4
74 |   - requests=2.19.1=py36_0
75 |   - sdsl-lite=2.1.1=hc9558a2_1001
76 |   - setuptools=39.2.0=py36_0
77 |   - six=1.11.0=py36_1
78 |   - snakemake=3.13.3=py36_0
79 |   - sparsehash=2.0.3=0
80 |   - sqlite=3.24.0=h84994c4_0
81 |   - tk=8.6.7=hc745277_3
82 |   - urllib3=1.23=py36_0
83 |   - wheel=0.31.1=py36_0
84 |   - wrapt=1.10.11=py36h14c3975_2
85 |   - xz=5.2.4=h14c3975_4
86 |   - yaml=0.1.7=had09818_2
87 |   - zlib=1.2.11=ha838bed_2
88 | 
89 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020-2021 Jun Ma, Manuel Cáceres, Alexandru Tomescu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # GraphChainer
 2 | 
 3 | GraphChainer is an accurate aligner of long reads to a variation graph, based on co-linear chaining.
 4 | 
 5 | ### Compiling
 6 | 
 7 | To compile, run these:
 8 | 
 9 | - Install [miniconda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html)
10 | - `git submodule update --init --recursive`
11 | - `conda env create -f CondaEnvironment.yml`
12 | - `conda activate GraphChainer`
13 | - `make bin/GraphChainer`
14 | 
15 | ### Running
16 | 
17 | Quickstart: `./bin/GraphChainer -t 4 -f reads.fastq -g graph.gfa -a out.gam`
18 | 
19 | Key parameters:
20 | - `-t` Number of threads (optional, default 1).
21 | - `-f` Input reads. Format .fasta / .fastq / .fasta.gz / .fastq.gz. You can input multiple files with `-f file1 -f file2 ...` or `-f file1 file2 ...`.
22 | - `-g` Input graph, format .gfa / .vg. **This graph must be acyclic**, see below how to construct an acyclic graph with vg.
23 | - `-a` Output file name. Format .gam or .json.
24 | 
25 | Parameters related to colinear chaining:
26 | - `--sampling-step <double>` Sampling step factor (default 1). Use >1 (<1, >0) for faster (slower), but less (more) accurate alignments. It increases (decreases) the sampling sparsity of fragments.
27 | - `--colinear-split-len <int>` The length of the fragments in which the long read is split to create anchors (default 35).
28 | - `--colinear-split-gap <int>` The distance between consecutive fragments (default 35). If `--sampling-step` is set, then `--colinear-split-gap` is always set to `ceil(--sampling-step * --colinear-split-len)`.
29 | - `--colinear-gap <int>` When converting an optimal chain of anchors into an alignment path, split the path if the distance in the graph between consecutive anchors is greater than this value (default 10000).
30 | 
31 | ### Constructing an (acyclic) variation graph
32 | 
33 | Use [vg](https://github.com/vgteam/vg) and run:
34 | 
35 | `vg construct -t 30 -a -r {ref} -v {vcf} -R 22 -p -m 3000000`
36 | 
37 | ### Datasets availability
38 | 
39 | The graphs built for the experiments of GraphChainer can be found in Zenodo at [https://doi.org/10.5281/zenodo.7729494](https://doi.org/10.5281/zenodo.7729494),
40 | [https://doi.org/10.5281/zenodo.6875064](https://doi.org/10.5281/zenodo.6875064)
41 | and at [https://doi.org/10.5281/zenodo.6587252](https://doi.org/10.5281/zenodo.6587252)
42 | 
43 | The real read sets can be found in Zenodo at [TODO](TODO)
44 | 
45 | The evaluation pipeline used in the paper can be found at [https://github.com/algbio/GraphChainer-scripts](https://github.com/algbio/GraphChainer-scripts)
46 | 
47 | ### Citation
48 | 
49 | If you use GraphChainer, please cite as:
50 | 
51 | Jun Ma, Manuel Cáceres, Leena Salmela, Veli Mäkinen, Alexandru I. Tomescu. Chaining for accurate alignment of erroneous long reads to acyclic variation graphs. Bioinformatics, 2023, 39(8), btad460 [https://doi.org/10.1093/bioinformatics/btad460](https://doi.org/10.1093/bioinformatics/btad460).
52 | 
53 | ### Credits
54 | 
55 | GraphChainer is built on the excellent code base of [GraphAligner](https://github.com/maickrau/GraphAligner), which is released under [MIT License](https://github.com/maickrau/GraphAligner/blob/master/LICENSE.md). GraphAligner is described in the paper [GraphAligner: Rapid and Versatile Sequence-to-Graph Alignment](https://doi.org/10.1186/s13059-020-02157-2) by Mikko Rautiainen and Tobias Marschall.
56 | 


--------------------------------------------------------------------------------
/Snakemakes/ErrorCorrect/README.md:
--------------------------------------------------------------------------------
 1 | A Snakemake pipeline for error correcting long reads based on short reads.
 2 | 
 3 | Installation:
 4 | 
 5 | - Install snakemake, lighter, bcalm2 and GraphAligner: `conda install -c bioconda snakemake lighter bcalm graphaligner`
 6 | - Download the bcalm2 GFA conversion script from https://github.com/GATB/bcalm/blob/master/scripts/convertToGFA.py
 7 | 
 8 | Running:
 9 | 
10 | - Save `Snakefile` and `config.yaml` to a folder
11 | - Edit the parameters in `config.yaml`
12 | - Run `snakemake --cores 8 all` (you can use more than 8 cores)
13 | - The corrected reads will be in the output folder:
14 |   - `corrected.fa` has the reads with the aligned sequence replaced by the alignment path. Uppercase sequence is corrected and lowercase sequence is uncorrected.
15 |   - `corrected_clipped.fa` has the reads cut across non-corrected parts. All sequence is corrected and the read name contains the position in the original read.
16 | 


--------------------------------------------------------------------------------
/Snakemakes/ErrorCorrect/Snakefile:
--------------------------------------------------------------------------------
  1 | configfile: "config.yaml"
  2 | GRAPHALIGNERPATH = config["GraphAlignerPath"]
  3 | BCALMPATH = config["BcalmPath"]
  4 | BCALMCONVERTPATH = config["BcalmConvertPath"]
  5 | LIGHTERPATH = config["LighterPath"]
  6 | GENOMESIZE = config["GenomeSize"]
  7 | SHORTREADCOVERAGE = config["ShortreadCoverage"]
  8 | TMPDIR = config["TempDirectory"]
  9 | OUTDIR = config["OutputDirectory"]
 10 | SHORTREADDIR = config["ShortReadDirectory"]
 11 | SHORTREADS = config["ShortReads"]
 12 | LONGREADDIR = config["LongReadDirectory"]
 13 | LONGREADS = config["LongReads"]
 14 | SMALLK = config["SmallK"]
 15 | BIGK = config["BigK"]
 16 | ABUNDANCE = config["Abundance"]
 17 | GRAPHALIGNERPARAMS = config["GraphAlignerParams"]
 18 | 
 19 | SHORTREADNAMES = [n.split('.')[0] for n in SHORTREADS]
 20 | SHORTREADEXTENSIONS = ['.'.join(n.split('.')[1:]) for n in SHORTREADS]
 21 | 
 22 | rule all:
 23 | 	input:
 24 | 		OUTDIR + "corrected.fa",
 25 | 		OUTDIR + "corrected_clipped.fa",
 26 | 		OUTDIR + "stats.txt"
 27 | 
 28 | rule correct_short_reads:
 29 | 	input:
 30 | 		expand(SHORTREADDIR + "{name}.{ext}", zip, name=SHORTREADNAMES, ext=SHORTREADEXTENSIONS)
 31 | 	output:
 32 | 		temp(expand(TMPDIR + "{name}.cor.{ext}", zip, name=SHORTREADNAMES, ext=SHORTREADEXTENSIONS))
 33 | 	params:
 34 | 		files = lambda wildcards, input: ' '.join(["-r " + name for name in input]),
 35 | 		alpha = 7.0 / SHORTREADCOVERAGE
 36 | 	threads: 40
 37 | 	log:
 38 | 		stdout = TMPDIR + "lighter_stdout.txt",
 39 | 		stderr = TMPDIR + "lighter_stderr.txt"
 40 | 	shell:
 41 | 		"/usr/bin/time -v {LIGHTERPATH} -od {TMPDIR} -t {threads} -k {SMALLK} {GENOMESIZE} {params.alpha} {params.files} 1> {log.stdout} 2> {log.stderr}"
 42 | 
 43 | rule read_names:
 44 | 	input: rules.correct_short_reads.output
 45 | 	output: temp("filenames")
 46 | 	shell: "readlink -f {input} > {output}"
 47 | 
 48 | rule run_bcalm:
 49 | 	input: 
 50 | 		name = "filenames",
 51 | 		files = rules.correct_short_reads.output
 52 | 	output: temp("filenames.unitigs.fa")
 53 | 	shadow: "full"
 54 | 	log:
 55 | 		stdout = TMPDIR + "bcalm_stdout.txt",
 56 | 		stderr = TMPDIR + "bcalm_stderr.txt"
 57 | 	threads: 40
 58 | 	shell: "/usr/bin/time -v {BCALMPATH} -nb-cores {threads} -in {input.name} -kmer-size {BIGK} -abundance-min {ABUNDANCE} > {log.stdout} 2> {log.stderr}"
 59 | 
 60 | rule convert_bcalm:
 61 | 	input: rules.run_bcalm.output
 62 | 	output: TMPDIR + "graph.gfa"
 63 | 	shell: "{BCALMCONVERTPATH} {input} {output} {BIGK}"
 64 | 
 65 | rule align_reads:
 66 | 	input:
 67 | 		graph = TMPDIR + "graph.gfa",
 68 | 		reads = expand(LONGREADDIR + "{name}", name=LONGREADS)
 69 | 	params:
 70 | 		readconcat = lambda wildcards, input: ' '.join(input.reads)
 71 | 	output:
 72 | 		corrected = OUTDIR + "corrected.fa",
 73 | 		clipped = OUTDIR + "corrected_clipped.fa"
 74 | 	log:
 75 | 		stdout = TMPDIR + "aligner_stdout.txt",
 76 | 		stderr = TMPDIR + "aligner_stderr.txt"
 77 | 	threads: 40
 78 | 	shell:
 79 | 		"/usr/bin/time -v {GRAPHALIGNERPATH} -g {input.graph} --corrected-out {output.corrected} --corrected-clipped-out {output.clipped} -f {params.readconcat} -t {threads} {GRAPHALIGNERPARAMS} 1> {log.stdout} 2> {log.stderr}"
 80 | 
 81 | rule get_stats:
 82 | 	input:
 83 | 		aligner_stdout = TMPDIR + "aligner_stdout.txt",
 84 | 		aligner_stderr = TMPDIR + "aligner_stderr.txt",
 85 | 		bcalm_stdout = TMPDIR + "bcalm_stdout.txt",
 86 | 		bcalm_stderr = TMPDIR + "bcalm_stderr.txt",
 87 | 		lighter_stdout = TMPDIR + "lighter_stdout.txt",
 88 | 		lighter_stderr = TMPDIR + "lighter_stderr.txt"
 89 | 	output:
 90 | 		OUTDIR + "stats.txt"
 91 | 	run:
 92 | 		shell("grep 'Input reads' < {input.aligner_stdout} >> {output}")
 93 | 		shell("grep 'Reads with a seed' < {input.aligner_stdout} >> {output}")
 94 | 		shell("grep 'Reads with an alignment' < {input.aligner_stdout} >> {output}")
 95 | 		shell("grep 'Alignments' < {input.aligner_stdout} >> {output}")
 96 | 		shell("grep 'End-to-end alignments' < {input.aligner_stdout} >> {output}")
 97 | 		shell("echo 'Lighter' >> {output}"),
 98 | 		shell("grep 'User time' < {input.lighter_stderr} >> {output}")
 99 | 		shell("grep 'System time' < {input.lighter_stderr} >> {output}")
100 | 		shell("grep 'Elapsed (wall clock)' < {input.lighter_stderr} >> {output}")
101 | 		shell("grep 'Maximum resident set size' < {input.lighter_stderr} >> {output}")
102 | 		shell("echo 'BCalm' >> {output}"),
103 | 		shell("grep 'User time' < {input.bcalm_stderr} >> {output}")
104 | 		shell("grep 'System time' < {input.bcalm_stderr} >> {output}")
105 | 		shell("grep 'Elapsed (wall clock)' < {input.bcalm_stderr} >> {output}")
106 | 		shell("grep 'Maximum resident set size' < {input.bcalm_stderr} >> {output}")
107 | 		shell("echo 'Aligner' >> {output}"),
108 | 		shell("grep 'User time' < {input.aligner_stderr} >> {output}")
109 | 		shell("grep 'System time' < {input.aligner_stderr} >> {output}")
110 | 		shell("grep 'Elapsed (wall clock)' < {input.aligner_stderr} >> {output}")
111 | 		shell("grep 'Maximum resident set size' < {input.aligner_stderr} >> {output}")
112 | 


--------------------------------------------------------------------------------
/Snakemakes/ErrorCorrect/config.yaml:
--------------------------------------------------------------------------------
 1 | ### Change these!!
 2 | GenomeSize: 4600000
 3 | ShortreadCoverage: 200
 4 | 
 5 | ShortReadDirectory: shortreads/
 6 | # NOTE: short read endings MUST be .fq or .fa instead of .fastq or .fasta
 7 | # gzip is allowed
 8 | ShortReads:
 9 | - reads1.fq
10 | - reads2.fq.gz
11 | 
12 | LongReadDirectory: longreads/
13 | LongReads:
14 | - reads1.fq
15 | - reads2.fq.gz
16 | 
17 | TempDirectory: tmp/
18 | OutputDirectory: output/
19 | 
20 | # https://github.com/maickrau/GraphAligner
21 | GraphAlignerPath: GraphAligner
22 | # https://github.com/GATB/bcalm
23 | BcalmPath: bcalm
24 | # https://github.com/GATB/bcalm/blob/master/scripts/convertToGFA.py
25 | BcalmConvertPath: bcalm/scripts/convertToGFA.py
26 | # https://github.com/mourisl/Lighter
27 | LighterPath: lighter
28 | 
29 | 
30 | ### Misc params. Defaults might work
31 | 
32 | # k for error correcting the reads. Try between 10-30
33 | SmallK: 23
34 | # k for the de Bruijn graph. Try between ~1/2 and ~2/3 of short read length
35 | BigK: 63
36 | # minimum k-mer abundance for the de Bruijn graph. Try between 1/100 to 2/100 of short read coverage, but not below 2.
37 | Abundance: 3
38 | # Parameters for GraphAligner
39 | GraphAlignerParams: -x dbg
40 | 


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
 1 | GPP=$(CXX)
 2 | CPPFLAGS=-w -std=c++17 -O3 -Iconcurrentqueue -I edlib/include -IBBHash -Izstr/src -Iparallel-hashmap/parallel_hashmap/ `pkg-config --cflags protobuf` `pkg-config --cflags libsparsehash` `pkg-config --cflags mummer` -fopenmp -Wno-unused-parameter
 3 | 
 4 | ODIR=obj
 5 | BINDIR=bin
 6 | SRCDIR=src
 7 | 
 8 | LIBS=-lm -lz -lboost_serialization -lboost_program_options `pkg-config --libs mummer`  `pkg-config --libs protobuf` -lsdsl
 9 | JEMALLOCFLAGS= -L`jemalloc-config --libdir` -Wl,-rpath,`jemalloc-config --libdir` -Wl,-Bstatic -ljemalloc -Wl,-Bdynamic `jemalloc-config --libs`
10 | 
11 | _DEPS = vg.pb.h fastqloader.h GraphAlignerWrapper.h vg.pb.h BigraphToDigraph.h stream.hpp Aligner.h ThreadReadAssertion.h AlignmentGraph.h CommonUtils.h GfaGraph.h AlignmentCorrectnessEstimation.h MummerSeeder.h ReadCorrection.h MinimizerSeeder.h AlignmentSelection.h EValue.h
12 | DEPS = $(patsubst %, $(SRCDIR)/%, $(_DEPS))
13 | 
14 | _OBJ = Aligner.o vg.pb.o fastqloader.o BigraphToDigraph.o ThreadReadAssertion.o AlignmentGraph.o CommonUtils.o GraphAlignerWrapper.o GfaGraph.o AlignmentCorrectnessEstimation.o MummerSeeder.o ReadCorrection.o MinimizerSeeder.o AlignmentSelection.o EValue.o
15 | OBJ = $(patsubst %, $(ODIR)/%, $(_OBJ))
16 | 
17 | LINKFLAGS = $(CPPFLAGS) -Wl,-Bstatic $(LIBS) -Wl,-Bdynamic -Wl,--as-needed -lpthread -pthread -static-libstdc++ $(JEMALLOCFLAGS) `pkg-config --libs libdivsufsort` `pkg-config --libs libdivsufsort64`
18 | 
19 | VERSION := Branch $(shell git rev-parse --abbrev-ref HEAD) commit $(shell git rev-parse HEAD) $(shell git show -s --format=%ci)
20 | 
21 | $(shell mkdir -p bin)
22 | $(shell mkdir -p obj)
23 | 
24 | $(BINDIR)/GraphChainer: $(ODIR)/AlignerMain.o $(OBJ) edlib/src/edlib.cpp
25 | 	$(GPP) -o $@ $^ $(LINKFLAGS)
26 | 
27 | $(ODIR)/GraphAlignerWrapper.o: $(SRCDIR)/GraphAlignerWrapper.cpp $(SRCDIR)/GraphAligner.h $(SRCDIR)/NodeSlice.h $(SRCDIR)/WordSlice.h $(SRCDIR)/ArrayPriorityQueue.h $(SRCDIR)/ComponentPriorityQueue.h $(SRCDIR)/GraphAlignerVGAlignment.h $(SRCDIR)/GraphAlignerGAFAlignment.h $(SRCDIR)/GraphAlignerBitvectorBanded.h $(SRCDIR)/GraphAlignerBitvectorCommon.h $(SRCDIR)/GraphAlignerCommon.h $(SRCDIR)/DijkstraQueue.h $(SRCDIR)/GraphAlignerBitvectorDijkstra.h $(DEPS)
28 | 
29 | $(ODIR)/AlignerMain.o: $(SRCDIR)/AlignerMain.cpp $(DEPS)
30 | 	$(GPP) -c -o $@ $< $(CPPFLAGS) -DVERSION="\"$(VERSION)\""
31 | 
32 | $(ODIR)/%.o: $(SRCDIR)/%.cpp $(DEPS)
33 | 	$(GPP) -c -o $@ $< $(CPPFLAGS)
34 | 
35 | $(ODIR)/vg.pb.o: $(SRCDIR)/vg.pb.cc
36 | 	$(GPP) -c -o $@ $< $(CPPFLAGS)
37 | 
38 | $(SRCDIR)/%.pb.cc $(SRCDIR)/%.pb.h: $(SRCDIR)/%.proto
39 | 	protoc -I=$(SRCDIR) --cpp_out=$(SRCDIR) $<
40 | 
41 | $(BINDIR)/FusionFinder: $(SRCDIR)/FusionFinder.cpp $(OBJ)
42 | 	$(GPP) -o $@ $^ $(LINKFLAGS) -DVERSION="\"$(VERSION)\""
43 | 
44 | $(BINDIR)/ExtractPathSequence: $(SRCDIR)/ExtractPathSequence.cpp $(ODIR)/CommonUtils.o $(ODIR)/GfaGraph.o $(ODIR)/ThreadReadAssertion.o $(ODIR)/fastqloader.o $(ODIR)/vg.pb.o
45 | 	$(GPP) -o $@ $^ $(LINKFLAGS)
46 | 
47 | $(BINDIR)/SelectLongestAlignment: $(SRCDIR)/SelectLongestAlignment.cpp $(ODIR)/CommonUtils.o $(ODIR)/vg.pb.o $(ODIR)/fastqloader.o
48 | 	$(GPP) -o $@ $^ $(LINKFLAGS)
49 | 
50 | $(BINDIR)/Postprocess: $(SRCDIR)/Postprocess.cpp $(ODIR)/AlignmentSelection.o $(ODIR)/CommonUtils.o $(ODIR)/vg.pb.o $(ODIR)/fastqloader.o
51 | 	$(GPP) -o $@ $^ $(LINKFLAGS)
52 | 
53 | $(BINDIR)/AlignmentSubsequenceIdentity: $(SRCDIR)/AlignmentSubsequenceIdentity.cpp $(ODIR)/CommonUtils.o $(ODIR)/vg.pb.o $(ODIR)/GfaGraph.o $(ODIR)/fastqloader.o $(ODIR)/ThreadReadAssertion.o
54 | 	$(GPP) -o $@ $^ $(LINKFLAGS)
55 | 
56 | $(BINDIR)/UntipRelative: $(SRCDIR)/UntipRelative.cpp $(ODIR)/CommonUtils.o $(ODIR)/vg.pb.o $(ODIR)/GfaGraph.o $(ODIR)/fastqloader.o $(ODIR)/ThreadReadAssertion.o
57 | 	$(GPP) -o $@ $^ $(LINKFLAGS)
58 | 
59 | $(BINDIR)/PickAdjacentAlnPairs: $(SRCDIR)/PickAdjacentAlnPairs.cpp $(ODIR)/CommonUtils.o $(ODIR)/vg.pb.o $(ODIR)/GfaGraph.o $(ODIR)/fastqloader.o $(ODIR)/ThreadReadAssertion.o
60 | 	$(GPP) -o $@ $^ $(LINKFLAGS)
61 | 
62 | $(BINDIR)/ExtractCorrectedReads: $(SRCDIR)/ExtractCorrectedReads.cpp $(ODIR)/ReadCorrection.o $(ODIR)/CommonUtils.o $(ODIR)/vg.pb.o $(ODIR)/GfaGraph.o $(ODIR)/fastqloader.o $(ODIR)/ThreadReadAssertion.o
63 | 	$(GPP) -o $@ $^ $(LINKFLAGS)
64 | 
# "all" and "clean" are command targets, not files; declare them phony so a
# stray file with either name cannot make them look up to date.
.PHONY: all clean

# Build the aligner and the helper tools. The aligner binary is produced by the
# $(BINDIR)/GraphChainer rule; there is no rule for $(BINDIR)/GraphAligner.
all: $(BINDIR)/GraphChainer $(BINDIR)/ExtractPathSequence $(BINDIR)/SelectLongestAlignment $(BINDIR)/AlignmentSubsequenceIdentity $(BINDIR)/PickAdjacentAlnPairs $(BINDIR)/ExtractCorrectedReads $(BINDIR)/UntipRelative

# Remove every build product, including the protobuf sources generated from vg.proto.
clean:
	rm -f $(ODIR)/*
	rm -f $(BINDIR)/*
	rm -f $(SRCDIR)/vg.pb.cc
	rm -f $(SRCDIR)/vg.pb.h
72 | 


--------------------------------------------------------------------------------
/scripts/10fold_test/gen_test.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | import os
 4 | 
 5 | N = 2
 6 | Bin = "./bin/GraphAligner"
 7 | Graphs = ["/mnt/c/Code/Summer/GCimplements/data/LRC/LRC.vg", "/mnt/c/Code/Summer/GCimplements/data/MHC/MHC1.vg"]
 8 | Data = "./data/"
 9 | Gams = "./gams/"
10 | Logs = "./logs/"
11 | force_redo = True
12 | Threads = 4
13 | # default badread is length ~ (mean=15000,std=10000)
14 | # default pbsim clr is length ~ (mean=3000,std=2300)
15 | # or more `real` length ~ (mean=15000,std=10000)
16 | read_length_mean, read_length_std = 3000, 2300
17 | 
def mkdir_safe(path):
    """Create directory `path` (and any missing parents) if nothing exists there yet.

    Uses os.makedirs instead of shelling out to `mkdir -p`, so paths containing
    spaces or shell metacharacters are handled literally.
    """
    if not os.path.exists(path):
        # exist_ok guards against another process creating the directory
        # between the existence check and the call.
        os.makedirs(path, exist_ok=True)
21 | mkdir_safe(Data)
22 | mkdir_safe(Gams)
23 | mkdir_safe(Logs)
24 | 
25 | 
26 | 
def time_cmd(cmd, log):
    """Return `cmd` prefixed with /usr/bin/time so its verbose report is appended to `log`."""
    template = "/usr/bin/time -o {} -a -v {}"
    return template.format(log, cmd)
def log_cmd(cmd, log):
    """Return `cmd` with both stdout and stderr appended to the file `log`."""
    template = "{0} 1>>{1} 2>>{1}"
    return template.format(cmd, log)
def run(cmd, log = "", t = True, l = True, v = True):
    """Run a shell command through os.system, optionally echoing, timing and logging it.

    Parameters:
        cmd: shell command line to execute.
        log: path of a log file; the default "" disables both logging and timing.
        t:   when `log` is set, wrap the command with time_cmd.
        l:   when `log` is set, redirect the command's output with log_cmd.
        v:   print the command before running it.

    Exits the interpreter with status 1 if a KeyboardInterrupt is raised while
    the command is running.
    """
    if v:
        print(cmd)
    if log != "":
        # Close the handle explicitly so the command line is flushed to the log
        # before the child process starts appending to the same file.
        with open(log, 'a') as log_file:
            log_file.write(cmd + '\n')
        if t:
            cmd = time_cmd(cmd, log)
        if l:
            cmd = log_cmd(cmd, log)
    try:
        os.system(cmd)
    except KeyboardInterrupt:
        # SystemExit is raised directly: `exit` is a helper injected by the
        # `site` module and is not guaranteed to exist.
        raise SystemExit(1)
44 | 
45 | params = []
46 | params.append((150,150,10000))
47 | # [(100,31), (100,17), ]
48 | # for L, S in [(150,150)]:
49 | #     for G in [10000]:
50 | #         params.append((L, S, G))
51 | 
52 | gen_log = f"{Logs}/gen.log.txt"
53 | for Graph in Graphs:
54 |     for idx in range(N):
55 |         seed = idx
56 |         id = Graph.split('/')[-1].split('.')[0] + '_' + str(idx)
57 |         Reads_prefix = f"{Data}/{id}"
58 |         Reads = f"{Reads_prefix}.fastq"
59 |         if not os.path.exists(Reads):
60 |             print(f"generating read set #{idx} to {id}")
61 |             Ref = f"{Data}/{id}.fasta"
62 |             run(f"{Bin} --generate-path --generate-path-seed {seed} -g {Graph} -f {Ref} -x vg -a {Data}/{id}.gam", gen_log)
63 |             # id.path.txt has the node indices of the generated path
64 |             run(f"mv {Data}/{id}.gam {Data}/{id}.path.txt", gen_log)
65 |             # # simulate a PacBio long read dataset of 15x coverage using `badread` (commit 9e030e84849281e7dc92f0c9767b601c4dc9701e from https://github.com/rrwick/Badread.git)
66 |             # run(f"badread simulate --seed {seed} --reference {Ref} --quantity 15x --length 15000,10000 --error_model pacbio --junk_reads 0 --random_reads 0 --chimeras 0 > {Reads} 2>{Data}/{id}_br.log.txt", gen_log, True, False)
67 |             
68 |             # simulate a PacBio long read dataset of 20x coverage using 'pbsim' (commit e014b1dd40e87a8799346a9835d70a4da3dc857c from https://github.com/pfaucon/PBSIM-PacBio-Simulator.git)
69 |             prefix = 'xxxx'
70 |             run(f"./bin/pbsim --data-type CLR --depth 20 --seed {seed} --model_qc ./bin/model_qc_clr {Ref} --prefix {prefix} --length-mean {read_length_mean} --length-sd {read_length_std}", gen_log, True, True)
71 |             # rename the reads generated
72 |             run(f"mv ./{prefix}_0001.fastq {Reads}")
73 |             ReadsMaf = f"{Reads_prefix}.maf"
74 |             run(f"mv ./{prefix}_0001.maf {ReadsMaf}")
75 | 
76 |             # run(f"badread simulate --seed {seed} --reference {Ref} --quantity 15x --length 15000,10000 --error_model pacbio --junk_reads 0 --random_reads 0 --chimeras 0 > {Reads} 2>{Data}/{id}_br.log.txt", gen_log, True, False)
77 |         
78 |         long_log = f"{Logs}/{id}_long.log.txt"
79 |         long_gam = f"{Gams}/{id}_long.gam"
80 |         if force_redo or not os.path.exists(long_gam):
81 |             run(f"{Bin} -t {Threads} -x vg -f {Reads} -g {Graph} -a {long_gam}", long_log)
82 |         
83 |         aln_clc_log = f"{Logs}/{id}_long.log.txt"
84 |         for L, S, G in params:
85 |             clc_gam = f"{Gams}/{id}_clc_{L}_{S}_{G}.gam"
86 |             clc_log = f"{Gams}/{id}_clc_{L}_{S}_{G}.log.txt"
87 |             if force_redo or not os.path.exists(clc_gam):
88 |                 run(f"{Bin} -t {Threads} --colinear-chaining -x vg -f {Reads} -g {Graph} -a {clc_gam} --colinear-gap {G} --colinear-split-len {L} --colinear-split-gap {S} --short-verbose", clc_log)
89 | 
90 | 
91 | 
92 | 


--------------------------------------------------------------------------------
/scripts/summary.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import os
  3 | import os.path
  4 | import json
  5 | import edlib
  6 | #            0              1                     2                  3    4
  7 | # python summary.py ../data/LRC/LRC.gfa ./data/badreads.fastq ga outdir
  8 | 
  9 | graph = sys.argv[1]
 10 | reads = sys.argv[2]
 11 | id = sys.argv[3]
 12 | 
 13 | outdir = "out"
 14 | if 4 < len(sys.argv):
 15 | 	outdir = sys.argv[4]
 16 | if not outdir.endswith(id + '/'):
 17 | 	outdir = outdir + '/' + id + '/'
 18 | 
 19 | def LoadGfaGraph(filename):
 20 | 	VL, E = {}, {}
 21 | 	for line in open(filename).readlines():
 22 | 		if line[0] == 'S':
 23 | 			# S 92533   A
 24 | 			i, s = line[1:].strip().split()
 25 | 			VL[int(i)] = s
 26 | 		elif line[0] == 'L':
 27 | 			# L 104890  +   104892  +   0M
 28 | 			li, lr, ri, rr, ov = line[1:].strip().split()
 29 | 			li, ri = int(li), int(ri)
 30 | 			if li not in E:
 31 | 				E[li] = []
 32 | 			E[li].append(ri)
 33 | 	return VL, E
 34 | 
# Graph given on the command line: VL maps node id -> sequence, E maps node id -> linked node ids.
VL, E = LoadGfaGraph(graph)
 36 | 
 37 | ed_global = lambda s1, s2 : edlib.align(s1, s2, mode='NW')['editDistance']
 38 | ed_local = lambda s1, s2 : edlib.align(s1, s2, mode='HW')['editDistance']
 39 | list2idx = lambda a: { a[i] : i for i in range(len(a)) }
 40 | revc = lambda s: ''.join({"A":"T","T":"A","C":"G","G":"C"}[c] for c in s[::-1])
 41 | 
 42 | import vg_pb2
 43 | import gzip
 44 | from google.protobuf.internal.encoder import _VarintBytes
 45 | from google.protobuf.internal.decoder import _DecodeVarint32, _VarintDecoder
 46 | def _VarintDecoder(mask):
 47 | 	local_ord = ord
 48 | 	def DecodeVarint(buffer, pos):
 49 | 		result = 0
 50 | 		shift = 0
 51 | 		while 1:
 52 | 			b = local_ord(buffer[pos])
 53 | 			result |= ((b & 0x7f) << shift)
 54 | 			pos += 1
 55 | 			if not (b & 0x80):
 56 | 				result &= mask
 57 | 				return (result, pos)
 58 | 			shift += 7
 59 | 			# if shift >= 64:
 60 | 			# 	raise _DecodeError('Too many bytes when decoding varint.')
 61 | 	return DecodeVarint
 62 | _DecodeVarint64 = _VarintDecoder((1 << 64) - 1)
 63 | def read_alignments(gam_filename):
 64 | 	with open(gam_filename, 'rb') as f:
 65 | 		buf = gzip.GzipFile(fileobj=f).read()
 66 | 		n = 0
 67 | 		while n < len(buf):
 68 | 			an, n = _DecodeVarint32(buf, n)
 69 | 			for i in range(an):
 70 | 				msg_len, n = _DecodeVarint32(buf, n)
 71 | 				msg_buf = buf[n:n+msg_len]
 72 | 				n += msg_len
 73 | 				aln = vg_pb2.Alignment()
 74 | 				aln.ParseFromString(msg_buf)
 75 | 				yield aln
 76 | 
 77 | def parse_alignment(aln):
 78 | 	# bps = sum(len(VL[x.position.node_id]) for x in a.path.mapping)
 79 | 	name = aln.name.split()[0]
 80 | 	seq = ''
 81 | 
 82 | 	rev_cnt = 0
 83 | 	for x in aln.path.mapping:
 84 | 		idx = x.position.node_id
 85 | 		ll = VL[idx]
 86 | 		if x.position.is_reverse:
 87 | 			rev_cnt += 1
 88 | 			seq += revc(ll)
 89 | 		else:
 90 | 			seq += ll
 91 | 	return {'name':name, 'seq':seq, 'path_cnt':len(aln.path.mapping), 'revcnt':rev_cnt, 'path_bps':len(seq)}
 92 | 
 93 | def parse_gam(filename):
 94 | 	ret = {}
 95 | 	for aln in read_alignments(filename):
 96 | 		a = parse_alignment(aln)
 97 | 		ret[a['name']] = a
 98 | 	return ret
 99 | 
# Parsed alignments of the two runs, keyed by read name:
# "<id>_long.gam" and "<id>_clc.gam" inside the output directory.
seqs_long = parse_gam(f'{outdir}{id}_long.gam')
seqs_clcs = parse_gam(f'{outdir}{id}_clc.gam')
102 | 
def read_fastq(fastq_filename):
	"""Yield (header, sequence) for each record of a FASTQ file.

	Records are expected in the usual 4-line layout (header, sequence, '+',
	quality). `header` is the full stripped header line including the
	leading '@'; `sequence` is the stripped line that follows it.

	Lines found where a header is expected but that do not start with '@'
	(blank or stray lines) are skipped. A header on the very last line
	yields an empty sequence.
	"""
	# `with` closes the file once the generator is exhausted or discarded.
	with open(fastq_filename) as fin:
		while True:
			header = fin.readline()
			if not header:
				return
			if header[0] != '@':
				continue
			seq = fin.readline().strip()
			# Consume the '+' separator and the quality string explicitly:
			# '@' is a valid quality character, so a quality line must never
			# be examined as a potential header.
			fin.readline()
			fin.readline()
			yield (header.strip(), seq)
111 | 
# Reads keyed by name (first header token without the leading '@');
# each value is (sequence, full header line).
seqs_read = {info.split()[0][1:] : (seq, info) for info, seq in read_fastq(f'{reads}')}
113 | 
class CSV:
	"""Minimal in-memory table that is written out as comma-separated text.

	Cells are joined with ',' as-is: no quoting or escaping is performed.
	"""
	def __init__(self):
		self.h = []      # header names in column order
		self.hidx = {}   # header name -> column index
		self.r = []      # not used by any method of this class
		self.ridx = {}   # not used by any method of this class
		self.data = []   # rows; each row is a list of cells
	def add_headers(self, headers):
		"""Append the headers that are not registered yet, keeping their order."""
		for h in headers:
			if h not in self.hidx:
				self.hidx[h] = len(self.h)
				self.h.append(h)
	def get_hids(self, headers):
		"""Return the column indices of `headers` (KeyError for an unknown header)."""
		return [self.hidx[x] for x in headers]
	def add(self, row, hids=None):
		"""Append a row.

		Without `hids` a copy of `row` is stored positionally. With `hids`,
		`row[i]` is placed in column `hids[i]` and all other cells are ''.
		"""
		if not hids:
			# list() copies and also accepts tuples, which save() can pad.
			self.data.append(list(row))
		else:
			tmp = [''] * len(self.h)
			for i in range(len(hids)):
				tmp[hids[i]] = row[i]
			self.data.append(tmp)
	def save(self, filename):
		"""Write the header line and all rows to `filename`.

		Rows shorter than the header are padded in place with empty cells;
		every cell is converted with str() before joining.
		"""
		# `with` closes the file even when writing a row fails.
		with open(filename, 'w') as fout:
			fout.write(','.join(self.h) + '\n')
			for d in self.data:
				if len(d) < len(self.h):
					d += [''] * (len(self.h) - len(d))
				fout.write(','.join(str(cell) for cell in d) + '\n')
144 | 
# Summary table. Headers are registered in groups; the trailing numbers are the
# resulting column indices, which the row-filling loop below relies on.
csv = CSV()
csv.add_headers(['name', 'length', 'br_id_rate']) #0,1,2
csv.add_headers(['long_pathcnt', 'long_path_bps', 'long_revcnt']) #3,4,5
csv.add_headers(['clcs_pathcnt', 'clcs_path_bps', 'clcs_revcnt']) #6,7,8
csv.add_headers(['long_align_rate']) #9
# Edit-distance columns; the commented-out names are currently disabled metrics.
csv.add_headers([
'global_ed_read_long',  #10
# 'global_ed_long_true',
'global_ed_read_clcs',  #11
# 'global_ed_clcs_true',
# 'local_ed_read_long',
# 'local_ed_long_read',
# 'local_ed_true_long',
# 'local_ed_long_true',
# 'local_ed_read_clcs',
# 'local_ed_clcs_read',
# 'local_ed_true_clcs',
# 'local_ed_clcs_true',
# 'global_ed_read_clcs',
# 'global_ed_long_clcs',
# 'global_ed_read_true',
# 'global_ed_clcs_true',
])
168 | 
# Build one summary row per read and write the table to the output directory.
# Columns are addressed by header name so that adding or reordering headers
# cannot silently shift values into the wrong column.
col = csv.hidx
reads_cnt = 0
for name in seqs_read:
	reads_cnt += 1
	# Progress report roughly five times over the whole run.
	if reads_cnt % (len(seqs_read) // 5 + 1) == 0:
		print(reads_cnt, '/', len(seqs_read))
	seq, info = seqs_read[name]
	row = [''] * len(csv.h)
	row[col['name']] = name
	for t in info.split():
		if t.startswith('length='):
			row[col['length']] = t.split('=')[-1]
	# The identity is taken from the last header token when it has the form
	# "...=<number>%". Headers without such a token leave the cell empty
	# instead of aborting the whole summary with a ValueError.
	identity = info.split()[-1].split('=')[-1]
	if identity.endswith('%'):
		try:
			row[col['br_id_rate']] = '%.3f' % (float(identity[:-1]) / 100)
		except ValueError:
			pass

	long_seq = ''
	if name in seqs_long:
		a = seqs_long[name]
		long_seq = a['seq']
		row[col['long_pathcnt']] = str(a['path_cnt'])
		row[col['long_path_bps']] = str(a['path_bps'])
		row[col['long_revcnt']] = str(a['revcnt'])
		row[col['global_ed_read_long']] = str(ed_global(seq, long_seq))
	# An empty read has no defined alignment rate; leave the cell empty
	# rather than dividing by zero.
	if len(seq) > 0:
		row[col['long_align_rate']] = str(len(long_seq) / len(seq))
	clcs_seq = ''
	if name in seqs_clcs:
		a = seqs_clcs[name]
		clcs_seq = a['seq']
		row[col['clcs_pathcnt']] = str(a['path_cnt'])
		row[col['clcs_path_bps']] = str(a['path_bps'])
		row[col['clcs_revcnt']] = str(a['revcnt'])
		row[col['global_ed_read_clcs']] = str(ed_global(seq, clcs_seq))
	csv.add(row)

csv.save(f'{outdir}{id}_summary.csv')
202 | 


--------------------------------------------------------------------------------
/scripts/test.sh:
--------------------------------------------------------------------------------
 1 | # output cmd to screen before execution
 2 | set -x
 3 | 
 4 | # parameters
 5 | G=1000
 6 | L=150
 7 | S=33
 8 | 
 9 | # identifiers for this experiment
10 | idx=LRC_15x
11 | 
12 | # data files
13 | Graph=/mnt/c/Code/Summer/GCimplements/data/LRC/LRC.gfa
14 | Reads=/mnt/d/summer/data/$idx.fastq
15 | 
16 | # path to binary
17 | Bin=../bin/GraphAligner
18 | 
19 | # check whether dataset exists
20 | if [ ! -f "$Reads" ]; then
21 |     # sample a longest path on the graph as referrence sequence
22 |     Ref=/mnt/d/summer/data/$idx.fasta
23 |     time $Bin --generate-path -g $Graph -f $Ref -x vg -a /mnt/d/summer/data/$idx.gam
24 |     # idx.txt has the node indices of the generated path
25 |     time mv /mnt/d/summer/data/$idx.gam /mnt/d/summer/data/$idx.txt
26 |     # simulate a PacBio long read dataset of 15x coverage using `badread` (commit 9e030e84849281e7dc92f0c9767b601c4dc9701e from https://github.com/rrwick/Badread.git)
27 |     time badread simulate --reference $Ref --quantity 15x --length 15000,10000 --error_model pacbio --junk_reads 0 --random_reads 0 --chimeras 0 > $Reads 2>/mnt/d/summer/data/$idx\_br.log
28 | fi
29 | 
30 | # experiment output folder
31 | Out=out_$idx/ga
32 | mkdir -p $Out/
33 | Log=$Out/log.txt
34 | 
35 | echo "params=" $G $L $S > $Log
36 | 
37 | # use /usr/bin/time to measure "Max memory(kb)", "User time(s)", "System time(s)"
38 | TimeCmd="/usr/bin/time -o $Log -a -f '%Mkb,%Us,%Ss'"
39 | 
40 | # align directly by original GraphAligner
41 | $TimeCmd $Bin -x vg -f $Reads -g $Graph -a $Out/ga_long.gam 1>>$Log 2>>$Log
42 | 
43 | # align by colinear chaining
44 | $TimeCmd $Bin --colinear-chaining -x vg -f $Reads -g $Graph -a $Out/ga_clc.gam --colinear-gap $G --colinear-split-len $L colinear-split-gap $S  1>>$Log 2>>$Log
45 | 
46 | # generate summary.csv
47 | $TimeCmd python summary.py $Graph $Reads ga out_$idx  1>>$Log 2>>$Log
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/src/Aligner.h:
--------------------------------------------------------------------------------
 1 | #ifndef Aligner_h
 2 | #define Aligner_h
 3 | 
 4 | #include <string>
 5 | #include <vector>
 6 | #include "AlignmentGraph.h"
 7 | #include "vg.pb.h"
 8 | #include "AlignmentSelection.h"
 9 | 
// Settings bundle taken by value by alignReads().
// No member has a default initializer, so a default-initialized AlignerParams
// leaves its bool/integer/double fields indeterminate until they are assigned.
// NOTE(review): the group captions below are inferred from member names only;
// confirm the exact meaning of each field against the option parser.
struct AlignerParams
{
	// input files, threading and banding
	std::string graphFile;
	std::vector<std::string> fastqFiles;
	size_t numThreads;
	size_t initialBandwidth;
	size_t rampBandwidth;
	bool dynamicRowStart;
	size_t maxCellsPerSlice;
	std::vector<std::string> seedFiles;
	// output files
	std::string outputGAMFile;
	std::string outputJSONFile;
	std::string outputGAFFile;
	std::string outputCorrectedFile;
	std::string outputCorrectedClippedFile;
	std::string IndexMpcFile;
	// verbosity and seeding
	bool verboseMode;
	bool shortVerboseMode;
	bool tryAllSeeds;
	bool highMemory;
	size_t mxmLength;
	size_t mumCount;
	size_t memCount;
	std::string seederCachePrefix;
	// alignment selection
	AlignmentSelection::SelectionMethod alignmentSelectionMethod;
	double selectionECutoff;
	bool forceGlobal;
	bool compressCorrected;
	bool compressClipped;
	bool preciseClipping;
	// minimizer seeding
	size_t minimizerLength;
	size_t minimizerWindowSize;
	double minimizerSeedDensity;
	size_t seedClusterMinSize;
	double minimizerDiscardMostNumerousFraction;
	double seedExtendDensity;
	bool nondeterministicOptimizations;
	bool optimalDijkstra;
	double preciseClippingIdentityCutoff;
	int Xdropcutoff;
	size_t DPRestartStride;
	bool cigarMatchMismatchMerge;

	// colinear chaining, path generation and graph statistics
	bool colinearChaining;
	bool generatePath;
	bool graphStatistics;
	long long generatePathSeed;
	long long colinearGap;
	long long colinearSplitLen;
	long long colinearSplitGap;
	double samplingStep;
	bool fastMode;
	
};
64 | 
// Runs the aligner with the given settings; `params` is taken by value (copied at the call).
// NOTE(review): defined outside this header — confirm side effects against the definition.
void alignReads(AlignerParams params);
// Modifies `alignment` in place using `graph` as a read-only lookup.
// NOTE(review): judging by the name it maps internal digraph node ids back to the original ids — confirm.
void replaceDigraphNodeIdsWithOriginalNodeIds(vg::Alignment& alignment, const AlignmentGraph& graph);
67 | 
68 | #endif
69 | 


--------------------------------------------------------------------------------
/src/AlignmentCorrectnessEstimation.cpp:
--------------------------------------------------------------------------------
  1 | #include <cmath>
  2 | #include <vector>
  3 | #include "AlignmentCorrectnessEstimation.h"
  4 | #include "ThreadReadAssertion.h"
  5 | 
//empirically from aligning one ONT to its correct position in the genome
//(fractions of a word: getCorrectLogOdds multiplies both by wordSize)
const double correctMean = 0.1875;
const double correctStddev = 0.0955;
//empirically from aligning one random read to a position
//(fractions of a word: getWrongLogOdds multiplies both by wordSize)
const double wrongMean = 0.5;
//NOTE(review): "wrond" is a typo for "wrong"; the name is referenced by getWrongLogOdds
const double wrondStddev = 0.0291;

//number of mismatch slots the precomputed tables are built for
const int wordSize = 64;

//log-probabilities of switching between / staying in the "false" and "correct" states
const double falseToCorrectTransitionLogProbability = log(0.00001); //10^-5. arbitrary.
const double falseToFalseTransitionLogProbability = log(1.0 - 0.00001);
const double correctToFalseTransitionLogProbability = log(0.0000000001); //10^-10. arbitrary.
const double correctToCorrectTransitionLogProbability = log(1.0 - 0.0000000001);
 19 | 
 20 | double stddistlog(double val, double mean, double stddev)
 21 | {
 22 | 	return -(val-mean)*(val-mean)/(2*stddev*stddev);
 23 | }
 24 | 
 25 | void normalize(std::vector<double>& logs)
 26 | {
 27 | 	double sum = 0;
 28 | 	for (auto x : logs)
 29 | 	{
 30 | 		sum += exp(x);
 31 | 	}
 32 | 	double add = log(1.0/sum);
 33 | 	for (auto& x : logs)
 34 | 	{
 35 | 		x += add;
 36 | 	}
 37 | }
 38 | 
 39 | std::vector<double> getCorrectLogOdds()
 40 | {
 41 | 	std::vector<double> result;
 42 | 	for (int i = 0; i <= wordSize/2; i++)
 43 | 	{
 44 | 		result.push_back(stddistlog(i, correctMean*wordSize, correctStddev*wordSize));
 45 | 	}
 46 | 	normalize(result);
 47 | 	for (int i = wordSize/2; i < wordSize; i++)
 48 | 	{
 49 | 		result.push_back(result.back());
 50 | 	}
 51 | 	return result;
 52 | }
 53 | 
 54 | std::vector<double> getWrongLogOdds()
 55 | {
 56 | 	std::vector<double> result;
 57 | 	for (int i = 0; i <= wordSize/2; i++)
 58 | 	{
 59 | 		result.push_back(stddistlog(i, wrongMean*wordSize, wrondStddev*wordSize));
 60 | 	}
 61 | 	normalize(result);
 62 | 	for (int i = wordSize/2; i < wordSize; i++)
 63 | 	{
 64 | 		result.push_back(result.back());
 65 | 	}
 66 | 	return result;
 67 | }
 68 | 
//lookup tables indexed by mismatch count, built once during static initialization and read by NextState
const std::vector<double> precomputedCorrectLogOdds = getCorrectLogOdds();
const std::vector<double> precomputedWrongLogOdds = getWrongLogOdds();
 71 | 
//initial state: log(0.8) for "correct", log(0.2) for "false", both trace flags cleared
AlignmentCorrectnessEstimationState::AlignmentCorrectnessEstimationState() :
correctLogOdds(log(0.8)), //80% arbitrarily
falseLogOdds(log(0.2)), //20% arbitrarily
correctFromCorrectTrace(false),
falseFromCorrectTrace(false)
{
}
 79 | 
//true when the "correct" score is strictly higher than the "false" score
bool AlignmentCorrectnessEstimationState::CurrentlyCorrect() const
{
	return correctLogOdds > falseLogOdds;
}

//trace flag set by NextState: the "correct" score of this state was reached from the previous "correct" state
bool AlignmentCorrectnessEstimationState::CorrectFromCorrect() const
{
	return correctFromCorrectTrace;
}

//trace flag set by NextState: the "false" score of this state was reached from the previous "correct" state
bool AlignmentCorrectnessEstimationState::FalseFromCorrect() const
{
	return falseFromCorrectTrace;
}

//accumulated log-score of the "correct" state
double AlignmentCorrectnessEstimationState::CorrectLogOdds() const
{
	return correctLogOdds;
}

//accumulated log-score of the "false" state
double AlignmentCorrectnessEstimationState::FalseLogOdds() const
{
	return falseLogOdds;
}
104 | 
105 | AlignmentCorrectnessEstimationState AlignmentCorrectnessEstimationState::NextState(int mismatches, int rowSize) const
106 | {
107 | 	assert(rowSize == 64);
108 | 	// assert(rowSize == 64 || rowSize == 1);
109 | 	assert(mismatches >= 0);
110 | 	AlignmentCorrectnessEstimationState result;
111 | 	result.correctFromCorrectTrace = correctLogOdds + correctToCorrectTransitionLogProbability >= falseLogOdds + falseToCorrectTransitionLogProbability;
112 | 	result.falseFromCorrectTrace = correctLogOdds + correctToFalseTransitionLogProbability >= falseLogOdds + falseToFalseTransitionLogProbability;
113 | 	double newCorrectProbability = std::max(correctLogOdds + correctToCorrectTransitionLogProbability, falseLogOdds + falseToCorrectTransitionLogProbability);
114 | 	double newFalseProbability = std::max(correctLogOdds + correctToFalseTransitionLogProbability, falseLogOdds + falseToFalseTransitionLogProbability);
115 | 	assert(precomputedCorrectLogOdds.size() == precomputedWrongLogOdds.size());
116 | 	if ((size_t)mismatches < precomputedCorrectLogOdds.size())
117 | 	{
118 | 		newCorrectProbability += precomputedCorrectLogOdds[(size_t)mismatches];
119 | 		newFalseProbability += precomputedWrongLogOdds[(size_t)mismatches];
120 | 	}
121 | 	else
122 | 	{
123 | 		newCorrectProbability += precomputedCorrectLogOdds.back();
124 | 		newFalseProbability += precomputedWrongLogOdds.back();
125 | 	}
126 | 	result.correctLogOdds = newCorrectProbability;
127 | 	result.falseLogOdds = newFalseProbability;
128 | 	return result;
129 | }
130 | 


--------------------------------------------------------------------------------
/src/AlignmentCorrectnessEstimation.h:
--------------------------------------------------------------------------------
 1 | #ifndef AlignmentCorrectnessEstimation_h
 2 | #define AlignmentCorrectnessEstimation_h
 3 | 
// Tracks two running log-scores for an alignment, one for a "correct" state and
// one for a "false" state. NextState() produces the successor state from a
// mismatch count; the object itself is never modified after construction.
class AlignmentCorrectnessEstimationState
{
public:
	// Starts with log(0.8) for "correct", log(0.2) for "false" and both trace flags false.
	AlignmentCorrectnessEstimationState();
	// True when the "correct" score is strictly greater than the "false" score.
	bool CurrentlyCorrect() const;
	// True when this state's "correct" score came from the previous "correct" state.
	bool CorrectFromCorrect() const;
	// True when this state's "false" score came from the previous "correct" state.
	bool FalseFromCorrect() const;
	// Current log-score of the "correct" state.
	double CorrectLogOdds() const;
	// Current log-score of the "false" state.
	double FalseLogOdds() const;
	// Returns the successor state for a row with `mismatches` mismatches.
	// The implementation asserts rowSize == 64 and mismatches >= 0.
	AlignmentCorrectnessEstimationState NextState(int mismatches, int rowSize) const;
private:
	double correctLogOdds;
	double falseLogOdds;
	bool correctFromCorrectTrace;
	bool falseFromCorrectTrace;
};
20 | 
21 | #endif
22 | 


--------------------------------------------------------------------------------
/src/AlignmentGraph.h:
--------------------------------------------------------------------------------
  1 | #ifndef AlignmentGraph_h
  2 | #define AlignmentGraph_h
  3 | 
  4 | #include <functional>
  5 | #include <vector>
  6 | #include <set>
  7 | #include <unordered_map>
  8 | #include <unordered_set>
  9 | #include <tuple>
 10 | #include <phmap.h>
 11 | #include "ThreadReadAssertion.h"
 12 | 
 13 | 
// Graph representation used by the aligner classes (all of which are friends
// and read the private containers directly). Member functions are defined
// outside this header.
// NOTE(review): descriptions of individual members below are limited to what
// this header shows; semantics inferred from names are marked as such.
class AlignmentGraph
{
public:
	//determines extra band size, shouldn't be too high because of extra slices
	//should be 0 mod (wordsize/2 == 32), otherwise storage has overhead
	//64 is the fastest out of 32, 64, 96
	static constexpr int SPLIT_NODE_SIZE = 64;
	//half the bit count of a size_t, i.e. two bits per base pair in one chunk
	static constexpr size_t BP_IN_CHUNK = sizeof(size_t) * 8 / 2;
	//number of chunks needed for SPLIT_NODE_SIZE base pairs (rounded up)
	static constexpr size_t CHUNKS_IN_NODE = (SPLIT_NODE_SIZE + BP_IN_CHUNK - 1) / BP_IN_CHUNK;

	//fixed-size array of CHUNKS_IN_NODE words holding one node's sequence
	struct NodeChunkSequence
	{
		size_t& operator[](size_t pos)
		{
			return s[pos];
		}
		size_t operator[](size_t pos) const
		{
			return s[pos];
		}
		size_t s[CHUNKS_IN_NODE];
	};
	//one word per nucleotide (A, T, C, G); requires SPLIT_NODE_SIZE to equal the bit count of a size_t
	//NOTE(review): presumably bit i of a word says whether position i may be that nucleotide — confirm
	struct AmbiguousChunkSequence
	{
		static_assert(SPLIT_NODE_SIZE == sizeof(size_t)*8);
		//weird interface because it should behave like NodeChunkSequence, which is just a number
		//returns a copy with all four words shifted right by pos * BP_IN_CHUNK bits
		AmbiguousChunkSequence operator[](size_t pos) const
		{
			AmbiguousChunkSequence result = *this;
			result.A >>= pos * BP_IN_CHUNK;
			result.C >>= pos * BP_IN_CHUNK;
			result.G >>= pos * BP_IN_CHUNK;
			result.T >>= pos * BP_IN_CHUNK;
			return result;
		}
		//weird interface because it should behave like NodeChunkSequence, which is just a number
		//shifts by amount/2 bits (amount must be even); returns a copy of *this, not a reference
		AmbiguousChunkSequence operator>>=(size_t amount)
		{
			assert(amount % 2 == 0);
			A >>= amount / 2;
			T >>= amount / 2;
			C >>= amount / 2;
			G >>= amount / 2;
			return *this;
		}
		//weird interface because it should behave like NodeChunkSequence, which is just a number
		//the mask argument is ignored; an unmodified copy is returned
		AmbiguousChunkSequence operator&(size_t val)
		{
			return *this;
		}
		size_t A;
		size_t T;
		size_t C;
		size_t G;
	};

	//a (node, offset within node, position in read sequence) triple
	struct MatrixPosition
	{
		MatrixPosition(size_t node, size_t nodeOffset, size_t seqPos);
		bool operator==(const MatrixPosition& other) const;
		bool operator!=(const MatrixPosition& other) const;
		size_t node;
		size_t nodeOffset;
		size_t seqPos;
	};

	//a seed: position in the read plus a (node id, position in node) pair
	class SeedHit
	{
	public:
		SeedHit(size_t seqPos, int nodeId, size_t nodePos) : sequencePosition(seqPos), nodeId(nodeId), nodePos(nodePos) {};
		size_t sequencePosition;
		int nodeId;
		size_t nodePos;
	};
	//construction and building
	AlignmentGraph();
	void ReserveNodes(size_t numNodes, size_t numSplitNodes);
	void AddNode(int nodeId, const std::string& sequence, const std::string& name, bool reverseNode, const std::vector<size_t>& breakpoints);
	void AddEdgeNodeId(int node_id_from, int node_id_to, size_t startOffset);
	void Finalize(int wordSize);
	//read-only queries
	AlignmentGraph GetSubgraph(const std::unordered_map<size_t, size_t>& nodeMapping) const;
	std::pair<int, size_t> GetReversePosition(int nodeId, size_t offset) const;
	size_t GetReverseNode(size_t node) const;
	size_t NodeSize() const;
	size_t SizeInBP() const;
	size_t NodeOffset(size_t node) const;
	size_t NodeID(size_t node) const;
	size_t NodeLength(size_t nodeIndex) const;
	char NodeSequences(size_t node, size_t offset) const;
	NodeChunkSequence NodeChunks(size_t node) const;
	AmbiguousChunkSequence AmbiguousNodeChunks(size_t node) const;
	size_t GetUnitigNode(int nodeId, size_t offset) const;
	// size_t MinDistance(size_t pos, const std::vector<size_t>& targets) const;
	// std::set<size_t> ProjectForward(const std::set<size_t>& startpositions, size_t amount) const;
	std::string OriginalNodeName(int nodeId) const;
	size_t ComponentSize() const;
	static AlignmentGraph DummyGraph();
	size_t getDBGoverlap() const;

	//input of colinearChaining: a path of nodes plus two coordinates
	//NOTE(review): presumably x and y delimit the read interval covered by the anchor — confirm
	struct Anchor {
		std::vector<size_t> path;
		size_t x, y;
	};
	//NOTE(review): "MPC" presumably stands for minimum path cover (cf. checkMinPathCover below) — confirm
	void buildMPC();
	void buildComponentsMap();
	void loadMPC(const std::string &filename);
	void saveMPC(const std::string &filename);
	std::vector<size_t> generatePath(const std::string &seq_out, const std::string &path_out, const size_t seed = 0);
	std::vector<size_t> colinearChaining(const std::vector<Anchor> &anchors, long long sep_limit) const;
	std::vector<size_t> getChainPath(size_t s, size_t t, long long sep_limit) const;

private:
	//helpers used while building/finalizing the graph
	void fixChainApproxPos(const size_t start);
	std::pair<bool, size_t> findBubble(const size_t start, const std::vector<bool>& ignorableTip);
	void chainBubble(const size_t start, const std::vector<bool>& ignorableTip, std::vector<size_t>& rank);
	phmap::flat_hash_map<size_t, std::unordered_set<size_t>> chainTips(std::vector<size_t>& rank, std::vector<bool>& ignorableTip);
	void chainCycles(std::vector<size_t>& rank, std::vector<bool>& ignorableTip);
	void findChains();
	void findLinearizable();
	void AddNode(int nodeId, int offset, const std::string& sequence, bool reverseNode);
	void RenumberAmbiguousToEnd();
	void doComponentOrder();

	//helpers for the path cover index and per-component chaining
	std::vector<std::vector<size_t>> greedyCover(size_t cid) const;
	std::vector<std::vector<size_t>> shrink(size_t cid, const std::vector<std::vector<size_t>> &pc);
	void computeMPCIndex(size_t cid, const std::vector<std::vector<size_t>> &pc);
	bool checkMinPathCover(const std::vector<std::vector<size_t>> &pc);
	std::pair<std::vector<size_t>, size_t> colinearChainingByComponent(size_t cid, const std::vector<Anchor> &anchors, const std::vector<size_t> &aids, long long sep_limit) const;
	



	//per-node data
	std::vector<size_t> nodeLength;
	std::unordered_map<int, std::vector<size_t>> nodeLookup;
	std::unordered_map<int, size_t> originalNodeSize;
	std::unordered_map<int, std::string> originalNodeName;
	std::vector<size_t> nodeOffset;
	std::vector<int> nodeIDs;
	std::vector<std::vector<size_t>> inNeighbors;
	std::vector<std::vector<size_t>> outNeighbors;
	std::vector<bool> reverse;
	std::vector<bool> linearizable;
	std::vector<NodeChunkSequence> nodeSequences;
	size_t bpSize;
	std::vector<AmbiguousChunkSequence> ambiguousNodeSequences;
	std::vector<bool> ambiguousNodes;
	std::vector<size_t> componentNumber;
	std::vector<size_t> chainNumber;
	std::vector<size_t> chainApproxPos;
	size_t firstAmbiguous;
	size_t DBGoverlap;
	bool finalized;

	//component bookkeeping
	std::vector<size_t> component_map;
	std::vector<size_t> component_idx;
	std::vector<std::vector<size_t>> component_ids;

	//per-component data of the path cover index
	std::vector<std::vector<size_t>> topo, topo_ids;
	std::vector<std::vector<std::vector<size_t>>> mpc, paths;
	std::vector<std::vector<std::vector<std::pair<size_t, size_t>>>> forwards, backwards;
	// std::vector<std::vector<size_t>> backwards;

	//classes that access the private members directly
	template <typename LengthType, typename ScoreType, typename Word>
	friend class GraphAligner;
	template <typename LengthType, typename ScoreType, typename Word>
	friend class GraphAlignerVGAlignment;
	template <typename LengthType, typename ScoreType, typename Word>
	friend class GraphAlignerGAFAlignment;
	template <typename LengthType, typename ScoreType, typename Word>
	friend class GraphAlignerBitvectorBanded;
	template <typename LengthType, typename ScoreType, typename Word>
	friend class GraphAlignerBitvectorCommon;
	template <typename LengthType, typename ScoreType, typename Word>
	friend class GraphAlignerBitvectorDijkstra;
	friend class DirectedGraph;
	friend class MinimizerSeeder;
};
190 | 
191 | 
192 | #endif
193 | 


--------------------------------------------------------------------------------
/src/AlignmentSelection.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | #include <cstdint>
 3 | #include <cmath>
 4 | #include "AlignmentSelection.h"
 5 | #include "EValue.h"
 6 | 
//an overlap which is larger than the fraction cutoff of the smaller alignment means the alignments are incompatible
//eg alignments 12000bp and 15000bp: an overlap of more than 12000*0.05 = 600bp means they are incompatible
const float OverlapIncompatibleFractionCutoff = 0.05;
10 | 
11 | namespace AlignmentSelection
12 | {
13 | 	bool alignmentIncompatible(const AlignmentResult::AlignmentItem& left, const AlignmentResult::AlignmentItem& right)
14 | 	{
15 | 		auto minOverlapLen = std::min((left.alignmentEnd - left.alignmentStart), (right.alignmentEnd - right.alignmentStart)) * OverlapIncompatibleFractionCutoff;
16 | 		assert(left.alignmentStart >= 0);
17 | 		assert(right.alignmentStart >= 0);
18 | 		size_t leftStart = left.alignmentStart;
19 | 		size_t leftEnd = left.alignmentEnd;
20 | 		size_t rightStart = right.alignmentStart;
21 | 		size_t rightEnd = right.alignmentEnd;
22 | 		if (leftStart > rightStart)
23 | 		{
24 | 			std::swap(leftStart, rightStart);
25 | 			std::swap(leftEnd, rightEnd);
26 | 		}
27 | 		int overlap = 0;
28 | 		assert(leftStart <= rightStart);
29 | 		if (leftEnd > rightStart) overlap = leftEnd - rightStart;
30 | 		return overlap > minOverlapLen;
31 | 	}
32 | 
33 | 	//lower E-value is better
34 | 	bool alignmentECompare(const AlignmentResult::AlignmentItem& left, const AlignmentResult::AlignmentItem& right, size_t m, size_t n, const EValueCalculator& EValueCalc)
35 | 	{
36 | 		return EValueCalc.getEValue(m, n, left.alignmentLength(), left.alignmentScore) < EValueCalc.getEValue(m, n, right.alignmentLength(), right.alignmentScore);
37 | 	}
38 | 
39 | 	bool alignmentScoreCompare(const AlignmentResult::AlignmentItem& left, const AlignmentResult::AlignmentItem& right, const EValueCalculator& EValueCalc)
40 | 	{
41 | 		return EValueCalc.getAlignmentScore(left.alignmentLength(), left.alignmentScore) > EValueCalc.getAlignmentScore(right.alignmentLength(), right.alignmentScore);
42 | 	}
43 | 
44 | 	//longer is better, after that lower score is better
45 | 	bool alignmentLengthCompare(const AlignmentResult::AlignmentItem& left, const AlignmentResult::AlignmentItem& right)
46 | 	{
47 | 		if ((left.alignmentEnd - left.alignmentStart) > (right.alignmentEnd - right.alignmentStart)) return true;
48 | 		if ((right.alignmentEnd - right.alignmentStart) > (left.alignmentEnd - left.alignmentStart)) return false;
49 | 		if (left.alignmentScore < right.alignmentScore) return true;
50 | 		return false;
51 | 	}
52 | 
53 | 	std::vector<AlignmentResult::AlignmentItem> SelectAlignments(const std::vector<AlignmentResult::AlignmentItem>& allAlignments, SelectionOptions options)
54 | 	{
55 | 		// roundabout to fit the signature of const ref while allowing filtering
56 | 		std::vector<AlignmentResult::AlignmentItem> filteredByE;
57 | 		if (options.ECutoff != -1)
58 | 		{
59 | 			filteredByE = SelectECutoff(allAlignments, options.graphSize, options.readSize, options.ECutoff, options.EValueCalc);
60 | 		}
61 | 		const std::vector<AlignmentResult::AlignmentItem>& alignments { (options.ECutoff != -1) ? filteredByE : allAlignments };
62 | 		switch(options.method)
63 | 		{
64 | 			case GreedyLength:
65 | 				return GreedySelectAlignments(alignments, alignmentLengthCompare);
66 | 			case GreedyScore:
67 | 				return GreedySelectAlignments(alignments, [options](const AlignmentResult::AlignmentItem& left, const AlignmentResult::AlignmentItem& right) { return alignmentScoreCompare(left, right, options.EValueCalc); });
68 | 			case GreedyE:
69 | 				return GreedySelectAlignments(alignments, [options](const AlignmentResult::AlignmentItem& left, const AlignmentResult::AlignmentItem& right) {return alignmentECompare(left, right, options.graphSize, options.readSize, options.EValueCalc); });
70 | 			case ScheduleInverseESum:
71 | 				return ScheduleSelectAlignments(alignments, [options](const AlignmentResult::AlignmentItem aln) { return 1.0 / options.EValueCalc.getEValue(options.graphSize, options.readSize, aln.alignmentLength(), aln.alignmentScore); });
72 | 			case ScheduleInverseEProduct:
73 | 				return ScheduleSelectAlignments(alignments, [options](const AlignmentResult::AlignmentItem aln) { return -log(options.EValueCalc.getEValue(options.graphSize, options.readSize, aln.alignmentLength(), aln.alignmentScore)); });
74 | 			case ScheduleScore:
75 | 				return ScheduleSelectAlignments(alignments, [options](const AlignmentResult::AlignmentItem aln) { return options.EValueCalc.getAlignmentScore(aln.alignmentLength(), aln.alignmentScore); });
76 | 			case ScheduleLength:
77 | 				return ScheduleSelectAlignments(alignments, [](const AlignmentResult::AlignmentItem aln) { return (aln.alignmentEnd - aln.alignmentStart) + 0.5 - 0.5 / (aln.alignmentScore); });
78 | 			default:
79 | 			case All:
80 | 				return alignments;
81 | 		}
82 | 		assert(false);
83 | 		return alignments;
84 | 	}
85 | 
86 | 	std::vector<AlignmentResult::AlignmentItem> SelectECutoff(const std::vector<AlignmentResult::AlignmentItem>& alignments, size_t m, size_t n, double cutoff, const EValueCalculator& EValueCalc)
87 | 	{
88 | 		std::vector<AlignmentResult::AlignmentItem> result;
89 | 		for (size_t i = 0; i < alignments.size(); i++)
90 | 		{
91 | 			if (EValueCalc.getEValue(m, n, alignments[i].alignmentLength(), alignments[i].alignmentScore) <= cutoff) result.push_back(alignments[i]);
92 | 		}
93 | 		return result;
94 | 	}
95 | 
96 | }


--------------------------------------------------------------------------------
/src/AlignmentSelection.h:
--------------------------------------------------------------------------------
  1 | #ifndef AlignmentSelection_h
  2 | #define AlignmentSelection_h
  3 | 
  4 | #include <cmath>
  5 | #include <cstdint>
  6 | #include <vector>
  7 | #include <functional>
  8 | #include "vg.pb.h"
  9 | #include "GraphAlignerCommon.h"
 10 | #include "EValue.h"
 11 | 
 12 | namespace AlignmentSelection
 13 | {
 14 | 	enum SelectionMethod
 15 | 	{
 16 | 		GreedyLength,
 17 | 		GreedyScore,
 18 | 		GreedyE,
 19 | 		ScheduleInverseESum,
 20 | 		ScheduleInverseEProduct,
 21 | 		ScheduleScore,
 22 | 		ScheduleLength,
 23 | 		All
 24 | 	};
 25 | 	struct SelectionOptions
 26 | 	{
 27 | 		SelectionMethod method;
 28 | 		size_t graphSize;
 29 | 		size_t readSize;
 30 | 		double ECutoff;
 31 | 		EValueCalculator EValueCalc;
 32 | 	};
 33 | 	std::vector<AlignmentResult::AlignmentItem> SelectAlignments(const std::vector<AlignmentResult::AlignmentItem>& alignments, SelectionOptions options);
 34 | 	bool alignmentIncompatible(const AlignmentResult::AlignmentItem& left, const AlignmentResult::AlignmentItem& right);
 35 | 
 36 | 	template <typename AlnScorer>
 37 | 	std::vector<AlignmentResult::AlignmentItem> GreedySelectAlignments(const std::vector<AlignmentResult::AlignmentItem>& alignments, AlnScorer alnScorer)
 38 | 	{
 39 | 		std::vector<size_t> items;
 40 | 		for (size_t i = 0; i < alignments.size(); i++)
 41 | 		{
 42 | 			items.push_back(i);
 43 | 		}
 44 | 		std::sort(items.begin(), items.end(), [&alignments, alnScorer](size_t left, size_t right) { return alnScorer(alignments[left], alignments[right]); });
 45 | 		std::vector<AlignmentResult::AlignmentItem> result;
 46 | 		for (auto i : items)
 47 | 		{
 48 | 			if (!std::any_of(result.begin(), result.end(), [&alignments, i](const AlignmentResult::AlignmentItem& existing) { return alignmentIncompatible(existing, alignments[i]); }))
 49 | 			{
 50 | 				result.push_back(alignments[i]);
 51 | 			}
 52 | 		}
 53 | 		return result;
 54 | 	}
 55 | 
 56 | 	template <typename AlnScorer>
 57 | 	std::vector<AlignmentResult::AlignmentItem> ScheduleSelectAlignments(const std::vector<AlignmentResult::AlignmentItem>& alignments, AlnScorer alnScorer)
 58 | 	{
 59 | 		std::vector<size_t> items;
 60 | 		for (size_t i = 0; i < alignments.size(); i++)
 61 | 		{
 62 | 			items.push_back(i);
 63 | 		}
 64 | 		std::sort(items.begin(), items.end(), [&alignments](size_t left, size_t right) { return alignments[left].alignmentEnd < alignments[right].alignmentEnd; });
 65 | 		std::vector<size_t> backtrace;
 66 | 		std::vector<double> score;
 67 | 		backtrace.resize(items.size(), std::numeric_limits<size_t>::max());
 68 | 		score.resize(items.size(), 0);
 69 | 		for (size_t i = 0; i < items.size(); i++)
 70 | 		{
 71 | 			double rawScore = alnScorer(alignments[items[i]]);
 72 | 			score[i] = rawScore;
 73 | 			for (size_t j = 0; j < i; j++)
 74 | 			{
 75 | 				if (alignmentIncompatible(alignments[items[i]], alignments[items[j]])) continue;
 76 | 				if (score[j] + rawScore > score[i])
 77 | 				{
 78 | 					backtrace[i] = j;
 79 | 					score[i] = score[j] + rawScore;
 80 | 				}
 81 | 			}
 82 | 		}
 83 | 		size_t maxPos = 0;
 84 | 		for (size_t i = 0; i < items.size(); i++)
 85 | 		{
 86 | 			if (score[i] > score[maxPos]) maxPos = i;
 87 | 		}
 88 | 		std::vector<AlignmentResult::AlignmentItem> result;
 89 | 		while (maxPos != std::numeric_limits<size_t>::max())
 90 | 		{
 91 | 			result.push_back(alignments[items[maxPos]]);
 92 | 			assert(backtrace[maxPos] < maxPos || backtrace[maxPos] == std::numeric_limits<size_t>::max());
 93 | 			maxPos = backtrace[maxPos];
 94 | 		}
 95 | 		return result;
 96 | 	}
 97 | 
	// Defined outside this header.
	// NOTE(review): presumably filters alignments by E-value against `cutoff`, with m and n
	// being the sizes fed to EValueCalc -- confirm in the .cpp.
	std::vector<AlignmentResult::AlignmentItem> SelectECutoff(const std::vector<AlignmentResult::AlignmentItem>& alignments, size_t m, size_t n, double cutoff, const EValueCalculator& EValueCalc);
 99 | 
100 | };
101 | 
102 | #endif
103 | 


--------------------------------------------------------------------------------
/src/AlignmentSubsequenceIdentity.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <algorithm>
  3 | #include <iostream>
  4 | #include <tuple>
  5 | #include "CommonUtils.h"
  6 | #include "GfaGraph.h"
  7 | #include "fastqloader.h"
  8 | 
// Set by main() when the optional fourth argument starts with '1'. When true,
// getAlignmentIdentity() reports 1 as the identity instead of dividing the matched
// length by the read length.
bool fakeLengths = false;
 10 | 
 11 | namespace std 
 12 | {
 13 | 	template <> 
 14 | 	struct hash<std::pair<size_t, size_t>>
 15 | 	{
 16 | 		size_t operator()(const std::pair<size_t, size_t>& x) const
 17 | 		{
 18 | 			return hash<size_t>()(x.first) ^ hash<size_t>()(x.second);
 19 | 		}
 20 | 	};
 21 | }
 22 | 
 23 | struct Node
 24 | {
 25 | 	int nodeId;
 26 | 	bool reverse;
 27 | 	bool operator==(const Node& other) const
 28 | 	{
 29 | 		return nodeId == other.nodeId && reverse == other.reverse;
 30 | 	}
 31 | };
 32 | 
// Simplified alignment: the sequence of oriented nodes it visits and one length per step.
struct Alignment
{
	// visited nodes, in path order
	std::vector<Node> path;
	// length[i] belongs to path[i]; filled from the first edit's to_length by convertVGtoAlignment
	std::vector<size_t> length;
	// name copied from the source alignment
	std::string name;
};
 39 | 
 40 | Alignment convertVGtoAlignment(const vg::Alignment& vgAln)
 41 | {
 42 | 	Alignment result;
 43 | 	result.name = vgAln.name();
 44 | 	for (int i = 0; i < vgAln.path().mapping_size(); i++)
 45 | 	{
 46 | 		result.path.emplace_back();
 47 | 		result.path.back().nodeId = vgAln.path().mapping(i).position().node_id();
 48 | 		result.path.back().reverse = vgAln.path().mapping(i).position().is_reverse();
 49 | 		result.length.emplace_back(vgAln.path().mapping(i).edit(0).to_length());
 50 | 	}
 51 | 	return result;
 52 | }
 53 | 
 54 | Alignment reverse(const Alignment& old)
 55 | {
 56 | 	Alignment result;
 57 | 	result.name = old.name;
 58 | 	for (size_t i = 0; i < old.path.size(); i++)
 59 | 	{
 60 | 		result.path.emplace_back();
 61 | 		result.path.back().nodeId = old.path[i].nodeId;
 62 | 		result.path.back().reverse = !old.path[i].reverse;
 63 | 	}
 64 | 	result.length = old.length;
 65 | 	std::reverse(result.path.begin(), result.path.end());
 66 | 	std::reverse(result.length.begin(), result.length.end());
 67 | 	return result;
 68 | }
 69 | 
 70 | std::pair<double, size_t> getAlignmentIdentity(const Alignment& read, const Alignment& transcript, const std::unordered_map<std::string, size_t>& readLengths)
 71 | {
 72 | 	std::vector<std::vector<size_t>> matchLen;
 73 | 	matchLen.resize(read.path.size()+1);
 74 | 	for (size_t i = 0; i < read.path.size()+1; i++)
 75 | 	{
 76 | 		matchLen[i].resize(transcript.path.size()+1, 0);
 77 | 	}
 78 | 	size_t maxMatch = 0;
 79 | 	size_t maxmatchIndex = 0;
 80 | 	size_t maxmatchMissing = 0;
 81 | 	for (size_t i = 0; i < read.path.size(); i++)
 82 | 	{
 83 | 		for (size_t j = 0; j < transcript.path.size(); j++)
 84 | 		{
 85 | 			matchLen[i+1][j+1] = std::max(matchLen[i+1][j], matchLen[i][j+1]);
 86 | 			if (read.path[i] == transcript.path[j])
 87 | 			{
 88 | 				matchLen[i+1][j+1] = std::max(matchLen[i+1][j+1], matchLen[i][j] + std::min(read.length[i], transcript.length[j]));
 89 | 			}
 90 | 			else
 91 | 			{
 92 | 				matchLen[i+1][j+1] = std::max(matchLen[i+1][j+1], matchLen[i][j]);
 93 | 			}
 94 | 			if (matchLen[i+1][j+1] > maxMatch)
 95 | 			{
 96 | 				maxMatch = matchLen[i+1][j+1];
 97 | 				maxmatchIndex = j;
 98 | 				maxmatchMissing = 0;
 99 | 				if (read.length[i] < transcript.length[j]) maxmatchMissing = transcript.length[j] - read.length[i];
100 | 			}
101 | 		}
102 | 	}
103 | 	size_t threeprimeDistance = maxmatchMissing;
104 | 	for (size_t i = maxmatchIndex+1; i < transcript.length.size(); i++)
105 | 	{
106 | 		threeprimeDistance += transcript.length[i];
107 | 	}
108 | 	assert(maxMatch >= 0);
109 | 	double length;
110 | 	if (fakeLengths)
111 | 	{
112 | 		length = 1;
113 | 	}
114 | 	else
115 | 	{
116 | 		assert(readLengths.count(read.name) == 1);
117 | 		assert(maxMatch <= readLengths.at(read.name));
118 | 		length = (double)maxMatch / (double)readLengths.at(read.name);
119 | 	}
120 | 	return std::make_pair(length, threeprimeDistance);
121 | }
122 | 
123 | int main(int argc, char** argv)
124 | {
125 | 	std::string transcriptFile { argv[1] };
126 | 	std::string readAlignmentFile { argv[2] };
127 | 	std::string readFastaFile { argv[3] };
128 | 
129 | 	if (argc >= 5 && argv[4][0] == '1') fakeLengths = true;
130 | 
131 | 	std::unordered_map<std::string, size_t> readLengths;
132 | 	{
133 | 		auto reads = loadFastqFromFile(readFastaFile);
134 | 		for (auto read : reads)
135 | 		{
136 | 			readLengths[read.seq_id] = read.sequence.size();
137 | 		}
138 | 	}
139 | 
140 | 	std::vector<Alignment> transcripts;
141 | 	std::vector<Alignment> reads;
142 | 	{
143 | 		auto vgtranscripts = CommonUtils::LoadVGAlignments(transcriptFile);
144 | 		for (auto vg : vgtranscripts)
145 | 		{
146 | 			transcripts.push_back(convertVGtoAlignment(vg));
147 | 		}
148 | 	}
149 | 	{
150 | 		auto vgreads = CommonUtils::LoadVGAlignments(readAlignmentFile);
151 | 		for (auto vg : vgreads)
152 | 		{
153 | 			reads.push_back(convertVGtoAlignment(vg));
154 | 		}
155 | 	}
156 | 
157 | 	std::unordered_map<int, std::vector<size_t>> transcriptsCrossingNode;
158 | 	for (size_t i = 0; i < transcripts.size(); i++)
159 | 	{
160 | 		for (int j = 0; j < transcripts[i].path.size(); j++)
161 | 		{
162 | 			transcriptsCrossingNode[transcripts[i].path[j].nodeId].push_back(i);
163 | 		}
164 | 	}
165 | 
166 | 	std::unordered_map<std::pair<size_t, size_t>, std::pair<double, size_t>> readTranscriptBestPair;
167 | 
168 | 	for (size_t readi = 0; readi < reads.size(); readi++)
169 | 	{
170 | 		auto read = reads[readi];
171 | 		std::set<size_t> possibleTranscripts;
172 | 		for (size_t i = 0; i < read.path.size(); i++)
173 | 		{
174 | 			possibleTranscripts.insert(transcriptsCrossingNode[read.path[i].nodeId].begin(), transcriptsCrossingNode[read.path[i].nodeId].end());
175 | 		}
176 | 		auto reverseread = reverse(read);
177 | 		for (auto i : possibleTranscripts)
178 | 		{
179 | 			auto identityFw = getAlignmentIdentity(read, transcripts[i], readLengths);
180 | 			auto identityBw = getAlignmentIdentity(reverseread, transcripts[i], readLengths);
181 | 			auto bigger = identityFw;
182 | 			if (identityBw.first > identityFw.first) bigger = identityBw;
183 | 			if (bigger.first > 0 && (readTranscriptBestPair.count(std::make_pair(readi, i)) == 0 || readTranscriptBestPair[std::make_pair(readi, i)].first < bigger.first))
184 | 			{
185 | 				readTranscriptBestPair[std::make_pair(readi, i)] = bigger;
186 | 			}
187 | 		}
188 | 	}
189 | 	for (auto mapping : readTranscriptBestPair)
190 | 	{
191 | 		std::cout << reads[mapping.first.first].name << "\t" << transcripts[mapping.first.second].name << "\t" << mapping.second.first << "\t" << mapping.second.second << std::endl;
192 | 	}
193 | }
194 | 


--------------------------------------------------------------------------------
/src/ArrayPriorityQueue.h:
--------------------------------------------------------------------------------
  1 | #ifndef ArrayPriorityQueue_h
  2 | #define ArrayPriorityQueue_h
  3 | 
  4 | #include <queue>
  5 | #include <phmap.h>
  6 | #include "ThreadReadAssertion.h"
  7 | 
  8 | template <typename T, bool SparseStorage>
  9 | class ArrayPriorityQueue
 10 | {
 11 | public:
 12 | 	constexpr bool IsComponentPriorityQueue() { return false; }
 13 | 	ArrayPriorityQueue(size_t maxPriority, size_t maxExtras) :
 14 | 	activeQueues(),
 15 | 	extras(),
 16 | 	queues(),
 17 | 	numItems(0)
 18 | 	{
 19 | 		initialize(maxPriority, maxExtras);
 20 | 	}
 21 | 	ArrayPriorityQueue() :
 22 | 	activeQueues(),
 23 | 	extras(),
 24 | 	queues(),
 25 | 	numItems(0)
 26 | 	{
 27 | 	}
 28 | 	template <bool Sparse = SparseStorage>
 29 | 	typename std::enable_if<Sparse>::type initialize(size_t maxPriority, size_t maxExtras)
 30 | 	{
 31 | 		queues.resize(maxPriority);
 32 | 	}
 33 | 	template <bool Sparse = SparseStorage>
 34 | 	typename std::enable_if<!Sparse>::type initialize(size_t maxPriority, size_t maxExtras)
 35 | 	{
 36 | 		extras.resize(maxExtras, std::vector<T>{});
 37 | 		queues.resize(maxPriority);
 38 | 	}
 39 | #ifdef NDEBUG
 40 | 	__attribute__((always_inline))
 41 | #endif
 42 | 	T& top()
 43 | 	{
 44 | 		assert(activeQueues.size() > 0);
 45 | 		size_t queue = activeQueues.top();
 46 | 		assert(queues[queue].size() > 0);
 47 | 		return queues[queue].back();
 48 | 	}
 49 | #ifdef NDEBUG
 50 | 	__attribute__((always_inline))
 51 | #endif
 52 | 	void pop()
 53 | 	{
 54 | 		size_t queue = activeQueues.top();
 55 | 		assert(queues[queue].size() > 0);
 56 | 		queues[queue].pop_back();
 57 | 		if (queues[queue].size() == 0) activeQueues.pop();
 58 | 		numItems--;
 59 | 	}
 60 | #ifdef NDEBUG
 61 | 	__attribute__((always_inline))
 62 | #endif
 63 | 	size_t size() const
 64 | 	{
 65 | 		return numItems;
 66 | 	}
 67 | 	void insert(size_t component, int score, const T& item)
 68 | 	{
 69 | 		assert(false);
 70 | 	}
 71 | #ifdef NDEBUG
 72 | 	__attribute__((always_inline))
 73 | #endif
 74 | 	void insert(size_t priority, const T& item)
 75 | 	{
 76 | 		assert(priority < queues.size());
 77 | 		queues[priority].push_back(item);
 78 | 		assert(SparseStorage || getId(item) < extras.size());
 79 | 		extras[getId(item)].push_back(item);
 80 | 		if (queues[priority].size() == 1) activeQueues.emplace(priority);
 81 | 		numItems++;
 82 | 	}
 83 | 	void clear()
 84 | 	{
 85 | 		while (activeQueues.size() > 0)
 86 | 		{
 87 | 			size_t queue = activeQueues.top();
 88 | 			for (auto item : queues[queue])
 89 | 			{
 90 | 				removeExtras(getId(item));
 91 | 			}
 92 | 			queues[queue].clear();
 93 | 			activeQueues.pop();
 94 | 		}
 95 | 		numItems = 0;
 96 | 		sparsify();
 97 | 	}
 98 | 
 99 | 	template<bool Sparse = SparseStorage>
100 | 	typename std::enable_if<Sparse>::type sparsify()
101 | 	{
102 | 		decltype(extras) empty;
103 | 		std::swap(extras, empty);
104 | 	}
105 | 	template<bool Sparse = SparseStorage>
106 | 	typename std::enable_if<!Sparse>::type sparsify()
107 | 	{
108 | 	}
109 | 	const std::vector<T>& getExtras(size_t index)
110 | 	{
111 | 		assert(SparseStorage || index < extras.size());
112 | 		return getVec(extras, index);
113 | 	}
114 | 	void removeExtras(size_t index)
115 | 	{
116 | 		assert(SparseStorage || index < extras.size());
117 | 		extras[index].clear();
118 | 	}
119 | 	size_t extraSize(size_t index) const
120 | 	{
121 | 		assert(SparseStorage || index < extras.size());
122 | 		return getVec(extras, index).size();
123 | 	}
124 | private:
125 | 	const std::vector<T>& getVec(const std::vector<std::vector<T>>& list, size_t index) const
126 | 	{
127 | 		return list[index];
128 | 	}
129 | 	const std::vector<T>& getVec(const phmap::flat_hash_map<size_t, std::vector<T>>& list, size_t index) const
130 | 	{
131 | 		static std::vector<T> empty;
132 | 		auto found = list.find(index);
133 | 		if (found == list.end()) return empty;
134 | 		return found->second;
135 | 	}
136 | 	size_t getId(const T& item) const
137 | 	{
138 | 		return item.target;
139 | 	}
140 | 	std::priority_queue<size_t, std::vector<size_t>, std::greater<size_t>> activeQueues;
141 | 	typename std::conditional<SparseStorage, phmap::flat_hash_map<size_t, std::vector<T>>, std::vector<std::vector<T>>>::type extras;
142 | 	std::vector<std::vector<T>> queues;
143 | 	size_t numItems;
144 | };
145 | 
146 | #endif
147 | 


--------------------------------------------------------------------------------
/src/BigraphToDigraph.cpp:
--------------------------------------------------------------------------------
  1 | #include <fstream>
  2 | #include <sstream>
  3 | #include <cassert>
  4 | #include <unordered_map>
  5 | #include "CommonUtils.h"
  6 | #include "vg.pb.h"
  7 | #include "fastqloader.h"
  8 | #include "BigraphToDigraph.h"
  9 | #include "ThreadReadAssertion.h"
 10 | #include "stream.hpp"
 11 | 
 12 | static std::vector<bool> getAllowedNucleotides()
 13 | {
 14 | 	std::vector<bool> result;
 15 | 	result.resize(256, false);
 16 | 	result['a'] = true;
 17 | 	result['A'] = true;
 18 | 	result['c'] = true;
 19 | 	result['C'] = true;
 20 | 	result['g'] = true;
 21 | 	result['G'] = true;
 22 | 	result['t'] = true;
 23 | 	result['T'] = true;
 24 | 	result['y'] = true;
 25 | 	result['Y'] = true;
 26 | 	result['r'] = true;
 27 | 	result['R'] = true;
 28 | 	result['w'] = true;
 29 | 	result['W'] = true;
 30 | 	result['s'] = true;
 31 | 	result['S'] = true;
 32 | 	result['k'] = true;
 33 | 	result['K'] = true;
 34 | 	result['m'] = true;
 35 | 	result['M'] = true;
 36 | 	result['d'] = true;
 37 | 	result['D'] = true;
 38 | 	result['v'] = true;
 39 | 	result['V'] = true;
 40 | 	result['h'] = true;
 41 | 	result['H'] = true;
 42 | 	result['b'] = true;
 43 | 	result['B'] = true;
 44 | 	result['n'] = true;
 45 | 	result['N'] = true;
 46 | 	return result;
 47 | }
 48 | 
// Table of accepted sequence characters, indexed by character code (256 entries).
// Consulted by the graph builders below before a node is added.
auto allowed = getAllowedNucleotides();
 50 | 
// Stores the arguments in the members of the same name.
DirectedGraph::Node::Node(int nodeId, int originalNodeId, bool rightEnd, std::string sequence, std::string name) :
nodeId(nodeId),
originalNodeId(originalNodeId),
rightEnd(rightEnd),
sequence(sequence),
name(name)
{
}
 59 | 
// Stores the endpoints (as directed node ids) and the overlap of the edge.
DirectedGraph::Edge::Edge(size_t from, size_t to, size_t overlap) :
fromId(from),
toId(to),
overlap(overlap)
{
}
 66 | 
 67 | std::pair<DirectedGraph::Node, DirectedGraph::Node> DirectedGraph::ConvertVGNodeToNodes(const vg::Node& node)
 68 | {
 69 | 	assert(node.id() < std::numeric_limits<int>::max() / 2);
 70 | 	assert(node.id()+1 < std::numeric_limits<int>::max() / 2);
 71 | 	return std::make_pair(DirectedGraph::Node { (int)node.id() * 2, (int)node.id(), true, node.sequence(), node.name() }, DirectedGraph::Node { (int)node.id() * 2 + 1, (int)node.id(), false, CommonUtils::ReverseComplement(node.sequence()), node.name() });
 72 | }
 73 | 
 74 | std::pair<DirectedGraph::Edge, DirectedGraph::Edge> DirectedGraph::ConvertVGEdgeToEdges(const vg::Edge& edge)
 75 | {
 76 | 	assert(edge.overlap() == 0);
 77 | 	size_t fromLeft, fromRight, toLeft, toRight;
 78 | 	if (edge.from_start())
 79 | 	{
 80 | 		fromLeft = edge.from() * 2;
 81 | 		fromRight = edge.from() * 2 + 1;
 82 | 	}
 83 | 	else
 84 | 	{
 85 | 		fromLeft = edge.from() * 2 + 1;
 86 | 		fromRight = edge.from() * 2;
 87 | 	}
 88 | 	if (edge.to_end())
 89 | 	{
 90 | 		toLeft = edge.to() * 2;
 91 | 		toRight = edge.to() * 2 + 1;
 92 | 	}
 93 | 	else
 94 | 	{
 95 | 		toLeft = edge.to() * 2 + 1;
 96 | 		toRight = edge.to() * 2;
 97 | 	}
 98 | 	return std::make_pair(DirectedGraph::Edge { fromRight, toRight, 0 }, DirectedGraph::Edge { toLeft, fromLeft, 0 });
 99 | }
100 | 
101 | std::pair<DirectedGraph::Node, DirectedGraph::Node> DirectedGraph::ConvertGFANodeToNodes(int id, const std::string& sequence, const std::string& name)
102 | {
103 | 	return std::make_pair(DirectedGraph::Node { id * 2, id, true, sequence, name }, DirectedGraph::Node { id * 2 + 1, id, false, CommonUtils::ReverseComplement(sequence), name });
104 | }
105 | 
106 | std::pair<DirectedGraph::Edge, DirectedGraph::Edge> DirectedGraph::ConvertGFAEdgeToEdges(int from, const std::string& fromstart, int to, const std::string& toend, size_t overlap)
107 | {
108 | 	assert(fromstart == "+" || fromstart == "-");
109 | 	assert(toend == "+" || toend == "-");
110 | 	size_t fromLeft, fromRight, toLeft, toRight;
111 | 	if (fromstart == "-")
112 | 	{
113 | 		fromLeft = from * 2;
114 | 		fromRight = from * 2 + 1;
115 | 	}
116 | 	else
117 | 	{
118 | 		fromLeft = from * 2 + 1;
119 | 		fromRight = from * 2;
120 | 	}
121 | 	if (toend == "-")
122 | 	{
123 | 		toLeft = to * 2;
124 | 		toRight = to * 2 + 1;
125 | 	}
126 | 	else
127 | 	{
128 | 		toLeft = to * 2 + 1;
129 | 		toRight = to * 2;
130 | 	}
131 | 	return std::make_pair(DirectedGraph::Edge { fromRight, toRight, overlap }, DirectedGraph::Edge { toLeft, fromLeft, overlap });
132 | }
133 | 
134 | AlignmentGraph DirectedGraph::StreamVGGraphFromFile(std::string filename)
135 | {
136 | 	AlignmentGraph result;
137 | 	{
138 | 		std::vector<size_t> breakpointsFw;
139 | 		std::vector<size_t> breakpointsBw;
140 | 		breakpointsFw.push_back(0);
141 | 		breakpointsBw.push_back(0);
142 | 		std::ifstream graphfile { filename, std::ios::in | std::ios::binary };
143 | 		std::function<void(vg::Graph&)> lambda = [&result, &breakpointsFw, &breakpointsBw](vg::Graph& g) {
144 | 			for (int i = 0; i < g.node_size(); i++)
145 | 			{
146 | 				for (size_t j = 0; j < g.node(i).sequence().size(); j++)
147 | 				{
148 | 					if (!allowed[g.node(i).sequence()[j]])
149 | 					{
150 | 						throw CommonUtils::InvalidGraphException("Invalid sequence character: " + g.node(i).sequence()[j]);
151 | 					}
152 | 				}
153 | 				auto nodes = ConvertVGNodeToNodes(g.node(i));
154 | 				assert(nodes.first.sequence.size() == nodes.second.sequence.size());
155 | 				breakpointsFw.push_back(g.node(i).sequence().size());
156 | 				breakpointsBw.push_back(g.node(i).sequence().size());
157 | 				result.AddNode(nodes.first.nodeId, nodes.first.sequence, nodes.first.name, !nodes.first.rightEnd, breakpointsFw);
158 | 				result.AddNode(nodes.second.nodeId, nodes.second.sequence, nodes.second.name, !nodes.second.rightEnd, breakpointsBw);
159 | 				breakpointsFw.erase(breakpointsFw.begin()+1, breakpointsFw.end());
160 | 				breakpointsBw.erase(breakpointsBw.begin()+1, breakpointsBw.end());
161 | 			}
162 | 		};
163 | 		stream::for_each(graphfile, lambda);
164 | 	}
165 | 	{
166 | 		std::ifstream graphfile { filename, std::ios::in | std::ios::binary };
167 | 		std::function<void(vg::Graph&)> lambda = [&result](vg::Graph& g) {
168 | 			for (int i = 0; i < g.edge_size(); i++)
169 | 			{
170 | 				auto edges = ConvertVGEdgeToEdges(g.edge(i));
171 | 				result.AddEdgeNodeId(edges.first.fromId, edges.first.toId, edges.first.overlap);
172 | 				result.AddEdgeNodeId(edges.second.fromId, edges.second.toId, edges.second.overlap);
173 | 			}
174 | 		};
175 | 		stream::for_each(graphfile, lambda);
176 | 	}
177 | 	result.Finalize(64);
178 | 	return result;
179 | }
180 | 
181 | AlignmentGraph DirectedGraph::BuildFromVG(const vg::Graph& graph)
182 | {
183 | 	AlignmentGraph result;
184 | 	std::vector<size_t> breakpointsFw;
185 | 	std::vector<size_t> breakpointsBw;
186 | 	breakpointsFw.push_back(0);
187 | 	breakpointsBw.push_back(0);
188 | 	for (int i = 0; i < graph.node_size(); i++)
189 | 	{
190 | 		for (size_t j = 0; j < graph.node(i).sequence().size(); j++)
191 | 		{
192 | 			if (!allowed[graph.node(i).sequence()[j]])
193 | 			{
194 | 				throw CommonUtils::InvalidGraphException("Invalid sequence character: " + graph.node(i).sequence()[j]);
195 | 			}
196 | 		}
197 | 		auto nodes = ConvertVGNodeToNodes(graph.node(i));
198 | 		breakpointsFw.push_back(graph.node(i).sequence().size());
199 | 		breakpointsBw.push_back(graph.node(i).sequence().size());
200 | 		result.AddNode(nodes.first.nodeId, nodes.first.sequence, nodes.first.name, !nodes.first.rightEnd, breakpointsFw);
201 | 		result.AddNode(nodes.second.nodeId, nodes.second.sequence, nodes.second.name, !nodes.second.rightEnd, breakpointsBw);
202 | 		breakpointsFw.erase(breakpointsFw.begin()+1, breakpointsFw.end());
203 | 		breakpointsBw.erase(breakpointsBw.begin()+1, breakpointsBw.end());
204 | 	}
205 | 	for (int i = 0; i < graph.edge_size(); i++)
206 | 	{
207 | 		auto edges = ConvertVGEdgeToEdges(graph.edge(i));
208 | 		result.AddEdgeNodeId(edges.first.fromId, edges.first.toId, edges.first.overlap);
209 | 		result.AddEdgeNodeId(edges.second.fromId, edges.second.toId, edges.second.overlap);
210 | 	}
211 | 	result.Finalize(64);
212 | 	return result;
213 | }
214 | 
215 | AlignmentGraph DirectedGraph::BuildFromGFA(const GfaGraph& graph)
216 | {
217 | 	AlignmentGraph result;
218 | 	result.DBGoverlap = graph.edgeOverlap;
219 | 	std::unordered_map<int, std::vector<size_t>> breakpoints;
220 | 	for (auto pair : graph.varyingOverlaps)
221 | 	{
222 | 		int to = pair.first.second.id * 2;
223 | 		if (!pair.first.second.end) to += 1;
224 | 		int from = pair.first.first.Reverse().id * 2;
225 | 		if (!pair.first.first.Reverse().end) from += 1;
226 | 		breakpoints[from].push_back(pair.second);
227 | 		breakpoints[to].push_back(pair.second);
228 | 	}
229 | 	for (auto node : graph.nodes)
230 | 	{
231 | 		for (size_t j = 0; j < node.second.size(); j++)
232 | 		{
233 | 			if (!allowed[node.second[j]])
234 | 			{
235 | 				throw CommonUtils::InvalidGraphException("Invalid sequence character: " + node.second[j]);
236 | 			}
237 | 		}
238 | 		std::string name = graph.OriginalNodeName(node.first);
239 | 		auto nodes = ConvertGFANodeToNodes(node.first, node.second, name);
240 | 		std::vector<size_t> breakpointsFw = breakpoints[node.first * 2];
241 | 		std::vector<size_t> breakpointsBw = breakpoints[node.first * 2 + 1];
242 | 		breakpointsFw.push_back(0);
243 | 		breakpointsFw.push_back(node.second.size());
244 | 		breakpointsBw.push_back(0);
245 | 		breakpointsBw.push_back(node.second.size());
246 | 		std::sort(breakpointsFw.begin(), breakpointsFw.end());
247 | 		std::sort(breakpointsBw.begin(), breakpointsBw.end());
248 | 		result.AddNode(nodes.first.nodeId, nodes.first.sequence, nodes.first.name, !nodes.first.rightEnd, breakpointsFw);
249 | 		result.AddNode(nodes.second.nodeId, nodes.second.sequence, nodes.second.name, !nodes.second.rightEnd, breakpointsBw);
250 | 	}
251 | 	for (auto edge : graph.edges)
252 | 	{
253 | 		for (auto target : edge.second)
254 | 		{
255 | 			auto overlap = graph.edgeOverlap;
256 | 			if (graph.varyingOverlaps.count(std::make_pair(edge.first, target)) == 1)
257 | 			{
258 | 				overlap = graph.varyingOverlaps.at(std::make_pair(edge.first, target));
259 | 			}
260 | 			auto pair = ConvertGFAEdgeToEdges(edge.first.id, edge.first.end ? "+" : "-", target.id, target.end ? "+" : "-", overlap);
261 | 			result.AddEdgeNodeId(pair.first.fromId, pair.first.toId, pair.first.overlap);
262 | 			result.AddEdgeNodeId(pair.second.fromId, pair.second.toId, pair.second.overlap);
263 | 		}
264 | 	}
265 | 	result.Finalize(64);
266 | 	return result;
267 | }
268 | 


--------------------------------------------------------------------------------
/src/BigraphToDigraph.h:
--------------------------------------------------------------------------------
 1 | #ifndef BigraphToDigraph_H
 2 | #define BigraphToDigraph_H
 3 | 
 4 | #include <tuple>
 5 | #include <vector>
 6 | #include <string>
 7 | #include "AlignmentGraph.h"
 8 | #include "vg.pb.h"
 9 | #include "GfaGraph.h"
10 | 
// Static helpers that turn bidirected graphs (vg or GFA) into the directed AlignmentGraph.
// Every original node becomes two directed nodes with ids id*2 and id*2+1.
class DirectedGraph
{
public:
	// One directed copy of an original node.
	struct Node
	{
		Node(int nodeId, int originalNodeId, bool rightEnd, std::string sequence, std::string name);
		// id in the directed graph
		int nodeId;
		// id of the node in the input graph
		int originalNodeId;
		// true for the copy with id originalNodeId*2 (carrying the sequence as given),
		// false for the copy with id originalNodeId*2+1 (carrying its reverse complement)
		bool rightEnd;
		std::string sequence;
		std::string name;
	};
	// One directed edge between two directed node ids.
	struct Edge
	{
		Edge(size_t from, size_t to, size_t overlap);
		size_t fromId;
		size_t toId;
		size_t overlap;
	};
	// Each conversion returns the pair of directed items representing one input item.
	static std::pair<Node, Node> ConvertVGNodeToNodes(const vg::Node& node);
	static std::pair<Edge, Edge> ConvertVGEdgeToEdges(const vg::Edge& edge);
	static std::pair<Node, Node> ConvertGFANodeToNodes(int id, const std::string& seq, const std::string& name);
	static std::pair<Edge, Edge> ConvertGFAEdgeToEdges(int from, const std::string& fromStart, int to, const std::string& toEnd, size_t overlap);
	// Whole-graph builders.
	static AlignmentGraph BuildFromVG(const vg::Graph& graph);
	static AlignmentGraph BuildFromGFA(const GfaGraph& graph);
	static AlignmentGraph StreamVGGraphFromFile(std::string filename);
private:
};
39 | 
40 | #endif
41 | 


--------------------------------------------------------------------------------
/src/BruteForceExactPrefixSeeds.cpp:
--------------------------------------------------------------------------------
  1 | #include <cmath>
  2 | #include <fstream>
  3 | #include "vg.pb.h"
  4 | #include "stream.hpp"
  5 | #include "GfaGraph.h"
  6 | #include "fastqloader.h"
  7 | 
  8 | void bruteForceAddPaths(const GfaGraph& graph, std::vector<std::vector<std::pair<int, size_t>>>& result, int node, size_t offset, size_t label, size_t length, size_t k)
  9 | {
 10 | 	label <<= 2;
 11 | 	switch(graph.nodes.at(node)[offset])
 12 | 	{
 13 | 		case 'A':
 14 | 		case 'a':
 15 | 			label += 0;
 16 | 			break;
 17 | 		case 'C':
 18 | 		case 'c':
 19 | 			label += 1;
 20 | 			break;
 21 | 		case 'G':
 22 | 		case 'g':
 23 | 			label += 2;
 24 | 			break;
 25 | 		case 'T':
 26 | 		case 't':
 27 | 			label += 3;
 28 | 			break;
 29 | 		default:
 30 | 			assert(false);
 31 | 	}
 32 | 	if (length == k - 1)
 33 | 	{
 34 | 		assert(label < result.size());
 35 | 		result[label].emplace_back(node, offset);
 36 | 		return;
 37 | 	}
 38 | 	if (offset < graph.nodes.at(node).size() - 1)
 39 | 	{
 40 | 		bruteForceAddPaths(graph, result, node, offset+1, label, length+1, k);
 41 | 	}
 42 | 	else
 43 | 	{
 44 | 		if (graph.edges.count(NodePos{node, true}) == 1)
 45 | 		{
 46 | 			for (auto edge : graph.edges.at(NodePos{node, true}))
 47 | 			{
 48 | 				bruteForceAddPaths(graph, result, edge.id, graph.edgeOverlap, label, length+1, k);
 49 | 			}
 50 | 		}
 51 | 	}
 52 | }
 53 | 
 54 | std::vector<std::vector<std::pair<int, size_t>>> buildBruteForcePathIndex(const GfaGraph& graph, const int k)
 55 | {
 56 | 	std::vector<std::vector<std::pair<int, size_t>>> result;
 57 | 	result.resize(pow(4, k));
 58 | 	for (auto node : graph.nodes)
 59 | 	{
 60 | 		for (size_t i = 0; i < node.second.size(); i++)
 61 | 		{
 62 | 			bruteForceAddPaths(graph, result, node.first, i, 0, 0, k);
 63 | 		}
 64 | 	}
 65 | 	return result;
 66 | }
 67 | 
 68 | int main(int argc, char** argv)
 69 | {
 70 | 	std::string graphFile { argv[1] };
 71 | 	std::string readFile { argv[2] };
 72 | 	int k = std::stoi(argv[3]);
 73 | 	std::string outputSeedFile { argv[4] };
 74 | 
 75 | 	std::cerr << "load graph" << std::endl;
 76 | 	auto graph = GfaGraph::LoadFromFile(graphFile);
 77 | 	std::cerr << "build index" << std::endl;
 78 | 	auto index = buildBruteForcePathIndex(graph, k);
 79 | 
 80 | 	std::cerr << "load reads" << std::endl;
 81 | 	auto reads = loadFastqFromFile(readFile);
 82 | 	std::vector<std::pair<std::string, size_t>> readLabels;
 83 | 	readLabels.reserve(reads.size());
 84 | 	std::vector<vg::Alignment> seeds;
 85 | 	size_t numSeeds = 0;
 86 | 	std::cerr << "count seeds" << std::endl;
 87 | 	for (auto read : reads)
 88 | 	{
 89 | 		if (read.sequence.size() < k) continue;
 90 | 		size_t label = 0;
 91 | 		for (int i = 0; i < k; i++)
 92 | 		{
 93 | 			label <<= 2;
 94 | 			switch(read.sequence[i])
 95 | 			{
 96 | 				case 'A':
 97 | 				case 'a':
 98 | 					label += 0;
 99 | 					break;
100 | 				case 'C':
101 | 				case 'c':
102 | 					label += 1;
103 | 					break;
104 | 				case 'T':
105 | 				case 't':
106 | 					label += 2;
107 | 					break;
108 | 				case 'G':
109 | 				case 'g':
110 | 					label += 3;
111 | 					break;
112 | 				default:
113 | 					break;
114 | 			}
115 | 		}
116 | 		readLabels.emplace_back(read.seq_id, label);
117 | 		numSeeds += index[label].size();
118 | 	}
119 | 	std::cerr << numSeeds << " seeds" << std::endl;
120 | 	seeds.reserve(numSeeds);
121 | 	std::cerr << "get seeds" << std::endl;
122 | 	for (auto pair : readLabels)
123 | 	{
124 | 		for (auto pos : index[pair.second])
125 | 		{
126 | 			vg::Alignment seed;
127 | 			seed.set_name(pair.first);
128 | 			auto mapping = seed.mutable_path()->add_mapping();
129 | 			auto edit = mapping->add_edit();
130 | 			edit->set_from_length(k);
131 | 			edit->set_to_length(k);
132 | 			mapping->mutable_position()->set_node_id(pos.first);
133 | 			mapping->mutable_position()->set_offset(pos.second);
134 | 			seeds.push_back(seed);
135 | 			seed.set_query_position(k-1);
136 | 		}
137 | 	}
138 | 
139 | 	std::cerr << "write seeds" << std::endl;
140 | 	std::ofstream outFile { outputSeedFile, std::ios::binary };
141 | 	stream::write_buffered(outFile, seeds, 0);
142 | }


--------------------------------------------------------------------------------
/src/CommonUtils.cpp:
--------------------------------------------------------------------------------
  1 | #include "CommonUtils.h"
  2 | #include "stream.hpp"
  3 | 
  4 | namespace CommonUtils
  5 | {
	// Forwards the C-string message to std::runtime_error.
	InvalidGraphException::InvalidGraphException(const char* c) : std::runtime_error(c) 
	{
	}
  9 | 
	// Forwards the std::string message to std::runtime_error.
	InvalidGraphException::InvalidGraphException(std::string c) : std::runtime_error(c) 
	{
	}
 13 | 
 14 | 	void mergeGraphs(vg::Graph& graph, const vg::Graph& part)
 15 | 	{
 16 | 		for (int i = 0; i < part.node_size(); i++)
 17 | 		{
 18 | 			auto node = graph.add_node();
 19 | 			node->set_id(part.node(i).id());
 20 | 			node->set_sequence(part.node(i).sequence());
 21 | 			node->set_name(part.node(i).name());
 22 | 		}
 23 | 		for (int i = 0; i < part.edge_size(); i++)
 24 | 		{
 25 | 			auto edge = graph.add_edge();
 26 | 			edge->set_from(part.edge(i).from());
 27 | 			edge->set_to(part.edge(i).to());
 28 | 			edge->set_from_start(part.edge(i).from_start());
 29 | 			edge->set_to_end(part.edge(i).to_end());
 30 | 			edge->set_overlap(part.edge(i).overlap());
 31 | 		}
 32 | 	}
 33 | 
 34 | 	vg::Graph LoadVGGraph(std::string filename)
 35 | 	{
 36 | 		vg::Graph result;
 37 | 		std::ifstream graphfile { filename, std::ios::in | std::ios::binary };
 38 | 		std::function<void(vg::Graph&)> lambda = [&result](vg::Graph& g) {
 39 | 			mergeGraphs(result, g);
 40 | 		};
 41 | 		stream::for_each(graphfile, lambda);
 42 | 		return result;
 43 | 	}
 44 | 
 45 | 	std::vector<vg::Alignment> LoadVGAlignments(std::string filename)
 46 | 	{
 47 | 		std::vector<vg::Alignment> result;
 48 | 		std::ifstream graphfile { filename, std::ios::in | std::ios::binary };
 49 | 		std::function<void(vg::Alignment&)> lambda = [&result](vg::Alignment& g) {
 50 | 			result.push_back(g);
 51 | 		};
 52 | 		stream::for_each(graphfile, lambda);
 53 | 		return result;
 54 | 	}
 55 | 
 56 | 	vg::Alignment LoadVGAlignment(std::string filename)
 57 | 	{
 58 | 		vg::Alignment result;
 59 | 		std::ifstream graphfile { filename, std::ios::in | std::ios::binary };
 60 | 		std::function<void(vg::Alignment&)> lambda = [&result](vg::Alignment& g) {
 61 | 			result = g;
 62 | 		};
 63 | 		stream::for_each(graphfile, lambda);
 64 | 		return result;
 65 | 	}
 66 | 
 67 | 	std::string ReverseComplement(std::string str)
 68 | 	{
 69 | 		std::string result;
 70 | 		result.reserve(str.size());
 71 | 		for (int i = str.size()-1; i >= 0; i--)
 72 | 		{
 73 | 			result += Complement(str[i]);
 74 | 		}
 75 | 		return result;
 76 | 	}
 77 | 
	// Returns the complement of a nucleotide code, always in upper case.
	// Upper- and lower-case input are treated alike. Besides A/C/G/T the
	// ambiguity codes N, R, Y, K, M, S, W, B, V, D, H are supported and U is
	// complemented to A. Any other character trips an assertion; when assertions
	// are compiled out 'N' is returned instead.
	char Complement(char c)
	{
		// Fold ASCII lower-case letters onto their upper-case form so each code is listed once.
		const char code = (c >= 'a' && c <= 'z') ? static_cast<char>(c - 'a' + 'A') : c;
		switch (code)
		{
			case 'A': return 'T';
			case 'C': return 'G';
			case 'T': return 'A';
			case 'G': return 'C';
			case 'N': return 'N';
			case 'U': return 'A';
			case 'R': return 'Y';
			case 'Y': return 'R';
			case 'K': return 'M';
			case 'M': return 'K';
			case 'S': return 'S';
			case 'W': return 'W';
			case 'B': return 'V';
			case 'V': return 'B';
			case 'D': return 'H';
			case 'H': return 'D';
			default:
				break;
		}
		assert(false);
		return 'N';
	}
135 | 
136 | }
137 | 
138 | BufferedWriter::BufferedWriter() : stream(nullptr) {};
139 | BufferedWriter::BufferedWriter(std::ostream& stream) : stream(&stream) {};
140 | BufferedWriter& BufferedWriter::operator<<(FlushClass)
141 | {
142 | 	if (stream == nullptr) return *this;
143 | 	flush();
144 | 	return *this;
145 | }
146 | void BufferedWriter::flush()
147 | {
148 | 	if (stream == nullptr) return;
149 | 	stringstream << std::endl;
150 | 	(*stream) << stringstream.str();
151 | 	stringstream.str("");
152 | }
153 | bool BufferedWriter::inputDiscarded() const
154 | {
155 | 	return stream == nullptr;
156 | }


--------------------------------------------------------------------------------
/src/CommonUtils.h:
--------------------------------------------------------------------------------
 1 | #ifndef CommonUtils_h
 2 | #define CommonUtils_h
 3 | 
 4 | #include <algorithm>
 5 | #include <functional>
 6 | #include <string>
 7 | #include <vector>
 8 | #include <sstream>
 9 | #include "vg.pb.h"
10 | 
// Helpers shared by the tools in this project: loading vg graphs/alignments
// from disk and complementing nucleotide sequences.
namespace CommonUtils
{
	// Exception type derived from std::runtime_error; both constructors forward
	// the message to the base class.
	struct InvalidGraphException : std::runtime_error
	{
		InvalidGraphException(const char* c);
		InvalidGraphException(std::string c);
	};
	// Reads the file in binary mode and merges every graph read from it into one vg::Graph.
	vg::Graph LoadVGGraph(std::string filename);
	// Complement of a single nucleotide code (A/C/G/T/U/N and the ambiguity codes
	// R, Y, K, M, S, W, B, V, D, H); input case is ignored, output is upper case.
	char Complement(char original);
	// Reverse complement of a whole sequence, built with Complement().
	std::string ReverseComplement(std::string original);
	// Reads the file in binary mode and returns one alignment; if several are
	// read, the last one is returned.
	vg::Alignment LoadVGAlignment(std::string filename);
	// Reads the file in binary mode and returns all alignments read from it.
	std::vector<vg::Alignment> LoadVGAlignments(std::string filename);
}
24 | 
// Collects formatted output in an in-memory buffer and forwards it to a target
// stream only when flushed. A writer constructed without a target stream
// discards everything streamed into it.
//
// The class deliberately has no std::ostream base: a private base would not be
// reachable by callers, none of the members use one, and default-constructing
// std::ostream is not guaranteed by the standard library interface.
class BufferedWriter
{
public:
	// Tag type; streaming a FlushClass value into a writer triggers flush().
	class FlushClass {};
	// Creates a writer without a target stream (all input is discarded).
	BufferedWriter();
	// Creates a writer that forwards to `stream`. The stream is not owned and
	// must outlive the writer.
	BufferedWriter(std::ostream& stream);
	// Not copyable: the std::stringstream buffer cannot be copied.
	BufferedWriter(const BufferedWriter& other) = delete;
	BufferedWriter(BufferedWriter&& other) = default;
	BufferedWriter& operator=(const BufferedWriter& other) = delete;
	BufferedWriter& operator=(BufferedWriter&& other) = default;
	// Appends `obj` to the buffer using the stringstream's operator<<.
	// Nothing is formatted or stored when there is no target stream.
	template <typename T>
	BufferedWriter& operator<<(T obj)
	{
		if (stream == nullptr) return *this;
		stringstream << obj;
		return *this;
	}
	// Same effect as flush().
	BufferedWriter& operator<<(FlushClass f);
	// Appends std::endl to the buffer, writes the buffer to the target stream
	// and empties it. No-op without a target stream.
	void flush();
	// True when there is no target stream.
	bool inputDiscarded() const;
	// Tag object intended for `writer << BufferedWriter::Flush`.
	static FlushClass Flush;
private:
	std::ostream* stream;            // target stream, nullptr when input is discarded
	std::stringstream stringstream;  // text buffered since the last flush
};
50 | 
51 | #endif
52 | 


--------------------------------------------------------------------------------
/src/ComponentPriorityQueue.h:
--------------------------------------------------------------------------------
  1 | #ifndef ComponentPriorityQueue_h
  2 | #define ComponentPriorityQueue_h
  3 | 
  4 | #include <queue>
  5 | #include <phmap.h>
  6 | #include "ThreadReadAssertion.h"
  7 | 
  8 | template <typename T, bool SparseStorage>
  9 | class ComponentPriorityQueue
 10 | {
 11 | 	struct PrioritizedItem
 12 | 	{
 13 | 		PrioritizedItem(size_t component, int score, size_t index) : component(component), score(score), index(index) {}
 14 | 		size_t component;
 15 | 		int score;
 16 | 		size_t index;
 17 | 		bool operator<(const PrioritizedItem& other) const { return component < other.component || (component == other.component && score < other.score); }
 18 | 		bool operator>(const PrioritizedItem& other) const { return component > other.component || (component == other.component && score > other.score); }
 19 | 	};
 20 | public:
 21 | 	constexpr bool IsComponentPriorityQueue() { return true; }
 22 | 	ComponentPriorityQueue(size_t maxNode) :
 23 | 	activeQueues(),
 24 | 	active(),
 25 | 	extras()
 26 | 	{
 27 | 		initialize(maxNode);
 28 | 	}
 29 | 	ComponentPriorityQueue() :
 30 | 	activeQueues(),
 31 | 	active(),
 32 | 	extras()
 33 | 	{
 34 | 	}
 35 | 	template <bool Sparse = SparseStorage>
 36 | 	typename std::enable_if<Sparse>::type initialize(size_t maxNode)
 37 | 	{
 38 | 		active.resize(maxNode, false);
 39 | 	}
 40 | 	template <bool Sparse = SparseStorage>
 41 | 	typename std::enable_if<!Sparse>::type initialize(size_t maxNode)
 42 | 	{
 43 | 		extras.resize(maxNode);
 44 | 		active.resize(maxNode, false);
 45 | 	}
 46 | #ifdef NDEBUG
 47 | 	__attribute__((always_inline))
 48 | #endif
 49 | 	T& top()
 50 | 	{
 51 | 		assert(activeQueues.size() > 0);
 52 | 		auto index = activeQueues.top().index;
 53 | 		assert(active[index]);
 54 | 		assert(extras[index].size() > 0);
 55 | 		return extras[index][0];
 56 | 	}
 57 | #ifdef NDEBUG
 58 | 	__attribute__((always_inline))
 59 | #endif
 60 | 	void pop()
 61 | 	{
 62 | 		assert(activeQueues.size() > 0);
 63 | 		size_t index = activeQueues.top().index;
 64 | 		assert(active[index]);
 65 | 		assert(extras[index].size() > 0);
 66 | 		extras[index].clear();
 67 | 		active[index] = false;
 68 | 		activeQueues.pop();
 69 | 	}
 70 | #ifdef NDEBUG
 71 | 	__attribute__((always_inline))
 72 | #endif
 73 | 	size_t size() const
 74 | 	{
 75 | 		return activeQueues.size();
 76 | 	}
 77 | 	void insert(size_t component, const T& item)
 78 | 	{
 79 | 		assert(false);
 80 | 	}
 81 | #ifdef NDEBUG
 82 | 	__attribute__((always_inline))
 83 | #endif
 84 | 	void insert(size_t component, int score, const T& item)
 85 | 	{
 86 | 		size_t index = getId(item);
 87 | 		assert(SparseStorage || index < extras.size());
 88 | 		if (!active[index])
 89 | 		{
 90 | 			assert(extras[index].size() == 0);
 91 | 			activeQueues.emplace(component, score, index);
 92 | 			active[index] = true;
 93 | 		}
 94 | 		extras[index].push_back(item);
 95 | 	}
 96 | 	void clear()
 97 | 	{
 98 | 		while (activeQueues.size() > 0)
 99 | 		{
100 | 			size_t index = activeQueues.top().index;
101 | 			assert(active[index]);
102 | 			removeExtras(index);
103 | 			active[index] = false;
104 | 			activeQueues.pop();
105 | 		}
106 | 		sparsify();
107 | 	}
108 | 	template<bool Sparse = SparseStorage>
109 | 	typename std::enable_if<Sparse>::type sparsify()
110 | 	{
111 | 		decltype(extras) empty;
112 | 		std::swap(extras, empty);
113 | 	}
114 | 	template<bool Sparse = SparseStorage>
115 | 	typename std::enable_if<!Sparse>::type sparsify()
116 | 	{
117 | 	}
118 | 	const std::vector<T>& getExtras(size_t index) const
119 | 	{
120 | 		assert(SparseStorage ||index < extras.size());
121 | 		return getVec(extras, index);
122 | 	}
123 | 	void removeExtras(size_t index)
124 | 	{
125 | 		assert(SparseStorage ||index < extras.size());
126 | 		extras[index].clear();
127 | 	}
128 | 	size_t extraSize(size_t index) const
129 | 	{
130 | 		assert(SparseStorage ||index < extras.size());
131 | 		return getVec(extras, index).size();
132 | 	}
133 | 	bool valid() const
134 | 	{
135 | 		return active.size() > 0;
136 | 	}
137 | private:
138 | 	const std::vector<T>& getVec(const std::vector<std::vector<T>>& list, size_t index) const
139 | 	{
140 | 		return list[index];
141 | 	}
142 | 	const std::vector<T>& getVec(const phmap::flat_hash_map<size_t, std::vector<T>>& list, size_t index) const
143 | 	{
144 | 		static std::vector<T> empty;
145 | 		auto found = list.find(index);
146 | 		if (found == list.end()) return empty;
147 | 		return found->second;
148 | 	}
149 | 	size_t getId(const T& item) const
150 | 	{
151 | 		return item.target;
152 | 	}
153 | 	std::priority_queue<PrioritizedItem, std::vector<PrioritizedItem>, std::greater<PrioritizedItem>> activeQueues;
154 | 	std::vector<bool> active;
155 | 	typename std::conditional<SparseStorage, phmap::flat_hash_map<size_t, std::vector<T>>, std::vector<std::vector<T>>>::type extras;
156 | };
157 | 
158 | #endif
159 | 


--------------------------------------------------------------------------------
/src/DijkstraQueue.h:
--------------------------------------------------------------------------------
  1 | #ifndef DijkstraQueue_h
  2 | #define DijkstraQueue_h
  3 | 
  4 | #include <queue>
  5 | #include <phmap.h>
  6 | #include "ThreadReadAssertion.h"
  7 | 
  8 | namespace std
  9 | {
 10 |     template<>
 11 |     struct hash<std::pair<size_t, size_t>>
 12 |     {
 13 |     	size_t operator()(const std::pair<size_t, size_t> pair) const
 14 |     	{
 15 |     		return std::hash<size_t>{}(pair.first) ^ std::hash<size_t>{}(pair.second);
 16 |     	}
 17 |     };
 18 | }
 19 | 
 20 | template <typename T>
 21 | class DijkstraPriorityQueue
 22 | {
 23 | public:
 24 | 	constexpr bool IsComponentPriorityQueue() { return false; }
 25 | 	DijkstraPriorityQueue() :
 26 | 	activeQueues(),
 27 | 	extras(),
 28 | 	queues(),
 29 | 	numItems(0),
 30 | 	zeroScore(0)
 31 | 	{
 32 | 		initialize(129);
 33 | 	}
 34 | 	void initialize(size_t maxPriority)
 35 | 	{
 36 | 		queues.resize(maxPriority);
 37 | 	}
 38 | #ifdef NDEBUG
 39 | 	__attribute__((always_inline))
 40 | #endif
 41 | 	T& top()
 42 | 	{
 43 | 		assert(activeQueues.size() > 0);
 44 | 		size_t queue = activeQueues.top();
 45 | 		assert(queues[queue].size() > 0);
 46 | 		return queues[queue].back();
 47 | 	}
 48 | #ifdef NDEBUG
 49 | 	__attribute__((always_inline))
 50 | #endif
 51 | 	void pop()
 52 | 	{
 53 | 		assert(numItems > 0);
 54 | 		assert(activeQueues.size() > 0);
 55 | 		size_t queue = activeQueues.top();
 56 | 		assert(queue < queues.size());
 57 | 		assert(queues[queue].size() > 0);
 58 | 		queues[queue].pop_back();
 59 | 		if (queues[queue].size() == 0) activeQueues.pop();
 60 | 		numItems--;
 61 | 	}
 62 | #ifdef NDEBUG
 63 | 	__attribute__((always_inline))
 64 | #endif
 65 | 	size_t size() const
 66 | 	{
 67 | 		return numItems;
 68 | 	}
 69 | 	void insert(size_t component, int score, const T& item)
 70 | 	{
 71 | 		assert(false);
 72 | 	}
 73 | #ifdef NDEBUG
 74 | 	__attribute__((always_inline))
 75 | #endif
 76 | 	void insert(size_t priority, const T& item)
 77 | 	{
 78 | 		assert(priority >= zeroScore);
 79 | 		priority -= zeroScore;
 80 | 		assert(priority < queues.size());
 81 | 		queues[priority].push_back(item);
 82 | 		extras[getId(item)].push_back(item);
 83 | 		if (queues[priority].size() == 1) activeQueues.emplace(priority);
 84 | 		numItems++;
 85 | 	}
 86 | 	void clear()
 87 | 	{
 88 | 		for (size_t i = 0; i < queues.size(); i++)
 89 | 		{
 90 | 			queues[i].clear();
 91 | 		}
 92 | 		typename std::remove_reference<decltype(activeQueues)>::type tmp;
 93 | 		std::swap(tmp, activeQueues);
 94 | 		typename std::remove_reference<decltype(extras)>::type tmp2;
 95 | 		std::swap(tmp2, extras);
 96 | 		numItems = 0;
 97 | 		zeroScore = 0;
 98 | 		sparsify();
 99 | 	}
100 | 	void increaseScore(size_t increase)
101 | 	{
102 | 		assert(increase > 0);
103 | 		assert(increase < queues.size());
104 | 		assert(activeQueues.size() == 0 || activeQueues.top() >= increase);
105 | 		typename std::remove_reference<decltype(activeQueues)>::type tmp;
106 | 		std::swap(tmp, activeQueues);
107 | 		for (size_t i = 0; i < queues.size() - increase; i++)
108 | 		{
109 | 			assert(queues[i].size() == 0);
110 | 			std::swap(queues[i], queues[i + increase]);
111 | 			if (queues[i].size() > 0) activeQueues.emplace(i);
112 | 		}
113 | 		for (size_t i = queues.size() - increase; i < queues.size(); i++)
114 | 		{
115 | 			assert(queues[i].size() == 0);
116 | 		}
117 | 		zeroScore += increase;
118 | 	}
119 | 	void sparsify()
120 | 	{
121 | 		decltype(extras) empty;
122 | 		std::swap(extras, empty);
123 | 	}
124 | 	const std::vector<T>& getExtras(size_t slice, size_t index)
125 | 	{
126 | 		return getExtras(std::make_pair(slice, index));
127 | 	}
128 | 	const std::vector<T>& getExtras(std::pair<size_t, size_t> index)
129 | 	{
130 | 		return getVec(index);
131 | 	}
132 | 	void removeExtras(size_t slice, size_t index)
133 | 	{
134 | 		removeExtras(std::make_pair(slice, index));
135 | 	}
136 | 	void removeExtras(std::pair<size_t, size_t> index)
137 | 	{
138 | 		extras[index].clear();
139 | 	}
140 | 	size_t extraSize(size_t slice, size_t index) const
141 | 	{
142 | 		return extraSize(std::make_pair(slice, index));
143 | 	}
144 | 	size_t extraSize(std::pair<size_t, size_t> index) const
145 | 	{
146 | 		return getVec(index).size();
147 | 	}
148 | 	size_t zero() const
149 | 	{
150 | 		return zeroScore;
151 | 	}
152 | private:
153 | 	const std::vector<T>& getVec(std::pair<size_t, size_t> index) const
154 | 	{
155 | 		static std::vector<T> empty;
156 | 		auto found = extras.find(index);
157 | 		if (found == extras.end()) return empty;
158 | 		return found->second;
159 | 	}
160 | 	std::pair<size_t, size_t> getId(const T& item) const
161 | 	{
162 | 		return std::make_pair(item.slice, item.target);
163 | 	}
164 | 	std::priority_queue<size_t, std::vector<size_t>, std::greater<size_t>> activeQueues;
165 | 	phmap::flat_hash_map<std::pair<size_t, size_t>, std::vector<T>> extras;
166 | 	std::vector<std::vector<T>> queues;
167 | 	size_t numItems;
168 | 	size_t zeroScore;
169 | };
170 | 
171 | #endif
172 | 


--------------------------------------------------------------------------------
/src/EValue.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <cmath>
  3 | #include "ThreadReadAssertion.h"
  4 | #include "EValue.h"
  5 | 
  6 | // model the alignment as one sequence with the alphabet {match, mismatch}
  7 | // with random alignments having P(match) = P(mismatch) = 0.5
  8 | // and match having score +1, mismatch score <0 chosen such that an alignment at minIdentity has score 0
  9 | // then use Karlin-Altschul equation to calculate E
// get lambda and K numerically depending on the match & mismatch score
 11 | 
 12 | // ...except the bitvector algorithm doesn't give the number of matches and mismatches
 13 | // so approximate the alignment score as (length * matchScore + numEdits * (mismatchScore-matchScore))
 14 | // this is close enough hopefully
 15 | 
// Euler's number; used as the base of every pow() exponential in this file.
constexpr double e = 2.71828182845904523536028747135266249775724709369995;
 17 | 
 18 | EValueCalculator::EValueCalculator() :
 19 | matchScore(-1),
 20 | mismatchScore(-1),
 21 | lambda(-1),
 22 | K(-1)
 23 | {
 24 | }
 25 | 
 26 | EValueCalculator::EValueCalculator(double minIdentity) :
 27 | matchScore(1),
 28 | mismatchScore(-minIdentity / (1.0 - minIdentity)),
 29 | lambda(-1),
 30 | K(-1)
 31 | {
 32 | 	initializeLambda();
 33 | 	initializeK();
 34 | }
 35 | 
 36 | double EValueCalculator::getEValue(size_t databaseSize, size_t querySize, double alignmentScore) const
 37 | {
 38 | 	return K * databaseSize * querySize * pow(e, -lambda * alignmentScore);
 39 | }
 40 | 
 41 | double EValueCalculator::getEValue(size_t databaseSize, size_t querySize, size_t alignmentLength, size_t numEdits) const
 42 | {
 43 | 	return getEValue(databaseSize, querySize, getAlignmentScore(alignmentLength, numEdits));
 44 | }
 45 | 
 46 | double EValueCalculator::getAlignmentScore(size_t alignmentLength, size_t numEdits) const
 47 | {
 48 | 	return alignmentLength * matchScore - numEdits * (mismatchScore - matchScore);
 49 | }
 50 | 
 51 | void EValueCalculator::initializeLambda()
 52 | {
 53 | 	// lambda is bounded by 0 < lambda < ln(2) < 0.7
 54 | 	double guessMin = 0;
 55 | 	double guessMax = 0.7;
 56 | 	// bisect, max error 2^-100
 57 | 	for (int i = 0; i < 100; i++)
 58 | 	{
 59 | 		double guessMid = (guessMin + guessMax) * 0.5;
 60 | 		double valueMid = pow(e, guessMid*matchScore) * .5 + pow(e, guessMid*mismatchScore) * 0.5 - 1;
 61 | 		if (valueMid < 0) guessMin = guessMid;
 62 | 		if (valueMid > 0) guessMax = guessMid;
 63 | 		// due to floating point precision limits
 64 | 		if (valueMid == 0)
 65 | 		{
 66 | 			guessMin = guessMid;
 67 | 			guessMax = guessMid;
 68 | 			break;
 69 | 		}
 70 | 		if (guessMin == guessMax) break;
 71 | 	}
 72 | 	lambda = (guessMin + guessMax) / 2;
 73 | }
 74 | 
 75 | void EValueCalculator::initializeK()
 76 | {
 77 | 	assert(lambda != -1);
 78 | 	double seriesSum = 0;
 79 | 	std::vector<size_t> pascalsTriangle;
 80 | 	pascalsTriangle.push_back(1);
 81 | 	for (int k = 1; k < 10; k++)
 82 | 	{
 83 | 		std::vector<size_t> newTriangle;
 84 | 		newTriangle.resize(pascalsTriangle.size()+1, 0);
 85 | 		for (size_t j = 0; j < pascalsTriangle.size(); j++)
 86 | 		{
 87 | 			newTriangle[j] += pascalsTriangle[j];
 88 | 			newTriangle[j+1] += pascalsTriangle[j];
 89 | 		}
 90 | 		pascalsTriangle = newTriangle;
 91 | 		assert(pascalsTriangle[0] == 1);
 92 | 		assert(pascalsTriangle.back() == 1);
 93 | 		assert(pascalsTriangle.size() == k+1);
 94 | 		size_t triangleSum = 0;
 95 | 		for (auto n : pascalsTriangle) triangleSum += n;
 96 | 		double negativeExpectation = 0;
 97 | 		double greaterProbability = 0;
 98 | 		for (size_t j = 0; j < pascalsTriangle.size(); j++)
 99 | 		{
100 | 			size_t matches = j;
101 | 			size_t mismatches = pascalsTriangle.size() - 1 - j;
102 | 			double score = (double)matches * matchScore + (double)mismatches * mismatchScore;
103 | 			double probability = (double)pascalsTriangle[j] / (double)triangleSum;
104 | 			if (score < 0) negativeExpectation += pow(e, lambda * score) * probability;
105 | 			if (score >= 0) greaterProbability += probability;
106 | 		}
107 | 		seriesSum += (negativeExpectation + greaterProbability) / (double)k;
108 | 	}
109 | 	double expectation = .5 * matchScore * pow(e, lambda * matchScore) + .5 * mismatchScore * pow(e, lambda * mismatchScore);
110 | 	double Cstar = pow(e, -2 * seriesSum) / (lambda * expectation);
111 | 	// assume delta is 1 even though its not really
112 | 	K = Cstar * lambda / (1.0 - pow(e, -lambda));
113 | }
114 | 


--------------------------------------------------------------------------------
/src/EValue.h:
--------------------------------------------------------------------------------
 1 | #ifndef EValue_h
 2 | #define EValue_h
 3 | 
// Computes E-values for alignments from a two-letter (match/mismatch) scoring
// model whose parameters lambda and K are derived numerically from a minimum
// identity.
class EValueCalculator
{
public:
	// Leaves all parameters at the placeholder value -1.
	EValueCalculator();
	// Match score is 1 and mismatch score is -minIdentity / (1 - minIdentity);
	// lambda and K are computed from these.
	EValueCalculator(double minIdentity);
	// Approximate alignment score computed from the alignment length and the number of edits.
	double getAlignmentScore(size_t alignmentLength, size_t numEdits) const;
	// E-value using the approximate score from getAlignmentScore().
	double getEValue(size_t databaseSize, size_t querySize, size_t alignmentLength, size_t numEdits) const;
	// E-value K * databaseSize * querySize * e^(-lambda * alignmentScore).
	double getEValue(size_t databaseSize, size_t querySize, double alignmentScore) const;
private:
	// Solves for lambda by bisection.
	void initializeLambda();
	// Computes K; requires lambda to be initialized.
	void initializeK();
	double matchScore;
	double mismatchScore;
	double lambda;
	double K;
};
20 | 
21 | #endif


--------------------------------------------------------------------------------
/src/EstimateRepeatCount.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <fstream>
  3 | #include <iostream>
  4 | #include <vector>
  5 | #include <unordered_map>
  6 | #include <unordered_set>
  7 | #include "CommonUtils.h"
  8 | #include "GfaGraph.h"
  9 | 
 10 | int main(int argc, char** argv)
 11 | {
 12 | 	std::string ingraphfilename {argv[1]};
 13 | 	std::string inalignmentfilename {argv[2]};
 14 | 	std::string outfilename {argv[3]};
 15 | 
 16 | 	std::cerr << "load graph" << std::endl;
 17 | 	auto graph = GfaGraph::LoadFromFile(ingraphfilename);
 18 | 	std::cerr << "process graph" << std::endl;
 19 | 	std::unordered_map<int, std::unordered_map<std::string, size_t>> baseCounts;
 20 | 	std::unordered_map<int, std::vector<int>> outNeighbors;
 21 | 	std::unordered_map<int, std::vector<int>> leftInneighbors;
 22 | 	std::unordered_map<int, std::vector<int>> rightInneighbors;
 23 | 	std::unordered_map<int, size_t> counts;
 24 | 	{
 25 | 		std::unordered_map<NodePos, std::unordered_set<NodePos>> edges;
 26 | 		for (auto edge : graph.edges)
 27 | 		{
 28 | 			for (auto target : edge.second)
 29 | 			{
 30 | 				edges[edge.first].insert(target);
 31 | 				edges[NodePos{target.id, !target.end}].insert(NodePos{edge.first.id, !edge.first.end});
 32 | 			}
 33 | 		}
 34 | 		for (auto node : graph.nodes)
 35 | 		{
 36 | 			NodePos end {node.first, true};
 37 | 			if (edges.count(end) == 1 && edges[end].size() == 1)
 38 | 			{
 39 | 				outNeighbors[node.first].push_back(edges[end].begin()->id);
 40 | 				if (edges[end].begin()->end)
 41 | 				{
 42 | 					rightInneighbors[edges[end].begin()->id].push_back(node.first);
 43 | 				}
 44 | 				else
 45 | 				{
 46 | 					leftInneighbors[edges[end].begin()->id].push_back(node.first);
 47 | 				}
 48 | 			}
 49 | 			NodePos start {node.first, false};
 50 | 			if (edges.count(start) == 1 && edges[start].size() == 1)
 51 | 			{
 52 | 				outNeighbors[node.first].push_back(edges[start].begin()->id);
 53 | 				if (edges[start].begin()->end)
 54 | 				{
 55 | 					rightInneighbors[edges[start].begin()->id].push_back(node.first);
 56 | 				}
 57 | 				else
 58 | 				{
 59 | 					leftInneighbors[edges[start].begin()->id].push_back(node.first);
 60 | 				}
 61 | 			}
 62 | 			if (edges.count(start) == 1) counts[node.first] = std::max(counts[node.first], edges[start].size());
 63 | 			if (edges.count(end) == 1) counts[node.first] = std::max(counts[node.first], edges[end].size());
 64 | 		}
 65 | 	}
 66 | 
 67 | 	std::cerr << "load alignment" << std::endl;
 68 | 	auto alignments = CommonUtils::LoadVGAlignments(inalignmentfilename);
 69 | 	for (auto aln : alignments)
 70 | 	{
 71 | 		for (size_t i = 0; i < aln.path().mapping_size(); i++)
 72 | 		{
 73 | 			baseCounts[aln.path().mapping(i).position().node_id()][aln.name()] += 1;
 74 | 		}
 75 | 	}
 76 | 	std::cerr << "init counts" << std::endl;
 77 | 	for (auto pair : baseCounts)
 78 | 	{
 79 | 		for (auto count : pair.second)
 80 | 		{
 81 | 			counts[pair.first] = std::max(counts[pair.first], count.second);
 82 | 		}
 83 | 	}
 84 | 
 85 | 	std::cerr << "iterate" << std::endl;
 86 | 	std::vector<int> updateQueue;
 87 | 	updateQueue.reserve(graph.nodes.size());
 88 | 	for (const auto& node : graph.nodes)
 89 | 	{
 90 | 		updateQueue.push_back(node.first);
 91 | 	}
 92 | 	std::cerr << "numnodes " << updateQueue.size() << std::endl;
 93 | 	size_t iterated = 0;
 94 | 	int maxcount = 0;
 95 | 	while (updateQueue.size() > 0)
 96 | 	{
 97 | 		auto node = updateQueue.back();
 98 | 		updateQueue.pop_back();
 99 | 		iterated++;
100 | 		if (iterated % 1000000 == 0) std::cerr << "iterated " << iterated << std::endl;
101 | 		int leftCountShouldBe = 0;
102 | 		if (leftInneighbors.count(node) == 1)
103 | 		{
104 | 			for (auto neighbor : leftInneighbors.at(node))
105 | 			{
106 | 				leftCountShouldBe += counts[neighbor];
107 | 			}
108 | 		}
109 | 		int rightCountShouldBe = 0;
110 | 		if (rightInneighbors.count(node) == 1)
111 | 		{
112 | 			for (auto neighbor : rightInneighbors.at(node))
113 | 			{
114 | 				rightCountShouldBe += counts[neighbor];
115 | 			}
116 | 		}
117 | 		if (counts[node] >= leftCountShouldBe && counts[node] >= rightCountShouldBe) continue;
118 | 		counts[node] = std::max(leftCountShouldBe, rightCountShouldBe);
119 | 		if (counts[node] > maxcount)
120 | 		{
121 | 			maxcount = counts[node];
122 | 			std::cerr << "node " << node << " iter " << iterated << " maxcount " << maxcount << std::endl;
123 | 		}
124 | 		if (outNeighbors.count(node) == 1)
125 | 		{
126 | 			for (auto neighbor : outNeighbors.at(node))
127 | 			{
128 | 				updateQueue.push_back(neighbor);
129 | 			}
130 | 		}
131 | 	}
132 | 	std::cerr << "iteration done with " << iterated << std::endl;
133 | 
134 | 	std::cerr << "write result" << std::endl;
135 | 	std::ofstream out {outfilename};
136 | 	out << "node,_minalntoporepeatcount";
137 | 	out << std::endl;
138 | 	std::vector<int> nodevec;
139 | 	for (const auto& node : graph.nodes)
140 | 	{
141 | 		nodevec.push_back(node.first);
142 | 	}
143 | 	std::sort(nodevec.begin(), nodevec.end());
144 | 	for (auto node : nodevec)
145 | 	{
146 | 		out << node;
147 | 		out << "," << counts[node];
148 | 		out << std::endl;
149 | 	}
150 | }
151 | 


--------------------------------------------------------------------------------
/src/ExtractCorrectedReads.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <unordered_map>
  3 | #include <vector>
  4 | #include <fstream>
  5 | #include <iostream>
  6 | #include <functional>
  7 | #include "GfaGraph.h"
  8 | #include "vg.pb.h"
  9 | #include "stream.hpp"
 10 | #include "CommonUtils.h"
 11 | #include "fastqloader.h"
 12 | #include "ReadCorrection.h"
 13 | 
// Forward declaration; the definition is not in this file. Used below when
// printing reads that have no corrections (presumably lower-cases the
// sequence — TODO confirm against the definition).
std::string toLower(std::string seq);
 15 | 
 16 | void addPartial(const std::unordered_map<int, int>& ids, std::unordered_map<std::string, std::vector<Correction>>& partials, std::function<std::string(int)> seqGetter, const vg::Alignment& v)
 17 | {
 18 | 	Correction result;
 19 | 	result.startIndex = v.query_position();
 20 | 	result.endIndex = v.query_position() + v.sequence().size();
 21 | 	result.corrected = "";
 22 | 	for (int i = 0; i < v.path().mapping_size(); i++)
 23 | 	{
 24 | 		auto nodeid = v.path().mapping(i).position().node_id();
 25 | 		auto sequence = seqGetter(ids.at(nodeid));
 26 | 		int len = 0;
 27 | 		for (int j = 0; j < v.path().mapping(i).edit_size(); j++)
 28 | 		{
 29 | 			len += v.path().mapping(i).edit(j).from_length();
 30 | 		}
 31 | 		if (v.path().mapping(i).position().is_reverse())
 32 | 		{
 33 | 			sequence = CommonUtils::ReverseComplement(sequence);
 34 | 		}
 35 | 		if (v.path().mapping(i).position().offset() > 0)
 36 | 		{
 37 | 			sequence = sequence.substr(v.path().mapping(i).position().offset());
 38 | 		}
 39 | 		sequence = sequence.substr(0, len);
 40 | 		result.corrected += sequence;
 41 | 	}
 42 | 	partials[v.name()].push_back(result);
 43 | }
 44 | 
 45 | void addPartial(const vg::Graph& g, const std::unordered_map<int, int>& ids, const vg::Alignment& v, std::unordered_map<std::string, std::vector<Correction>>& partials)
 46 | {
 47 | 	addPartial(ids, partials, [&g](int id) {return g.node(id).sequence();}, v);
 48 | }
 49 | 
 50 | void addPartial(const GfaGraph& g, const std::unordered_map<int, int>& ids, const vg::Alignment& v, std::unordered_map<std::string, std::vector<Correction>>& partials)
 51 | {
 52 | 	addPartial(ids, partials, [&g](int id) {return g.nodes.at(id);}, v);
 53 | }
 54 | 
 55 | void mergePartials(const std::unordered_map<std::string, std::vector<Correction>>& partials, const std::vector<FastQ>& reads, size_t maxOverlap)
 56 | {
 57 | 	for (auto read : reads)
 58 | 	{
 59 | 		if (partials.count(read.seq_id) == 0)
 60 | 		{
 61 | 			std::cout << ">" << read.seq_id << std::endl << toLower(read.sequence) << std::endl;
 62 | 			continue;
 63 | 		}
 64 | 		auto p = partials.at(read.seq_id);
 65 | 		std::sort(p.begin(), p.end(), [](const Correction& left, const Correction& right) { return left.startIndex < right.startIndex; });
 66 | 		auto corrected = getCorrected(read.sequence, p, maxOverlap);
 67 | 		std::cout << ">" << read.seq_id << std::endl << corrected << std::endl;
 68 | 	}
 69 | }
 70 | 
 71 | int main(int argc, char** argv)
 72 | {
 73 | 	std::string graphfilename {argv[1]};
 74 | 	std::string alnfilename { argv[2] };
 75 | 	std::string readfilename { argv[3] };
 76 | 	//output in stdout
 77 | 	auto reads = loadFastqFromFile(readfilename);
 78 | 	for (int i = 4; i < argc; i++)
 79 | 	{
 80 | 		auto extrareads = loadFastqFromFile(argv[i]);
 81 | 		reads.insert(reads.end(), extrareads.begin(), extrareads.end());
 82 | 	}
 83 | 
 84 | 	size_t maxOverlap = 0;
 85 | 
 86 | 	std::unordered_map<std::string, std::vector<Correction>> partials;
 87 | 	if (graphfilename.substr(graphfilename.size()-3) == ".vg")
 88 | 	{
 89 | 		vg::Graph graph = CommonUtils::LoadVGGraph(argv[1]);
 90 | 		std::unordered_map<int, int> ids;
 91 | 		for (int i = 0; i < graph.node_size(); i++)
 92 | 		{
 93 | 			ids[graph.node(i).id()] = i;
 94 | 		}
 95 | 		{
 96 | 			std::ifstream alnfile { argv[2], std::ios::in | std::ios::binary };
 97 | 			std::function<void(vg::Alignment&)> lambda = [&graph, &ids, &partials](vg::Alignment& aln) {
 98 | 				addPartial(graph, ids, aln, partials);
 99 | 			};
100 | 			stream::for_each(alnfile, lambda);
101 | 		}
102 | 	}
103 | 	else if (graphfilename.substr(graphfilename.size() - 4) == ".gfa")
104 | 	{
105 | 		GfaGraph graph = GfaGraph::LoadFromFile(argv[1]);
106 | 		maxOverlap = graph.edgeOverlap;
107 | 		std::unordered_map<int, int> ids;
108 | 		for (auto node : graph.nodes)
109 | 		{
110 | 			ids[node.first] = node.first;
111 | 		}
112 | 		{
113 | 			std::ifstream alnfile { argv[2], std::ios::in | std::ios::binary };
114 | 			std::function<void(vg::Alignment&)> lambda = [&graph, &ids, &partials](vg::Alignment& aln) {
115 | 				addPartial(graph, ids, aln, partials);
116 | 			};
117 | 			stream::for_each(alnfile, lambda);
118 | 		}
119 | 	}
120 | 
121 | 
122 | 	mergePartials(partials, reads, maxOverlap);
123 | }


--------------------------------------------------------------------------------
/src/ExtractExactPathSubgraph.cpp:
--------------------------------------------------------------------------------
 1 | #include <fstream>
 2 | #include <iostream>
 3 | #include <queue>
 4 | #include "GfaGraph.h"
 5 | #include "CommonUtils.h"
 6 | 
 7 | int main(int argc, char** argv)
 8 | {
 9 | 	std::string infile {argv[1]};
10 | 	std::string outfile {argv[2]};
11 | 	std::string alignmentfile {argv[3]};
12 | 	auto alignments = CommonUtils::LoadVGAlignments(alignmentfile);
13 | 	auto graph = GfaGraph::LoadFromFile(infile);
14 | 	std::unordered_set<int> pickedNodes;
15 | 	std::unordered_set<std::pair<NodePos, NodePos>> pickedEdges;
16 | 	for (const auto& alignment : alignments)
17 | 	{
18 | 		pickedNodes.insert(alignment.path().mapping(0).position().node_id());
19 | 		for (int i = 1; i < alignment.path().mapping_size(); i++)
20 | 		{
21 | 			pickedNodes.insert(alignment.path().mapping(i).position().node_id());
22 | 			NodePos from {alignment.path().mapping(i-1).position().node_id(), alignment.path().mapping(i-1).position().is_reverse()};
23 | 			NodePos to {alignment.path().mapping(i).position().node_id(), alignment.path().mapping(i).position().is_reverse()};
24 | 			pickedEdges.emplace(from, to);
25 | 		}
26 | 	}
27 | 	std::cerr << pickedNodes.size() << " nodes, ~" << pickedEdges.size() << " edges" << std::endl;
28 | 	auto result = graph.GetSubgraph(pickedNodes, pickedEdges);
29 | 	result.SaveToFile(outfile);
30 | }


--------------------------------------------------------------------------------
/src/ExtractPathSequence.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | #include <unordered_map>
 3 | #include <vector>
 4 | #include <fstream>
 5 | #include <iostream>
 6 | #include <functional>
 7 | #include "GfaGraph.h"
 8 | #include "vg.pb.h"
 9 | #include "stream.hpp"
10 | #include "CommonUtils.h"
11 | 
12 | void printPath(const std::unordered_map<int, int>& ids, std::function<std::string(int)> seqGetter, const vg::Alignment& v)
13 | {
14 | 	std::cout << ">" << v.name() << "_" << v.query_position() << "_" << (v.query_position() + v.sequence().size()) << std::endl;
15 | 	for (int i = 0; i < v.path().mapping_size(); i++)
16 | 	{
17 | 		auto nodeid = v.path().mapping(i).position().node_id();
18 | 		auto sequence = seqGetter(ids.at(nodeid));
19 | 		int len = 0;
20 | 		for (int j = 0; j < v.path().mapping(i).edit_size(); j++)
21 | 		{
22 | 			len += v.path().mapping(i).edit(j).from_length();
23 | 		}
24 | 		if (v.path().mapping(i).position().is_reverse())
25 | 		{
26 | 			sequence = CommonUtils::ReverseComplement(sequence);
27 | 		}
28 | 		if (v.path().mapping(i).position().offset() > 0)
29 | 		{
30 | 			sequence = sequence.substr(v.path().mapping(i).position().offset());
31 | 		}
32 | 		sequence = sequence.substr(0, len);
33 | 		std::cout << sequence;
34 | 	}
35 | 	std::cout << std::endl;
36 | }
37 | 
38 | void printPath(const vg::Graph& g, const std::unordered_map<int, int>& ids, const vg::Alignment& v)
39 | {
40 | 	printPath(ids, [&g](int id) {return g.node(id).sequence();}, v);
41 | }
42 | 
43 | void printPath(const GfaGraph& g, const std::unordered_map<int, int>& ids, const vg::Alignment& v)
44 | {
45 | 	printPath(ids, [&g](int id) {return g.nodes.at(id);}, v);
46 | }
47 | 
48 | int main(int argc, char** argv)
49 | {
50 | 	std::string graphfilename {argv[1]};
51 | 	std::string alnfilename { argv[2] };
52 | 	std::unordered_map<int, int> ids;
53 | 	if (graphfilename.substr(graphfilename.size()-3) == ".vg")
54 | 	{
55 | 		vg::Graph graph = CommonUtils::LoadVGGraph(argv[1]);
56 | 		for (int i = 0; i < graph.node_size(); i++)
57 | 		{
58 | 			ids[graph.node(i).id()] = i;
59 | 		}
60 | 		{
61 | 			std::ifstream graphfile { argv[2], std::ios::in | std::ios::binary };
62 | 			std::function<void(vg::Alignment&)> lambda = [&graph, &ids](vg::Alignment& g) {
63 | 				std::cerr << g.name() << std::endl;
64 | 				printPath(graph, ids, g);
65 | 			};
66 | 			stream::for_each(graphfile, lambda);
67 | 		}
68 | 	}
69 | 	else if (graphfilename.substr(graphfilename.size() - 4) == ".gfa")
70 | 	{
71 | 		GfaGraph graph = GfaGraph::LoadFromFile(argv[1]);
72 | 		for (auto node : graph.nodes)
73 | 		{
74 | 			ids[node.first] = node.first;
75 | 		}
76 | 		{
77 | 			std::ifstream graphfile { argv[2], std::ios::in | std::ios::binary };
78 | 			std::function<void(vg::Alignment&)> lambda = [&graph, &ids](vg::Alignment& g) {
79 | 				std::cerr << g.name() << std::endl;
80 | 				printPath(graph, ids, g);
81 | 			};
82 | 			stream::for_each(graphfile, lambda);
83 | 		}
84 | 	}
85 | 
86 | }


--------------------------------------------------------------------------------
/src/ExtractPathSubgraphNeighbourhood.cpp:
--------------------------------------------------------------------------------
 1 | #include <fstream>
 2 | #include <iostream>
 3 | #include <queue>
 4 | #include "GfaGraph.h"
 5 | #include "CommonUtils.h"
 6 | 
 7 | class PriorityNode
 8 | {
 9 | public:
10 | 	PriorityNode(NodePos pos, int priority) :
11 | 	pos(pos),
12 | 	priority(priority)
13 | 	{}
14 | 	NodePos pos;
15 | 	int priority;
16 | 	bool operator>(const PriorityNode& other) const
17 | 	{
18 | 		return priority > other.priority;
19 | 	}
20 | };
21 | 
22 | int main(int argc, char** argv)
23 | {
24 | 	std::string infile {argv[1]};
25 | 	std::string outfile {argv[2]};
26 | 	std::string alignmentfile {argv[3]};
27 | 	int length = std::stoi(argv[4]);
28 | 	std::cerr << "length: " << length << std::endl;
29 | 	auto alignments = CommonUtils::LoadVGAlignments(alignmentfile);
30 | 	auto graph = GfaGraph::LoadFromFile(infile);
31 | 	std::priority_queue<PriorityNode, std::vector<PriorityNode>, std::greater<PriorityNode>> queue;
32 | 	for (const auto& alignment : alignments)
33 | 	{
34 | 		for (const auto& pos : alignment.path().mapping())
35 | 		{
36 | 			queue.emplace(NodePos {pos.position().node_id(), pos.position().is_reverse()}, 0);
37 | 		}
38 | 	}
39 | 	std::unordered_map<NodePos, size_t> distance;
40 | 	while (queue.size() != 0)
41 | 	{
42 | 		auto top = queue.top();
43 | 		queue.pop();
44 | 		if (top.priority > length) break;
45 | 		if (distance.count(top.pos) == 1 && distance[top.pos] <= top.priority) continue;
46 | 		distance[top.pos] = top.priority;
47 | 		if (graph.edges.count(top.pos) == 1)
48 | 		{
49 | 			for (auto edge : graph.edges.at(top.pos))
50 | 			{
51 | 				assert(graph.nodes.at(top.pos.id).size() > graph.edgeOverlap);
52 | 				queue.emplace(edge, top.priority + graph.nodes.at(top.pos.id).size() - graph.edgeOverlap);
53 | 			}
54 | 		}
55 | 	}
56 | 	std::unordered_set<int> picked;
57 | 	for (auto pair : distance)
58 | 	{
59 | 		picked.insert(pair.first.id);
60 | 	}
61 | 	std::cerr << picked.size() << std::endl;
62 | 	auto result = graph.GetSubgraph(picked);
63 | 	result.SaveToFile(outfile);
64 | }


--------------------------------------------------------------------------------
/src/GfaGraph.h:
--------------------------------------------------------------------------------
 1 | #ifndef GfaGraph_h
 2 | #define GfaGraph_h
 3 | 
 4 | #include <istream>
 5 | #include <ostream>
 6 | #include <vector>
 7 | #include <string>
 8 | #include <unordered_map>
 9 | #include <unordered_set>
10 | 
// A node id paired with a boolean flag. Used as the key of GfaGraph::edges and,
// in pairs, to describe an edge between two such positions.
class NodePos
{
public:
	NodePos();
	NodePos(int id, bool end);
	// id of the node
	int id;
	// Boolean flag attached to the node id.
	// NOTE(review): presumably selects the orientation or end of the node - confirm against GfaGraph.cpp.
	bool end;
	// NOTE(review): presumably returns the same node with the flag flipped - implementation not visible here.
	NodePos Reverse() const;
	bool operator==(const NodePos& other) const;
	bool operator!=(const NodePos& other) const;
};
22 | 
23 | namespace std 
24 | {
25 | 	template <> 
26 | 	struct hash<NodePos>
27 | 	{
28 | 		size_t operator()(const NodePos& x) const
29 | 		{
30 | 			return hash<int>()(x.id) ^ hash<bool>()(x.end);
31 | 		}
32 | 	};
33 | 	template <> 
34 | 	struct hash<std::pair<NodePos, NodePos>>
35 | 	{
36 | 		size_t operator()(const std::pair<NodePos, NodePos>& x) const
37 | 		{
38 | 			// simple hashing with hash<NodePos>()(x.first) ^ hash<NodePos>()(x.second) collides each edge formed like (x -> x+1)
39 | 			// instead: 
40 | 			// https://stackoverflow.com/questions/682438/hash-function-providing-unique-uint-from-an-integer-coordinate-pair
41 | 			// https://en.wikipedia.org/wiki/Pairing_function#Cantor_pairing_function
42 | 			// and arbitrarily ignore directionality
43 | 			size_t pairing = .5 * (x.first.id + x.second.id) * (x.first.id + x.second.id + 1) + x.second.id;
44 | 			return hash<size_t>()(pairing);
45 | 		}
46 | 	};
47 | }
48 | 
// In-memory graph with integer node ids, string node contents and edges between
// NodePos values. Can be loaded from / saved to a file or stream and cut down to
// a subgraph.
class GfaGraph
{
public:
	GfaGraph();
	// Load a graph from a file / an already opened stream.
	// NOTE(review): the exact effect of the two flags is defined in GfaGraph.cpp, not visible here.
	static GfaGraph LoadFromFile(std::string filename, bool allowVaryingOverlaps=false, bool warnAboutMissingNodes=false);
	static GfaGraph LoadFromStream(std::istream& stream, bool allowVaryingOverlaps=false, bool warnAboutMissingNodes=false);
	// Write the graph to a file / an already opened stream.
	void SaveToFile(std::string filename) const;
	void SaveToStream(std::ostream& stream) const;
	void AddSubgraph(const GfaGraph& subgraph);
	// Subgraph selected by a set of node ids.
	GfaGraph GetSubgraph(const std::unordered_set<int>& ids) const;
	// Subgraph selected by a set of node ids and a set of (from, to) edges.
	GfaGraph GetSubgraph(const std::unordered_set<int>& nodes, const std::unordered_set<std::pair<NodePos, NodePos>>& edges) const;
	std::string OriginalNodeName(int nodeId) const;
	void confirmDoublesidedEdges();
	// node id -> node string; the tools in this code base use it as the node's sequence
	std::unordered_map<int, std::string> nodes;
	// position -> positions reachable over one edge
	std::unordered_map<NodePos, std::vector<NodePos>> edges;
	// per-edge overlap, keyed by the (from, to) pair
	// NOTE(review): presumably only filled when allowVaryingOverlaps is set - confirm
	std::unordered_map<std::pair<NodePos, NodePos>, size_t> varyingOverlaps;
	// overlap subtracted from a node's length when stepping over an edge
	size_t edgeOverlap;
	std::unordered_map<int, std::string> tags;
	std::unordered_map<int, std::string> originalNodeName;
private:
	void numberBackToIntegers();
};
71 | 
72 | #endif


--------------------------------------------------------------------------------
/src/GraphAlignerGAFAlignment.h:
--------------------------------------------------------------------------------
  1 | #ifndef GraphAlignerGAFAlignment_h
  2 | #define GraphAlignerGAFAlignment_h
  3 | 
  4 | #include <string>
  5 | #include <vector>
  6 | #include "AlignmentGraph.h"
  7 | #include "NodeSlice.h"
  8 | #include "CommonUtils.h"
  9 | #include "ThreadReadAssertion.h"
 10 | #include "GraphAlignerCommon.h"
 11 | 
 12 | template <typename LengthType, typename ScoreType, typename Word>
 13 | class GraphAlignerGAFAlignment
 14 | {
 15 | 	using Common = GraphAlignerCommon<LengthType, ScoreType, Word>;
 16 | 	using Params = typename Common::Params;
 17 | 	using MatrixPosition = typename Common::MatrixPosition;
 18 | 	using TraceItem = typename Common::TraceItem;
 19 | 	struct MergedNodePos
 20 | 	{
 21 | 		int nodeId;
 22 | 		bool reverse;
 23 | 		size_t nodeOffset;
 24 | 		size_t seqPos;
 25 | 	};
 26 | 	enum EditType
 27 | 	{
 28 | 		Match,
 29 | 		Mismatch,
 30 | 		MatchOrMismatch,
 31 | 		Insertion,
 32 | 		Deletion,
 33 | 		Empty
 34 | 	};
 35 | public:
 36 | 
 37 | 	static std::string traceToAlignment(const std::string& seq_id, const std::string& sequence, const GraphAlignerCommon<size_t, int32_t, uint64_t>::OnewayTrace& tracePair, const Params& params, bool cigarMatchMismatchMerge)
 38 | 	{
 39 | 		auto& trace = tracePair.trace;
 40 | 		if (trace.size() == 0) return nullptr;
 41 | 		std::stringstream cigar;
 42 | 		std::string readName = seq_id;
 43 | 		size_t readLen = sequence.size();
 44 | 		size_t readStart = trace[0].DPposition.seqPos;
 45 | 		size_t readEnd = trace.back().DPposition.seqPos+1;
 46 | 		bool strand = true;
 47 | 		std::stringstream nodePath;
 48 | 		size_t nodePathLen = 0;
 49 | 		size_t nodePathStart = trace[0].DPposition.nodeOffset;
 50 | 		size_t nodePathEnd = 0;
 51 | 		size_t matches = 0;
 52 | 		size_t blockLength = trace.size();
 53 | 		int mappingQuality = 255;
 54 | 
 55 | 		MergedNodePos currentPos;
 56 | 		currentPos.nodeId = trace[0].DPposition.node;
 57 | 		currentPos.reverse = (trace[0].DPposition.node % 2) == 1;
 58 | 		currentPos.nodeOffset = trace[0].DPposition.nodeOffset;
 59 | 		currentPos.seqPos = trace[0].DPposition.seqPos;
 60 | 		EditType currentEdit = Empty;
 61 | 		size_t mismatches = 0;
 62 | 		size_t deletions = 0;
 63 | 		size_t insertions = 0;
 64 | 		size_t editLength = 0;
 65 | 		if (cigarMatchMismatchMerge)
 66 | 		{
 67 | 			currentEdit = MatchOrMismatch;
 68 | 			editLength = 1;
 69 | 			if (Common::characterMatch(trace[0].sequenceCharacter, trace[0].graphCharacter))
 70 | 			{
 71 | 				matches += 1;
 72 | 			}
 73 | 			else
 74 | 			{
 75 | 				mismatches += 1;
 76 | 			}
 77 | 		}
 78 | 		else if (Common::characterMatch(trace[0].sequenceCharacter, trace[0].graphCharacter))
 79 | 		{
 80 | 			currentEdit = Match;
 81 | 			editLength = 1;
 82 | 			matches += 1;
 83 | 		}
 84 | 		else
 85 | 		{
 86 | 			currentEdit = Mismatch;
 87 | 			editLength = 1;
 88 | 			mismatches += 1;
 89 | 		}
 90 | 		addPosToString(nodePath, currentPos, params);
 91 | 		nodePathLen += params.graph.originalNodeSize.at(currentPos.nodeId);
 92 | 		for (size_t pos = 1; pos < trace.size(); pos++)
 93 | 		{
 94 | 			assert(trace[pos].DPposition.seqPos < sequence.size());
 95 | 			MergedNodePos newPos;
 96 | 			newPos.nodeId = trace[pos].DPposition.node;
 97 | 			newPos.reverse = (trace[pos].DPposition.node % 2) == 1;
 98 | 			newPos.nodeOffset = trace[pos].DPposition.nodeOffset;
 99 | 			newPos.seqPos = trace[pos].DPposition.seqPos;
100 | 			bool insideNode = !trace[pos-1].nodeSwitch || (newPos.nodeId == currentPos.nodeId && newPos.reverse == currentPos.reverse && newPos.nodeOffset > currentPos.nodeOffset);
101 | 
102 | 			assert(newPos.seqPos >= currentPos.seqPos);
103 | 
104 | 			if (!insideNode)
105 | 			{
106 | 				size_t skippedBefore = params.graph.originalNodeSize.at(currentPos.nodeId) - 1 - trace[pos-1].DPposition.nodeOffset;
107 | 				currentPos = newPos;
108 | 				addPosToString(nodePath, currentPos, params);
109 | 				assert(trace[pos].DPposition.nodeOffset < params.graph.originalNodeSize.at(currentPos.nodeId));
110 | 				size_t skippedAfter = trace[pos].DPposition.nodeOffset;
111 | 				nodePathLen += params.graph.originalNodeSize.at(currentPos.nodeId) - (skippedBefore + skippedAfter);
112 | 			}
113 | 
114 | 			if (trace[pos-1].DPposition.seqPos == trace[pos].DPposition.seqPos)
115 | 			{
116 | 				if (currentEdit == Empty) currentEdit = Deletion;
117 | 				if (currentEdit != Deletion)
118 | 				{
119 | 					addCigarItem(cigar, editLength, currentEdit);
120 | 					currentEdit = Deletion;
121 | 					editLength = 0;
122 | 				}
123 | 				editLength += 1;
124 | 				deletions += 1;
125 | 			}
126 | 			else if (insideNode && trace[pos-1].DPposition.nodeOffset == trace[pos].DPposition.nodeOffset)
127 | 			{
128 | 				if (currentEdit == Empty) currentEdit = Insertion;
129 | 				if (currentEdit != Insertion)
130 | 				{
131 | 					addCigarItem(cigar, editLength, currentEdit);
132 | 					currentEdit = Insertion;
133 | 					editLength = 0;
134 | 				}
135 | 				editLength += 1;
136 | 				insertions += 1;
137 | 			}
138 | 			else if (cigarMatchMismatchMerge)
139 | 			{
140 | 				if (currentEdit == Empty) currentEdit = MatchOrMismatch;
141 | 				if (currentEdit != MatchOrMismatch)
142 | 				{
143 | 					addCigarItem(cigar, editLength, currentEdit);
144 | 					currentEdit = MatchOrMismatch;
145 | 					editLength = 0;
146 | 				}
147 | 				editLength += 1;
148 | 				if (Common::characterMatch(trace[pos].sequenceCharacter, trace[pos].graphCharacter))
149 | 				{
150 | 					matches += 1;
151 | 				}
152 | 				else
153 | 				{
154 | 					mismatches += 1;
155 | 				}
156 | 			}
157 | 			else if (Common::characterMatch(trace[pos].sequenceCharacter, trace[pos].graphCharacter))
158 | 			{
159 | 				if (currentEdit == Empty) currentEdit = Match;
160 | 				if (currentEdit != Match)
161 | 				{
162 | 					addCigarItem(cigar, editLength, currentEdit);
163 | 					currentEdit = Match;
164 | 					editLength = 0;
165 | 				}
166 | 				editLength += 1;
167 | 				matches += 1;
168 | 			}
169 | 			else
170 | 			{
171 | 				if (currentEdit == Empty) currentEdit = Mismatch;
172 | 				if (currentEdit != Mismatch)
173 | 				{
174 | 					addCigarItem(cigar, editLength, currentEdit);
175 | 					currentEdit = Mismatch;
176 | 					editLength = 0;
177 | 				}
178 | 				editLength += 1;
179 | 				mismatches += 1;
180 | 			}
181 | 			if (insideNode)
182 | 			{
183 | 				assert(trace[pos-1].nodeSwitch || newPos.nodeId == currentPos.nodeId);
184 | 				assert(trace[pos-1].nodeSwitch || newPos.reverse == currentPos.reverse);
185 | 			}
186 | 		}
187 | 
188 | 		assert(matches + mismatches + deletions + insertions == trace.size());
189 | 		addCigarItem(cigar, editLength, currentEdit);
190 | 
191 | 		nodePathEnd = nodePathLen - (params.graph.originalNodeSize.at(trace.back().DPposition.node) - 1 - trace.back().DPposition.nodeOffset);
192 | 
193 | 		std::stringstream sstr;
194 | 		sstr << readName << "\t" << readLen << "\t" << readStart << "\t" << readEnd << "\t" << (strand ? "+" : "-") << "\t" << nodePath.str() << "\t" << nodePathLen << "\t" << nodePathStart << "\t" << nodePathEnd << "\t" << matches << "\t" << blockLength << "\t" << mappingQuality;
195 | 		sstr << "\t" << "NM:i:" << (mismatches + deletions + insertions);
196 | 		sstr << "\t" << "dv:f:" << 1.0-((double)matches / (double)(matches + mismatches + deletions + insertions));
197 | 		sstr << "\t" << "id:f:" << ((double)matches / (double)(matches + mismatches + deletions + insertions));
198 | 		sstr << "\t" << "cg:Z:" << cigar.str();
199 | 		return sstr.str();
200 | 	}
201 | 
202 | private:
203 | 
204 | 	static void addPosToString(std::stringstream& str, MergedNodePos pos, const Params& params)
205 | 	{
206 | 		if (pos.reverse)
207 | 		{
208 | 			str << "<";
209 | 		}
210 | 		else
211 | 		{
212 | 			str << ">";
213 | 		}
214 | 		std::string nodeName = params.graph.originalNodeName.at(pos.nodeId);
215 | 		if (nodeName == "")
216 | 		{
217 | 			str << pos.nodeId/2;
218 | 		}
219 | 		else
220 | 		{
221 | 			str << nodeName;
222 | 		}
223 | 	}
224 | 
225 | 	static void addCigarItem(std::stringstream& str, size_t editLength, EditType type)
226 | 	{
227 | 		if (editLength == 0) return;
228 | 		str << editLength;
229 | 		switch(type)
230 | 		{
231 | 			case MatchOrMismatch:
232 | 				str << "M";
233 | 				break;
234 | 			case Match:
235 | 				str << "=";
236 | 				break;
237 | 			case Mismatch:
238 | 				str << "X";
239 | 				break;
240 | 			case Insertion:
241 | 				str << "I";
242 | 				break;
243 | 			case Deletion:
244 | 				str << "D";
245 | 				break;
246 | 			case Empty:
247 | 			default:
248 | 				return;
249 | 		}
250 | 	}
251 | };
252 | 
253 | #endif


--------------------------------------------------------------------------------
/src/GraphAlignerVGAlignment.h:
--------------------------------------------------------------------------------
  1 | #ifndef GraphAlignerVGAlignment_h
  2 | #define GraphAlignerVGAlignment_h
  3 | 
  4 | #include <string>
  5 | #include <vector>
  6 | #include "AlignmentGraph.h"
  7 | #include "vg.pb.h"
  8 | #include "NodeSlice.h"
  9 | #include "CommonUtils.h"
 10 | #include "ThreadReadAssertion.h"
 11 | #include "GraphAlignerCommon.h"
 12 | 
 13 | template <typename LengthType, typename ScoreType, typename Word>
 14 | class GraphAlignerVGAlignment
 15 | {
 16 | 	using Common = GraphAlignerCommon<LengthType, ScoreType, Word>;
 17 | 	using Params = typename Common::Params;
 18 | 	using MatrixPosition = typename Common::MatrixPosition;
 19 | 	using TraceItem = typename Common::TraceItem;
 20 | 	struct MergedNodePos
 21 | 	{
 22 | 		int nodeId;
 23 | 		bool reverse;
 24 | 		size_t nodeOffset;
 25 | 		size_t seqPos;
 26 | 	};
 27 | 	enum EditType
 28 | 	{
 29 | 		Match,
 30 | 		Mismatch,
 31 | 		Insertion,
 32 | 		Deletion,
 33 | 		Empty
 34 | 	};
 35 | public:
 36 | 
 37 | 	static std::shared_ptr<vg::Alignment> traceToAlignment(const std::string& seq_id, const std::string& sequence, ScoreType score, const std::vector<TraceItem>& trace, size_t cellsProcessed, bool reverse)
 38 | 	{
 39 | 		if (trace.size() == 0) return nullptr;
 40 | 		vg::Alignment* aln = new vg::Alignment;
 41 | 		std::shared_ptr<vg::Alignment> result { aln };
 42 | 		result->set_name(seq_id);
 43 | 		result->set_score(score);
 44 | 		result->set_sequence(sequence);
 45 | 		auto path = new vg::Path;
 46 | 		result->set_allocated_path(path);
 47 | 		MergedNodePos currentPos;
 48 | 		currentPos.nodeId = trace[0].DPposition.node;
 49 | 		currentPos.reverse = (trace[0].DPposition.node % 2) == 1;
 50 | 		currentPos.nodeOffset = trace[0].DPposition.nodeOffset;
 51 | 		currentPos.seqPos = trace[0].DPposition.seqPos;
 52 | 		int rank = 0;
 53 | 		auto vgmapping = path->add_mapping();
 54 | 		auto position = new vg::Position;
 55 | 		vgmapping->set_allocated_position(position);
 56 | 		vgmapping->set_rank(rank);
 57 | 		auto edit = vgmapping->add_edit();
 58 | 		EditType currentEdit = Empty;
 59 | 		size_t mismatches = 0;
 60 | 		size_t deletions = 0;
 61 | 		size_t insertions = 0;
 62 | 		size_t matches = 0;
 63 | 		if (Common::characterMatch(trace[0].sequenceCharacter, trace[0].graphCharacter))
 64 | 		{
 65 | 			currentEdit = Match;
 66 | 			edit->set_from_length(edit->from_length()+1);
 67 | 			edit->set_to_length(edit->to_length()+1);
 68 | 			matches += 1;
 69 | 		}
 70 | 		else
 71 | 		{
 72 | 			currentEdit = Mismatch;
 73 | 			edit->set_from_length(edit->from_length()+1);
 74 | 			edit->set_to_length(edit->to_length()+1);
 75 | 			edit->set_sequence(std::string { sequence[0] });
 76 | 			mismatches += 1;
 77 | 		}
 78 | 		position->set_node_id(currentPos.nodeId);
 79 | 		position->set_is_reverse(currentPos.reverse);
 80 | 		position->set_offset(currentPos.nodeOffset);
 81 | 		for (size_t pos = 1; pos < trace.size(); pos++)
 82 | 		{
 83 | 			assert(trace[pos].DPposition.seqPos < sequence.size());
 84 | 			MergedNodePos newPos;
 85 | 			newPos.nodeId = trace[pos].DPposition.node;
 86 | 			newPos.reverse = (trace[pos].DPposition.node % 2) == 1;
 87 | 			newPos.nodeOffset = trace[pos].DPposition.nodeOffset;
 88 | 			newPos.seqPos = trace[pos].DPposition.seqPos;
 89 | 			bool insideNode = !trace[pos-1].nodeSwitch || (newPos.nodeId == currentPos.nodeId && newPos.reverse == currentPos.reverse && newPos.nodeOffset > currentPos.nodeOffset);
 90 | 
 91 | 			assert(newPos.seqPos >= currentPos.seqPos);
 92 | 
 93 | 			if (!insideNode)
 94 | 			{
 95 | 				rank++;
 96 | 				currentPos = newPos;
 97 | 				vgmapping = path->add_mapping();
 98 | 				position = new vg::Position;
 99 | 				vgmapping->set_allocated_position(position);
100 | 				vgmapping->set_rank(rank);
101 | 				position->set_offset(currentPos.nodeOffset);
102 | 				position->set_node_id(currentPos.nodeId);
103 | 				position->set_is_reverse(currentPos.reverse);
104 | 				edit = vgmapping->add_edit();
105 | 				currentEdit = Empty;
106 | 			}
107 | 
108 | 			if (trace[pos-1].DPposition.seqPos == trace[pos].DPposition.seqPos)
109 | 			{
110 | 				if (currentEdit == Empty) currentEdit = Deletion;
111 | 				if (currentEdit != Deletion)
112 | 				{
113 | 					edit = vgmapping->add_edit();
114 | 					currentEdit = Deletion;
115 | 				}
116 | 				edit->set_from_length(edit->from_length()+1);
117 | 				deletions += 1;
118 | 			}
119 | 			else if (insideNode && trace[pos-1].DPposition.nodeOffset == trace[pos].DPposition.nodeOffset)
120 | 			{
121 | 				if (currentEdit == Empty) currentEdit = Insertion;
122 | 				if (currentEdit != Insertion)
123 | 				{
124 | 					edit = vgmapping->add_edit();
125 | 					currentEdit = Insertion;
126 | 				}
127 | 				edit->set_to_length(edit->to_length()+1);
128 | 				edit->set_sequence(edit->sequence() + trace[pos].sequenceCharacter);
129 | 				insertions += 1;
130 | 			}
131 | 			else if (Common::characterMatch(trace[pos].sequenceCharacter, trace[pos].graphCharacter))
132 | 			{
133 | 				if (currentEdit == Empty) currentEdit = Match;
134 | 				if (currentEdit != Match)
135 | 				{
136 | 					edit = vgmapping->add_edit();
137 | 					currentEdit = Match;
138 | 				}
139 | 				edit->set_from_length(edit->from_length()+1);
140 | 				edit->set_to_length(edit->to_length()+1);
141 | 				matches += 1;
142 | 			}
143 | 			else
144 | 			{
145 | 				if (currentEdit == Empty) currentEdit = Mismatch;
146 | 				if (currentEdit != Mismatch)
147 | 				{
148 | 					edit = vgmapping->add_edit();
149 | 					currentEdit = Mismatch;
150 | 				}
151 | 				edit->set_from_length(edit->from_length()+1);
152 | 				edit->set_to_length(edit->to_length()+1);
153 | 				edit->set_sequence(edit->sequence() + trace[pos].sequenceCharacter);
154 | 				mismatches += 1;
155 | 			}
156 | 			if (insideNode)
157 | 			{
158 | 				assert(trace[pos-1].nodeSwitch || newPos.nodeId == currentPos.nodeId);
159 | 				assert(trace[pos-1].nodeSwitch || newPos.reverse == currentPos.reverse);
160 | 			}
161 | 		}
162 | 		result->set_identity((double)matches / (double)(matches + mismatches + insertions + deletions));
163 | 		assert(currentEdit != Empty);
164 | 		return result;
165 | 	}
166 | 
167 | 	static bool posEqual(const vg::Position& pos1, const vg::Position& pos2)
168 | 	{
169 | 		return pos1.node_id() == pos2.node_id() && pos1.is_reverse() == pos2.is_reverse();
170 | 	}
171 | };
172 | 
173 | #endif


--------------------------------------------------------------------------------
/src/GraphAlignerWrapper.cpp:
--------------------------------------------------------------------------------
 1 | //split this here so modifying GraphAligner.h doesn't require recompiling every cpp file
 2 | 
 3 | #include <limits>
 4 | #include "GraphAlignerWrapper.h"
 5 | #include "GraphAligner.h"
 6 | #include "ThreadReadAssertion.h"
 7 | 
 8 | AlignmentResult AlignOneWay(const AlignmentGraph& graph, const std::string& seq_id, const std::string& sequence, size_t initialBandwidth, size_t rampBandwidth, bool quietMode, GraphAlignerCommon<size_t, int32_t, uint64_t>::AlignerGraphsizedState& reusableState, bool lowMemory, bool forceGlobal, bool preciseClipping, bool nondeterministicOptimizations, double preciseClippingIdentityCutoff, int Xdropcutoff, size_t DPRestartStride)
 9 | {
10 | 	GraphAlignerCommon<size_t, int32_t, uint64_t>::Params params {initialBandwidth, rampBandwidth, graph, std::numeric_limits<size_t>::max(), quietMode, false, lowMemory, forceGlobal, preciseClipping, 1, 0, nondeterministicOptimizations, preciseClippingIdentityCutoff, Xdropcutoff};
11 | 	GraphAligner<size_t, int32_t, uint64_t> aligner {params};
12 | 	return aligner.AlignOneWay(seq_id, sequence, reusableState, DPRestartStride);
13 | }
14 | 
15 | AlignmentResult AlignOneWayDijkstra(const AlignmentGraph& graph, const std::string& seq_id, const std::string& sequence, bool quietMode, GraphAlignerCommon<size_t, int32_t, uint64_t>::AlignerGraphsizedState& reusableState, bool forceGlobal, bool preciseClipping)
16 | {
17 | 	GraphAlignerCommon<size_t, int32_t, uint64_t>::Params params {1, 1, graph, std::numeric_limits<size_t>::max(), quietMode, false, true, forceGlobal, preciseClipping, 1, 0, false, .5, 0};
18 | 	GraphAligner<size_t, int32_t, uint64_t> aligner {params};
19 | 	return aligner.AlignOneWayDijkstra(seq_id, sequence, reusableState);
20 | }
21 | 
22 | AlignmentResult AlignOneWay(const AlignmentGraph& graph, const std::string& seq_id, const std::string& sequence, size_t initialBandwidth, size_t rampBandwidth, size_t maxCellsPerSlice, bool quietMode, bool sloppyOptimizations, const std::vector<SeedHit>& seedHits, GraphAlignerCommon<size_t, int32_t, uint64_t>::AlignerGraphsizedState& reusableState, bool lowMemory, bool forceGlobal, bool preciseClipping, size_t minClusterSize, double seedExtendDensity, bool nondeterministicOptimizations, double preciseClippingIdentityCutoff, int Xdropcutoff, long long l, long long r, long long offset)
23 | {
24 | 	GraphAlignerCommon<size_t, int32_t, uint64_t>::Params params {initialBandwidth, rampBandwidth, graph, maxCellsPerSlice, quietMode, sloppyOptimizations, lowMemory, forceGlobal, preciseClipping, minClusterSize, seedExtendDensity, nondeterministicOptimizations, preciseClippingIdentityCutoff, Xdropcutoff};
25 | 	GraphAligner<size_t, int32_t, uint64_t> aligner {params};
26 | 	if (l == -1) l = 0;
27 | 	if (r == -1) r = seedHits.size();
28 | 	return aligner.AlignOneWay(seq_id, sequence, seedHits, reusableState, l, r, offset);
29 | }
30 | 
31 | void AddAlignment(const std::string& seq_id, const std::string& sequence, AlignmentResult::AlignmentItem& alignment)
32 | {
33 | 	GraphAlignerCommon<size_t, int32_t, uint64_t>::Params params {1, 1, AlignmentGraph::DummyGraph(), 1, true, true, true, false, false, 1, 0, false, .5, 0};
34 | 	GraphAligner<size_t, int32_t, uint64_t> aligner {params};
35 | 	aligner.AddAlignment(seq_id, sequence, alignment);
36 | }
37 | 
38 | void AddGAFLine(const AlignmentGraph& graph, const std::string& seq_id, const std::string& sequence, AlignmentResult::AlignmentItem& alignment, bool cigarMatchMismatchMerge)
39 | {
40 | 	GraphAlignerCommon<size_t, int32_t, uint64_t>::Params params {1, 1, graph, 1, true, true, true, false, false, 1, 0, false, .5, 0};
41 | 	GraphAligner<size_t, int32_t, uint64_t> aligner {params};
42 | 	aligner.AddGAFLine(seq_id, sequence, alignment, cigarMatchMismatchMerge);
43 | }
44 | 
45 | void AddCorrected(AlignmentResult::AlignmentItem& alignment)
46 | {
47 | 	GraphAlignerCommon<size_t, int32_t, uint64_t>::Params params {1, 1, AlignmentGraph::DummyGraph(), 1, true, true, true, false, false, 1, 0, false, .5, 0};
48 | 	GraphAligner<size_t, int32_t, uint64_t> aligner {params};
49 | 	aligner.AddCorrected(alignment);
50 | }
51 | 
52 | void OrderSeeds(const AlignmentGraph& graph, std::vector<SeedHit>& seedHits)
53 | {
54 | 	GraphAlignerCommon<size_t, int32_t, uint64_t>::Params params {1, 1, graph, 1, true, true, true, false, false, 1, 0, false, .5, 0};
55 | 	GraphAligner<size_t, int32_t, uint64_t> aligner {params};
56 | 	aligner.orderSeedsByChaining(seedHits);
57 | }
58 | 
59 | 
60 | void OrderSeedsCLC(const AlignmentGraph& graph, std::vector<SeedHit>& seedHits)
61 | {
62 | 	GraphAlignerCommon<size_t, int32_t, uint64_t>::Params params {1, 1, graph, 1, true, true, true, false, false, 1, 0, false, .5, 0};
63 | 	GraphAligner<size_t, int32_t, uint64_t> aligner {params};
64 | 	aligner.orderSeedsByChainingCLC(seedHits);
65 | }
66 | 


--------------------------------------------------------------------------------
/src/GraphAlignerWrapper.h:
--------------------------------------------------------------------------------
 1 | //split this here so modifying GraphAligner.h doesn't require recompiling every cpp file
 2 | 
 3 | #ifndef GraphAlignerWrapper_h
 4 | #define GraphAlignerWrapper_h
 5 | 
 6 | #include <tuple>
 7 | #include "vg.pb.h"
 8 | #include "GraphAlignerCommon.h"
 9 | #include "AlignmentGraph.h"
10 | 
// One seed hit: a match of length matchLen between read position seqPos and
// offset nodeOffset of node nodeID, in the given orientation.
// The alignment-graph coordinates start out as the maximum size_t value and the
// derived goodness / cluster size start out as 0.
class SeedHit
{
public:
	SeedHit(int nodeID, size_t nodeOffset, size_t seqPos, size_t matchLen, size_t rawSeedGoodness, bool reverse) :
	nodeID(nodeID),
	nodeOffset(nodeOffset),
	seqPos(seqPos),
	matchLen(matchLen),
	reverse(reverse),
	rawSeedGoodness(rawSeedGoodness)
	{
	}
	int nodeID;
	size_t nodeOffset;
	size_t seqPos;
	size_t matchLen;
	bool reverse;
	// maximum size_t value until assigned
	size_t alignmentGraphNodeId = std::numeric_limits<size_t>::max();
	size_t alignmentGraphNodeOffset = std::numeric_limits<size_t>::max();
	size_t rawSeedGoodness;
	size_t seedGoodness = 0;
	size_t seedClusterSize = 0;
};
38 | 
// Align a read against the graph without seed hits.
AlignmentResult AlignOneWay(const AlignmentGraph& graph, const std::string& seq_id, const std::string& sequence, size_t initialBandwidth, size_t rampBandwidth, bool quietMode, GraphAlignerCommon<size_t, int32_t, uint64_t>::AlignerGraphsizedState& reusableState, bool lowMemory, bool forceGlobal, bool preciseClipping, bool nondeterministicOptimizations, double preciseClippingIdentityCutoff, int Xdropcutoff, size_t DPRestartStride);
// Align a read against the graph with the Dijkstra variant of the aligner.
AlignmentResult AlignOneWayDijkstra(const AlignmentGraph& graph, const std::string& seq_id, const std::string& sequence, bool quietMode, GraphAlignerCommon<size_t, int32_t, uint64_t>::AlignerGraphsizedState& reusableState, bool forceGlobal, bool preciseClipping);
// Align a read against the graph using seed hits. l and r default to -1, which
// the implementation replaces by 0 and seedHits.size() respectively.
AlignmentResult AlignOneWay(const AlignmentGraph& graph, const std::string& seq_id, const std::string& sequence, size_t initialBandwidth, size_t rampBandwidth, size_t maxCellsPerSlice, bool quietMode, bool sloppyOptimizations, const std::vector<SeedHit>& seedHits, GraphAlignerCommon<size_t, int32_t, uint64_t>::AlignerGraphsizedState& reusableState, bool lowMemory, bool forceGlobal, bool preciseClipping, size_t minClusterSize, double seedExtendDensity, bool nondeterministicOptimizations, double preciseClippingIdentityCutoff, int Xdropcutoff, long long l = -1, long long r = -1, long long offset = 0);

// Post-processing helpers that act on an existing alignment item; each forwards
// to the GraphAligner member function of the same name.
void AddAlignment(const std::string& seq_id, const std::string& sequence, AlignmentResult::AlignmentItem& alignment);
void AddGAFLine(const AlignmentGraph& graph, const std::string& seq_id, const std::string& sequence, AlignmentResult::AlignmentItem& alignment, bool cigarMatchMismatchMerge);
void AddCorrected(AlignmentResult::AlignmentItem& alignment);
// Reorder seedHits in place (forwarded to orderSeedsByChaining / orderSeedsByChainingCLC).
void OrderSeeds(const AlignmentGraph& graph, std::vector<SeedHit>& seedHits);
void OrderSeedsCLC(const AlignmentGraph& graph, std::vector<SeedHit>& seedHits);
48 | 
49 | #endif


--------------------------------------------------------------------------------
/src/MafToAlignment.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <vector>
  3 | #include <algorithm>
  4 | #include "CommonUtils.h"
  5 | #include "vg.pb.h"
  6 | #include "stream.hpp"
  7 | 
// One alignment block read from a MAF file.
struct MafEntry {
	// name given to the vg::Alignment built from this entry
	std::string readname;
	// sequence text read from the file
	std::string realsequence;
	// first index into the position-to-node table covered by this entry
	int startpos;
	// number of consecutive positions covered, starting at startpos
	int length;
	// when true, the node order is reversed and each node's orientation is flipped
	bool backward;
};
 15 | 
 16 | std::vector<vg::Alignment> mafsToAlignments(const std::vector<MafEntry>& mafs, const std::vector<int>& posToNode, const std::map<int, int>& nodeSize, const std::map<int, bool>& nodeIsReverse)
 17 | {
 18 | 	std::vector<vg::Alignment> result;
 19 | 	for (size_t i = 0; i < mafs.size(); i++)
 20 | 	{
 21 | 		std::vector<int> nodeIds;
 22 | 		nodeIds.push_back(posToNode[mafs[i].startpos]);
 23 | 		for (int j = 1; j < mafs[i].length; j++)
 24 | 		{
 25 | 			if (posToNode[mafs[i].startpos+j] != nodeIds.back())
 26 | 			{
 27 | 				nodeIds.push_back(posToNode[mafs[i].startpos+j]);
 28 | 			}
 29 | 		}
 30 | 		if (mafs[i].backward)
 31 | 		{
 32 | 			std::reverse(nodeIds.begin(), nodeIds.end());
 33 | 		}
 34 | 		vg::Alignment mafResult;
 35 | 		mafResult.set_name(mafs[i].readname);
 36 | 		auto path = new vg::Path;
 37 | 		mafResult.set_allocated_path(path);
 38 | 		for (size_t j = 0; j < nodeIds.size(); j++)
 39 | 		{
 40 | 			auto vgmapping = path->add_mapping();
 41 | 			auto position = new vg::Position;
 42 | 			vgmapping->set_allocated_position(position);
 43 | 			vgmapping->set_rank(j);
 44 | 			position->set_node_id(nodeIds[j]);
 45 | 			position->set_is_reverse(nodeIsReverse.at(nodeIds[j]) ^ mafs[i].backward);
 46 | 			auto edit = vgmapping->add_edit();
 47 | 			edit->set_from_length(nodeSize.at(nodeIds[j]));
 48 | 		}
 49 | 		result.push_back(mafResult);
 50 | 	}
 51 | 	return result;
 52 | }
 53 | 
 54 | std::vector<MafEntry> getMafEntries(std::string filename)
 55 | {
 56 | 	std::vector<MafEntry> result;
 57 | 
 58 | 	std::ifstream mafFile { filename };
 59 | 	while (mafFile.good())
 60 | 	{
 61 | 		std::string line;
 62 | 		std::getline(mafFile, line);
 63 | 		std::string a, b, direction;
 64 | 		if (line.size() == 0 || line[0] != 'a') continue;
 65 | 		MafEntry maf;
 66 | 		std::string checks, checkref;
 67 | 		mafFile >> checks >> checkref;
 68 | 		assert(checkref == "ref");
 69 | 		assert(checks == "s");
 70 | 		mafFile >> maf.startpos >> maf.length;
 71 | 		mafFile >> a >> b;
 72 | 		mafFile >> maf.realsequence;
 73 | 		//https://stackoverflow.com/questions/20406744/how-to-find-and-replace-all-occurrences-of-a-substring-in-a-string
 74 | 		std::string::size_type n = 0;
 75 | 		while ((n = maf.realsequence.find("-", n)) != std::string::npos)
 76 | 		{
 77 | 			maf.realsequence.replace(n, 1, "");
 78 | 		}
 79 | 		mafFile >> checks >> maf.readname;
 80 | 		assert(checks == "s");
 81 | 		mafFile >> a >> b >> direction;
 82 | 		if (direction == "-")
 83 | 		{
 84 | 			maf.realsequence = CommonUtils::ReverseComplement(maf.realsequence);
 85 | 		}
 86 | 		result.push_back(maf);
 87 | 	}
 88 | 
 89 | 	return result;
 90 | }
 91 | 
 92 | int main(int argc, char** argv)
 93 | {
 94 | 	vg::Graph graph = CommonUtils::LoadVGGraph(argv[1]);
 95 | 
 96 | 	vg::Alignment referenceAlignment;
 97 | 	{
 98 | 		std::ifstream referenceFile { argv[2], std::ios::in | std::ios::binary };
 99 | 		std::function<void(vg::Alignment&)> lambda = [&referenceAlignment](vg::Alignment& g) {
100 | 			referenceAlignment = g;
101 | 		};
102 | 		stream::for_each(referenceFile, lambda);
103 | 	}
104 | 
105 | 	std::vector<int> posToNode;
106 | 	std::map<int, bool> nodeIsReverse;
107 | 	std::map<int, int> nodeSizes;
108 | 
109 | 	for (int i = 0; i < referenceAlignment.path().mapping_size(); i++)
110 | 	{
111 | 		auto mapping = referenceAlignment.path().mapping(i);
112 | 		int currentNodeSize = mapping.edit(0).to_length();
113 | 		for (int j = 0; j < currentNodeSize; j++)
114 | 		{
115 | 			posToNode.push_back(mapping.position().node_id());
116 | 		}
117 | 		nodeIsReverse[mapping.position().node_id()] = mapping.position().is_reverse();
118 | 	}
119 | 
120 | 	for (int i = 0; i < graph.node_size(); i++)
121 | 	{
122 | 		nodeSizes[graph.node(i).id()] = graph.node(i).sequence().size();
123 | 	}
124 | 
125 | 	auto mafs = getMafEntries(argv[3]);
126 | 	auto alignments = mafsToAlignments(mafs, posToNode, nodeSizes, nodeIsReverse);
127 | 
128 | 	std::ofstream alignmentOut { argv[4], std::ios::out | std::ios::binary };
129 | 	stream::write_buffered(alignmentOut, alignments, 0);
130 | 
131 | 	std::ofstream readsOut { argv[5], std::ios::out };
132 | 	for (size_t i = 0; i < mafs.size(); i++)
133 | 	{
134 | 		readsOut << ">" << mafs[i].readname << std::endl << mafs[i].realsequence << std::endl;
135 | 	}
136 | 
137 | }


--------------------------------------------------------------------------------
/src/MinimizerSeeder.h:
--------------------------------------------------------------------------------
 1 | #ifndef MinimizerSeeder_h
 2 | #define MinimizerSeeder_h
 3 | 
 4 | #include <random>
 5 | #include <vector>
 6 | #include <string>
 7 | #include <sdsl/int_vector.hpp>
 8 | #include <sdsl/select_support_mcl.hpp>
 9 | #include <ParallelBB.h>
10 | #include "AlignmentGraph.h"
11 | #include "GraphAlignerWrapper.h"
12 | #include "BooPHF.h"
13 | 
// Seeder over an AlignmentGraph configured with a minimizer length and a
// window size. Only the declaration lives here; the member functions are
// defined in another translation unit.
class MinimizerSeeder
{
	// Per-bucket index storage: a BooPHF minimal perfect hash plus three
	// sdsl integer vectors. Buckets are move-only.
	struct KmerBucket
	{
		KmerBucket();
		KmerBucket(const KmerBucket& other) = delete;
		KmerBucket(KmerBucket&& other) = default;
		~KmerBucket();
		KmerBucket& operator=(const KmerBucket& other) = delete;
		KmerBucket& operator=(KmerBucket&& other) = default;
		typedef boomphf::SingleHashFunctor<uint64_t> hasher_t;
		typedef boomphf::mphf<uint64_t, hasher_t> boophf_t;
		// NOTE(review): raw pointer combined with defaulted move operations.
		// A defaulted move copies the pointer without clearing the source, so
		// if the destructor (defined elsewhere) deletes `locator`, a moved-from
		// bucket would free it a second time - confirm against the .cpp.
		boophf_t* locator;
		sdsl::int_vector<0> kmerCheck;
		sdsl::int_vector<0> startPos;
		sdsl::int_vector<0> positions;
	};
public:
	MinimizerSeeder(const AlignmentGraph& graph, size_t minimizerLength, size_t windowSize, size_t numThreads, double keepLeastFrequentFraction);
	// Returns seed hits for `sequence`; `density` presumably limits how many
	// seeds are kept per unit of read length - TODO confirm in the definition.
	std::vector<SeedHit> getSeeds(const std::string& sequence, double density) const;
	bool canSeed() const;
private:
	void addMinimizers(std::vector<SeedHit>& result, std::vector<std::tuple<size_t, size_t, size_t, size_t>>& matchIndices, size_t maxCount) const;
	size_t getStart(size_t bucket, size_t index) const;
	size_t getBucket(size_t hash) const;
	SeedHit matchToSeedHit(int nodeId, size_t nodeOffset, size_t seqPos, int count) const;
	void initMinimizers(size_t numThreads);
	void initMaxCount(double keepLeastFrequentFraction);
	// Held by reference: the graph must outlive the seeder.
	const AlignmentGraph& graph;
	std::vector<KmerBucket> buckets;
	size_t minimizerLength;
	size_t windowSize;
	size_t maxCount;
};
48 | 
49 | #endif
50 | 


--------------------------------------------------------------------------------
/src/MummerSeeder.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <boost/archive/text_oarchive.hpp>
  3 | #include <boost/archive/text_iarchive.hpp>
  4 | #include <boost/serialization/vector.hpp>
  5 | #include "CommonUtils.h"
  6 | #include "MummerSeeder.h"
  7 | 
  8 | char lowercaseRef(char c)
  9 | {
 10 | 	switch(c)
 11 | 	{
 12 | 		case 'a':
 13 | 		case 'A':
 14 | 			return 'a';
 15 | 		case 'c':
 16 | 		case 'C':
 17 | 			return 'c';
 18 | 		case 'g':
 19 | 		case 'G':
 20 | 			return 'g';
 21 | 		case 'u':
 22 | 		case 'U':
 23 | 		case 't':
 24 | 		case 'T':
 25 | 			return 't';
 26 | 		default:
 27 | 		case '`':
 28 | 			return '`';
 29 | 	}
 30 | 	assert(false);
 31 | 	return std::numeric_limits<char>::max();
 32 | }
 33 | 
 34 | char lowercaseSeq(char c)
 35 | {
 36 | 	switch(c)
 37 | 	{
 38 | 		case 'a':
 39 | 		case 'A':
 40 | 			return 'a';
 41 | 		case 'c':
 42 | 		case 'C':
 43 | 			return 'c';
 44 | 		case 'g':
 45 | 		case 'G':
 46 | 			return 'g';
 47 | 		case 'u':
 48 | 		case 'U':
 49 | 		case 't':
 50 | 		case 'T':
 51 | 			return 't';
 52 | 		default:
 53 | 			return 'x';
 54 | 	}
 55 | 	assert(false);
 56 | 	return std::numeric_limits<char>::max();
 57 | }
 58 | 
 59 | bool fileExists(const std::string& fileName)
 60 | {
 61 | 	std::ifstream file { fileName };
 62 | 	return file.good();
 63 | }
 64 | 
 65 | MummerSeeder::MummerSeeder(const GfaGraph& graph, const std::string& cachePrefix)
 66 | {
 67 | 	if (cachePrefix.size() > 0 && fileExists(cachePrefix + ".aux"))
 68 | 	{
 69 | 		loadFrom(cachePrefix);
 70 | 	}
 71 | 	else
 72 | 	{
 73 | 		initTree(graph);
 74 | 		if (cachePrefix.size() > 0) saveTo(cachePrefix);
 75 | 	}
 76 | }
 77 | 
 78 | MummerSeeder::MummerSeeder(const vg::Graph& graph, const std::string& cachePrefix)
 79 | {
 80 | 	if (cachePrefix.size() > 0 && fileExists(cachePrefix + ".aux"))
 81 | 	{
 82 | 		loadFrom(cachePrefix);
 83 | 	}
 84 | 	else
 85 | 	{
 86 | 		initTree(graph);
 87 | 		if (cachePrefix.size() > 0) saveTo(cachePrefix);
 88 | 	}
 89 | }
 90 | 
 91 | void MummerSeeder::initTree(const GfaGraph& graph)
 92 | {
 93 | 	for (auto node : graph.nodes)
 94 | 	{
 95 | 		nodePositions.push_back(seq.size());
 96 | 		nodeIDs.push_back(node.first);
 97 | 		seq += node.second;
 98 | 		seq += '`';
 99 | 	}
100 | 	nodePositions.push_back(seq.size());
101 | 	for (size_t i = 0; i < seq.size(); i++)
102 | 	{
103 | 		seq[i] = lowercaseRef(seq[i]);
104 | 	}
105 | 	seq.shrink_to_fit();
106 | 	matcher = std::make_unique<mummer::mummer::sparseSA>(mummer::mummer::sparseSA::create_auto(seq.c_str(), seq.size(), 0, true));
107 | }
108 | 
109 | void MummerSeeder::initTree(const vg::Graph& graph)
110 | {
111 | 	for (int i = 0; i < graph.node_size(); i++)
112 | 	{
113 | 		nodePositions.push_back(seq.size());
114 | 		nodeIDs.push_back(graph.node(i).id());
115 | 		seq += graph.node(i).sequence();
116 | 		seq += '`';
117 | 	}
118 | 	nodePositions.push_back(seq.size());
119 | 	for (size_t i = 0; i < seq.size(); i++)
120 | 	{
121 | 		seq[i] = lowercaseRef(seq[i]);
122 | 	}
123 | 	seq.shrink_to_fit();
124 | 	matcher = std::make_unique<mummer::mummer::sparseSA>(mummer::mummer::sparseSA::create_auto(seq.c_str(), seq.size(), 0, true));
125 | }
126 | 
127 | size_t MummerSeeder::getNodeIndex(size_t indexPos) const
128 | {
129 | 	assert(indexPos < nodePositions.back());
130 | 	auto next = std::upper_bound(nodePositions.begin(), nodePositions.end(), indexPos);
131 | 	assert(next != nodePositions.begin());
132 | 	size_t index = (next - nodePositions.begin()) - 1;
133 | 	assert(index < nodePositions.size()-1);
134 | 	return index;
135 | }
136 | 
137 | void MummerSeeder::saveTo(const std::string& prefix) const
138 | {
139 | 	std::ofstream file { prefix + ".aux", std::ios::binary };
140 | 	{
141 | 		boost::archive::text_oarchive oa(file);
142 | 		oa << seq;
143 | 		oa << nodePositions;
144 | 		oa << nodeIDs;
145 | 	}
146 | 	matcher->save(prefix + "_index");
147 | }
148 | 
149 | void MummerSeeder::loadFrom(const std::string& prefix)
150 | {
151 | 	std::ifstream file { prefix + ".aux", std::ios::binary };
152 | 	{
153 | 		boost::archive::text_iarchive ia(file);
154 | 		ia >> seq;
155 | 		ia >> nodePositions;
156 | 		ia >> nodeIDs;
157 | 	}
158 | 	// same params that create_auto with minlen=0 passes
159 | 	matcher = std::make_unique<mummer::mummer::sparseSA>(seq, false, 1, true, false, false, 1, 0, true);
160 | 	matcher->load(prefix + "_index");
161 | }
162 | 
163 | struct MatchWithOrientation
164 | {
165 | 	MatchWithOrientation(const mummer::mummer::match_t& match, bool reverse) :
166 | 	match(match),
167 | 	reverse(reverse)
168 | 	{
169 | 	}
170 | 	mummer::mummer::match_t match;
171 | 	bool reverse;
172 | 	bool operator>(const MatchWithOrientation& other) const
173 | 	{
174 | 		return match.len > other.match.len;
175 | 	}
176 | };
177 | 
178 | std::vector<SeedHit> MummerSeeder::getMumSeeds(std::string sequence, size_t maxCount, size_t minLen) const
179 | {
180 | 	for (size_t i = 0; i < sequence.size(); i++)
181 | 	{
182 | 		sequence[i] = lowercaseSeq(sequence[i]);
183 | 	}
184 | 	assert(matcher != nullptr);
185 | 	std::priority_queue<MatchWithOrientation, std::vector<MatchWithOrientation>, std::greater<MatchWithOrientation>> matches;
186 | 	matcher->findMAM_each(sequence, minLen, false, [&matches, maxCount](const mummer::mummer::match_t& match)
187 | 	{
188 | 		if (matches.size() < maxCount)
189 | 		{
190 | 			matches.emplace(match, false);
191 | 			return;
192 | 		}
193 | 		if (matches.top().match.len < match.len)
194 | 		{
195 | 			matches.pop();
196 | 			matches.emplace(match, false);
197 | 		}
198 | 	});
199 | 	revcompInPlace(sequence);
200 | 	matcher->findMAM_each(sequence, minLen, false, [&matches, maxCount](const mummer::mummer::match_t& match)
201 | 	{
202 | 		if (matches.size() < maxCount)
203 | 		{
204 | 			matches.emplace(match, true);
205 | 			return;
206 | 		}
207 | 		if (matches.top().match.len < match.len)
208 | 		{
209 | 			matches.pop();
210 | 			matches.emplace(match, true);
211 | 		}
212 | 	});
213 | 	std::vector<mummer::mummer::match_t> MAMs;
214 | 	std::vector<mummer::mummer::match_t> bwMAMs;
215 | 	while (matches.size() > 0)
216 | 	{
217 | 		if (matches.top().reverse)
218 | 		{
219 | 			bwMAMs.push_back(matches.top().match);
220 | 		}
221 | 		else
222 | 		{
223 | 			MAMs.push_back(matches.top().match);
224 | 		}
225 | 		matches.pop();
226 | 	}
227 | 	auto seeds = matchesToSeeds(sequence.size(), MAMs, bwMAMs);
228 | 	assert(seeds.size() <= maxCount);
229 | 	std::sort(seeds.begin(), seeds.end(), [](const SeedHit& left, const SeedHit& right) { return left.matchLen > right.matchLen; });
230 | 	return seeds;
231 | }
232 | 
233 | std::vector<SeedHit> MummerSeeder::getMemSeeds(std::string sequence, size_t maxCount, size_t minLen) const
234 | {
235 | 	for (size_t i = 0; i < sequence.size(); i++)
236 | 	{
237 | 		sequence[i] = lowercaseSeq(sequence[i]);
238 | 	}
239 | 	assert(matcher != nullptr);
240 | 	std::priority_queue<MatchWithOrientation, std::vector<MatchWithOrientation>, std::greater<MatchWithOrientation>> matches;
241 | 	matcher->findMEM_each(sequence, minLen, false, [&matches, maxCount](const mummer::mummer::match_t& match)
242 | 	{
243 | 		if (matches.size() < maxCount)
244 | 		{
245 | 			matches.emplace(match, false);
246 | 			return;
247 | 		}
248 | 		if (matches.top().match.len < match.len)
249 | 		{
250 | 			matches.pop();
251 | 			matches.emplace(match, false);
252 | 		}
253 | 	});
254 | 	revcompInPlace(sequence);
255 | 	matcher->findMEM_each(sequence, minLen, false, [&matches, maxCount](const mummer::mummer::match_t& match)
256 | 	{
257 | 		if (matches.size() < maxCount)
258 | 		{
259 | 			matches.emplace(match, true);
260 | 			return;
261 | 		}
262 | 		if (matches.top().match.len < match.len)
263 | 		{
264 | 			matches.pop();
265 | 			matches.emplace(match, true);
266 | 		}
267 | 	});
268 | 	std::vector<mummer::mummer::match_t> MAMs;
269 | 	std::vector<mummer::mummer::match_t> bwMAMs;
270 | 	while (matches.size() > 0)
271 | 	{
272 | 		if (matches.top().reverse)
273 | 		{
274 | 			bwMAMs.push_back(matches.top().match);
275 | 		}
276 | 		else
277 | 		{
278 | 			MAMs.push_back(matches.top().match);
279 | 		}
280 | 		matches.pop();
281 | 	}
282 | 	auto seeds = matchesToSeeds(sequence.size(), MAMs, bwMAMs);
283 | 	assert(seeds.size() <= maxCount);
284 | 	std::sort(seeds.begin(), seeds.end(), [](const SeedHit& left, const SeedHit& right) { return left.matchLen > right.matchLen; });
285 | 	return seeds;
286 | }
287 | 
288 | std::vector<SeedHit> MummerSeeder::matchesToSeeds(size_t seqLen, const std::vector<mummer::mummer::match_t>& fwmatches, const std::vector<mummer::mummer::match_t>& bwmatches) const
289 | {
290 | 	std::vector<SeedHit> result;
291 | 	result.reserve(fwmatches.size() + bwmatches.size());
292 | 	for (auto match : fwmatches)
293 | 	{
294 | 		assert(match.ref + match.len <= nodePositions.back());
295 | 		auto index = getNodeIndex(match.ref);
296 | 		int nodeID = nodeIDs[index];
297 | 		size_t nodeOffset = match.ref - nodePositions[index];
298 | 		size_t seqPos = match.query;
299 | 		size_t matchLen = match.len;
300 | 		result.emplace_back(nodeID, nodeOffset, seqPos, matchLen, matchLen, false);
301 | 	}
302 | 	for (auto match : bwmatches)
303 | 	{
304 | 		assert(match.ref + match.len <= nodePositions.back());
305 | 		auto index = getNodeIndex(match.ref);
306 | 		int nodeID = nodeIDs[index];
307 | 		size_t nodeOffset = match.ref - nodePositions[index];
308 | 		size_t seqPos = match.query;
309 | 		size_t matchLen = match.len;
310 | 		assert(match.len > 0);
311 | 		assert(nodeOffset + matchLen <= nodeLength(index));
312 | 		assert(seqPos + matchLen <= seqLen);
313 | 		nodeOffset = nodeLength(index) - nodeOffset - matchLen;
314 | 		seqPos = seqLen - seqPos - matchLen;
315 | 		assert(nodeOffset < nodeLength(index));
316 | 		assert(seqPos < seqLen);
317 | 		result.emplace_back(nodeID, nodeOffset, seqPos, matchLen, matchLen, true);
318 | 	}
319 | 	return result;
320 | }
321 | 
322 | size_t MummerSeeder::nodeLength(size_t indexPos) const
323 | {
324 | 	//-1 for separator
325 | 	return nodePositions[indexPos+1] - nodePositions[indexPos] - 1;
326 | }
327 | 
328 | void MummerSeeder::revcompInPlace(std::string& seq) const
329 | {
330 | 	std::reverse(seq.begin(), seq.end());
331 | 	for (size_t i = 0; i < seq.size(); i++)
332 | 	{
333 | 		switch(seq[i])
334 | 		{
335 | 			case 'a':
336 | 				seq[i] = 't';
337 | 				break;
338 | 			case 'u':
339 | 			case 't':
340 | 				seq[i] = 'a';
341 | 				break;
342 | 			case 'c':
343 | 				seq[i] = 'g';
344 | 				break;
345 | 			case 'g':
346 | 				seq[i] = 'c';
347 | 				break;
348 | 			default:
349 | 				seq[i] = 'x';
350 | 				break;
351 | 		}
352 | 	}
353 | }
354 | 


--------------------------------------------------------------------------------
/src/MummerSeeder.h:
--------------------------------------------------------------------------------
 1 | #ifndef MummerSeeder_h
 2 | #define MummerSeeder_h
 3 | 
 4 | #include <vector>
 5 | #include <string>
 6 | #include <mummer/sparseSA.hpp>
 7 | #include <mummer/fasta.hpp>
 8 | #include "GfaGraph.h"
 9 | #include "GraphAlignerWrapper.h"
10 | #include "vg.pb.h"
11 | 
// Exact-match seeder backed by a mummer sparse suffix array built over the
// concatenation of all node sequences of a graph.
class MummerSeeder
{
public:
	// Build from a graph. When cachePrefix is non-empty the index is loaded from
	// "<cachePrefix>.aux" / "<cachePrefix>_index" if the .aux file exists, and
	// written there after building otherwise.
	MummerSeeder(const GfaGraph& graph, const std::string& cachePrefix);
	MummerSeeder(const vg::Graph& graph, const std::string& cachePrefix);
	// Seeds from the matcher's findMEM_each, on both strands; at most maxCount
	// seeds, longest first.
	std::vector<SeedHit> getMemSeeds(std::string sequence, size_t maxCount, size_t minLen) const;
	// Seeds from the matcher's findMAM_each, on both strands; at most maxCount
	// seeds, longest first.
	std::vector<SeedHit> getMumSeeds(std::string sequence, size_t maxCount, size_t minLen) const;
private:
	// Converts forward and reverse-strand matches into seed hits; seqLen is the
	// length of the query the matches were found in.
	std::vector<SeedHit> matchesToSeeds(size_t seqLen, const std::vector<mummer::mummer::match_t>& fwmatches, const std::vector<mummer::mummer::match_t>& bwmatches) const;
	void revcompInPlace(std::string& seq) const;
	// Index into nodePositions/nodeIDs of the node containing text position indexPos.
	size_t getNodeIndex(size_t indexPos) const;
	// Length of the node at the given index, without its separator.
	size_t nodeLength(size_t indexPos) const;
	void initTree(const GfaGraph& graph);
	void initTree(const vg::Graph& graph);
	void saveTo(const std::string& cachePrefix) const;
	void loadFrom(const std::string& cachePrefix);
	// Lower-cased node sequences, each followed by a '`' separator.
	std::string seq;
	std::unique_ptr<mummer::mummer::sparseSA> matcher;
	// Start of each node inside seq, plus one final entry equal to seq.size().
	std::vector<size_t> nodePositions;
	// nodeIDs[i] is the id of the node starting at nodePositions[i].
	std::vector<int> nodeIDs;
};
33 | 
34 | #endif
35 | 


--------------------------------------------------------------------------------
/src/NodePosCsv.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | #include <fstream>
 3 | #include <vector>
 4 | #include <unordered_map>
 5 | #include <unordered_set>
 6 | #include "CommonUtils.h"
 7 | 
 8 | int main(int argc, char** argv)
 9 | {
10 | 	std::string infilename {argv[1]};
11 | 	std::string outfilename {argv[2]};
12 | 
13 | 	std::unordered_map<int, std::unordered_map<std::string, std::vector<std::pair<int, int>>>> positions;
14 | 	std::unordered_map<int, std::unordered_map<std::string, int>> minRepeatCounts;
15 | 	auto alignments = CommonUtils::LoadVGAlignments(infilename);
16 | 	std::unordered_set<std::string> alignmentNames;
17 | 	for (auto aln : alignments)
18 | 	{
19 | 		alignmentNames.insert(aln.name());
20 | 		int pos = aln.query_position();
21 | 		for (size_t i = 0; i < aln.path().mapping_size(); i++)
22 | 		{
23 | 			auto mapping = aln.path().mapping(i);
24 | 			positions[mapping.position().node_id()][aln.name()].emplace_back(pos, pos+mapping.edit(0).to_length());
25 | 			pos += mapping.edit(0).to_length();
26 | 			minRepeatCounts[mapping.position().node_id()][aln.name()] += 1;
27 | 		}
28 | 	}
29 | 	std::vector<std::string> readnames { alignmentNames.begin(), alignmentNames.end() };
30 | 	std::sort(readnames.begin(), readnames.end());
31 | 	std::ofstream out {outfilename};
32 | 	out << "node,_numreads,_minalnrepeatcount,_traversingreads";
33 | 	for (auto read : readnames)
34 | 	{
35 | 		out << "," << read;
36 | 	}
37 | 	out << std::endl;
38 | 	std::vector<int> nodevec;
39 | 	for (auto node : positions)
40 | 	{
41 | 		nodevec.push_back(node.first);
42 | 	}
43 | 	std::sort(nodevec.begin(), nodevec.end());
44 | 	for (auto node : nodevec)
45 | 	{
46 | 		out << node;
47 | 		out << "," << positions[node].size();
48 | 		int minRepeatCount = 0;
49 | 		for (auto pair : minRepeatCounts[node])
50 | 		{
51 | 			minRepeatCount = std::max(minRepeatCount, pair.second);
52 | 		}
53 | 		out << "," << minRepeatCount;
54 | 		out << ",";
55 | 		bool first = true;
56 | 		for (auto read : positions[node])
57 | 		{
58 | 			if (read.second.size() > 0)
59 | 			{
60 | 				if (!first) out << ";";
61 | 				out << read.first;
62 | 				first = false;
63 | 			}
64 | 		}
65 | 		for (auto read : readnames)
66 | 		{
67 | 			out << ",";
68 | 			if (positions[node].count(read) == 1)
69 | 			{
70 | 				for (size_t i = 0; i < positions[node][read].size(); i++)
71 | 				{
72 | 					if (i > 0) out << ";";
73 | 					out << positions[node][read][i].first << "-" << positions[node][read][i].second;
74 | 				}
75 | 			}
76 | 		}
77 | 		out << std::endl;
78 | 	}
79 | }
80 | 


--------------------------------------------------------------------------------
/src/PickAdjacentAlnPairs.cpp:
--------------------------------------------------------------------------------
 1 | #include <unordered_map>
 2 | #include <fstream>
 3 | #include "vg.pb.h"
 4 | #include "stream.hpp"
 5 | #include "CommonUtils.h"
 6 | #include "fastqloader.h"
 7 | 
 8 | void outputPairs(std::ofstream& alignmentOut, const std::string& readname, const std::unordered_map<std::string, size_t>& readLens, const std::vector<vg::Alignment>& starts, const std::vector<vg::Alignment>& ends, const int maxSplitDist, const int minPartialLen)
 9 | {
10 | 	std::vector<vg::Alignment> pairs;
11 | 	size_t currentPairNum = 0;
12 | 	for (auto& start : starts)
13 | 	{
14 | 		assert(start.query_position() == 0);
15 | 		int startEnd = 0;
16 | 		for (int i = 0; i < start.path().mapping_size(); i++)
17 | 		{
18 | 			startEnd += start.path().mapping(i).edit(0).to_length();
19 | 		}
20 | 		assert(startEnd >= minPartialLen);
21 | 		for (auto& end : ends)
22 | 		{
23 | 			int endStart = end.query_position();
24 | 			if (abs(startEnd-endStart) > maxSplitDist) continue;
25 | 			vg::Alignment left { start };
26 | 			vg::Alignment right { end };
27 | 			left.set_name(readname + "_pair" + std::to_string(currentPairNum) + "_1");
28 | 			right.set_name(readname + "_pair" + std::to_string(currentPairNum) + "_2");
29 | 			pairs.push_back(std::move(left));
30 | 			pairs.push_back(std::move(right));
31 | 			currentPairNum++;
32 | 		}
33 | 	}
34 | 	if (pairs.size() > 0) stream::write_buffered(alignmentOut, pairs, 0);
35 | }
36 | 
37 | void pickAndWritePairs(std::string inputFile, std::string outputFile, const std::unordered_map<std::string, size_t>& readLens, const int maxSplitDist, const int minPartialLen)
38 | {
39 | 	std::ofstream alignmentOut { outputFile, std::ios::out | std::ios::binary };
40 | 	std::string currentRead;
41 | 	std::vector<vg::Alignment> starts;
42 | 	std::vector<vg::Alignment> ends;
43 | 
44 | 	std::ifstream alignmentIn { inputFile, std::ios::in | std::ios::binary };
45 | 	std::function<void(vg::Alignment&)> lambda = [&alignmentOut, &readLens, &currentRead, &starts, &ends, maxSplitDist, minPartialLen](vg::Alignment& aln) {
46 | 		if (aln.name() != currentRead)
47 | 		{
48 | 			outputPairs(alignmentOut, currentRead, readLens, starts, ends, maxSplitDist, minPartialLen);
49 | 			starts.clear();
50 | 			ends.clear();
51 | 			currentRead = aln.name();
52 | 		}
53 | 		assert(readLens.count(aln.name()) == 1);
54 | 		size_t alnlen = 0;
55 | 		for (int i = 0; i < aln.path().mapping_size(); i++)
56 | 		{
57 | 			alnlen += aln.path().mapping(i).edit(0).to_length();
58 | 		}
59 | 		if (alnlen < minPartialLen) return;
60 | 		if (aln.query_position() == 0)
61 | 		{
62 | 			starts.push_back(aln);
63 | 		}
64 | 		if (aln.query_position() + alnlen == readLens.at(aln.name()))
65 | 		{
66 | 			ends.push_back(aln);
67 | 		}
68 | 	};
69 | 	stream::for_each(alignmentIn, lambda);
70 | 	outputPairs(alignmentOut, currentRead, readLens, starts, ends, maxSplitDist, minPartialLen);
71 | }
72 | 
73 | std::unordered_map<std::string, size_t> getReadLens(std::string filename)
74 | {
75 | 	std::unordered_map<std::string, size_t> result;
76 | 	FastQ::streamFastqFromFile(filename, false, [&result](const FastQ& read)
77 | 	{
78 | 		result[read.seq_id] = read.sequence.size();
79 | 	});
80 | 	return result;
81 | }
82 | 
83 | int main(int argc, char** argv)
84 | {
85 | 	std::string inputAlns { argv[1] };
86 | 	int maxSplitDist = std::stoi(argv[2]);
87 | 	std::string readFile { argv[3] };
88 | 	std::string outputAlns { argv[4] };
89 | 	int minPartialLen = std::stoi(argv[5]);
90 | 
91 | 	auto readLens = getReadLens(readFile);
92 | 	pickAndWritePairs(inputAlns, outputAlns, readLens, maxSplitDist, minPartialLen);
93 | }


--------------------------------------------------------------------------------
/src/PickMummerSeeds.cpp:
--------------------------------------------------------------------------------
  1 | #include <queue>
  2 | #include <iostream>
  3 | #include <unordered_map>
  4 | #include <vector>
  5 | #include <string>
  6 | #include <algorithm>
  7 | #include "vg.pb.h"
  8 | #include "stream.hpp"
  9 | #include "CommonUtils.h"
 10 | #include "fastqloader.h"
 11 | 
 12 | struct MummerSeed
 13 | {
 14 | 	size_t readpos;
 15 | 	size_t len;
 16 | 	int nodeId;
 17 | 	size_t nodepos;
 18 | 	bool reverse;
 19 | };
 20 | 
 21 | class AlignmentLengthCompare
 22 | {
 23 | public:
 24 | 	bool operator()(const MummerSeed& left, const MummerSeed& right) const
 25 | 	{
 26 | 		return left.len > right.len;
 27 | 	}
 28 | };
 29 | 
 30 | vg::Alignment createAlignment(const std::string& readname, const MummerSeed& seed)
 31 | {
 32 | 	vg::Alignment result;
 33 | 	result.set_name(readname);
 34 | 	result.set_query_position(seed.readpos);
 35 | 	auto path = new vg::Path;
 36 | 	result.set_allocated_path(path);
 37 | 	auto vgmapping = path->add_mapping();
 38 | 	auto position = new vg::Position;
 39 | 	vgmapping->set_allocated_position(position);
 40 | 	position->set_node_id(seed.nodeId);
 41 | 	position->set_is_reverse(seed.reverse);
 42 | 	position->set_offset(seed.nodepos);
 43 | 	auto edit = vgmapping->add_edit();
 44 | 	edit->set_from_length(seed.len);
 45 | 	edit->set_to_length(seed.len);
 46 | 	return result;
 47 | }
 48 | 
 49 | int getNodeIndex(size_t pos, const std::vector<size_t>& nodeMappingPositions)
 50 | {
 51 | 	auto iter = std::upper_bound(nodeMappingPositions.begin(), nodeMappingPositions.end(), pos);
 52 | 	int index = iter - nodeMappingPositions.begin();
 53 | 	assert(index > 0);
 54 | 	assert(index <= nodeMappingPositions.size());
 55 | 	return index-1;
 56 | }
 57 | 
 58 | int main(int argc, char** argv)
 59 | {
 60 | 	std::string outputFileName { argv[1] };
 61 | 	std::string gfaReferenceFilename { argv[2] };
 62 | 	int maxSeeds = std::stoi(argv[3]);
 63 | 	std::string readFile { argv[4] };
 64 | 	std::unordered_map<std::string, size_t> readLengths;
 65 | 	std::unordered_map<int, size_t> nodeLengths;
 66 | 
 67 | 	{
 68 | 		auto reads = loadFastqFromFile(readFile);
 69 | 		for (auto read : reads)
 70 | 		{
 71 | 			readLengths[read.seq_id] = read.sequence.size();
 72 | 		}
 73 | 	}
 74 | 	{
 75 | 		auto reads = loadFastqFromFile(gfaReferenceFilename);
 76 | 		for (size_t i = 0; i < reads.size(); i++)
 77 | 		{
 78 | 			nodeLengths[std::stoi(reads[i].seq_id)] = reads[i].sequence.size();
 79 | 		}
 80 | 	}
 81 | 
 82 | 	std::unordered_map<std::string, std::priority_queue<MummerSeed, std::vector<MummerSeed>, AlignmentLengthCompare>> alignments;
 83 | 	size_t numElems = 0;
 84 | 	std::string currentRead;
 85 | 	std::string line;
 86 | 	bool currentReverse = false;
 87 | 	size_t currentReadLength;
 88 | 	std::priority_queue<MummerSeed, std::vector<MummerSeed>, AlignmentLengthCompare>* currentQueue;
 89 | 	while (std::getline(std::cin, line))
 90 | 	{
 91 | 		if (line[0] == '>')
 92 | 		{
 93 | 			if (line.size() > 8 && (std::string{line.end()-8, line.end()} == " Reverse" || std::string{line.end()-8, line.end()} == "_Reverse"))
 94 | 			{
 95 | 				currentReverse = true;
 96 | 				currentRead = std::string { line.begin()+2, line.end()-8 };
 97 | 			}
 98 | 			else
 99 | 			{
100 | 				currentReverse = false;
101 | 				currentRead = std::string { line.begin()+2, line.end() };
102 | 			}
103 | 			currentReadLength = readLengths[currentRead];
104 | 			currentQueue = &alignments[currentRead];
105 | 		}
106 | 		else
107 | 		{
108 | 			std::stringstream str { line };
109 | 			MummerSeed newSeed;
110 | 			str >> newSeed.nodeId >> newSeed.nodepos >> newSeed.readpos >> newSeed.len;
111 | 			newSeed.reverse = currentReverse;
112 | 			assert(newSeed.nodepos >= 1);
113 | 			assert(newSeed.readpos >= 1);
114 | 			newSeed.nodepos -= 1;
115 | 			newSeed.readpos -= 1;
116 | 			if (currentReverse)
117 | 			{
118 | 				//there's some weird bug, possibly even in mummer
119 | 				//ignore it until we figure out what's going on
120 | 				if (newSeed.nodepos > nodeLengths[newSeed.nodeId] - 1) continue;
121 | 				if (newSeed.readpos > currentReadLength - 1) continue;
122 | 				assert(newSeed.nodepos <= nodeLengths[newSeed.nodeId] - 1);
123 | 				assert(newSeed.readpos <= currentReadLength - 1);
124 | 				newSeed.nodepos = nodeLengths[newSeed.nodeId] - 1 - newSeed.nodepos;
125 | 				newSeed.readpos = currentReadLength - 1 - newSeed.readpos;
126 | 			}
127 | 			//there's some weird bug, possibly even in mummer
128 | 			//ignore it until we figure out what's going on
129 | 			if (newSeed.readpos >= currentReadLength) continue;
130 | 			if (newSeed.nodepos >= nodeLengths[newSeed.nodeId]) continue;
131 | 			assert(newSeed.readpos < currentReadLength);
132 | 			assert(newSeed.nodepos < nodeLengths[newSeed.nodeId]);
133 | 			assert(newSeed.readpos >= 0);
134 | 			assert(newSeed.nodepos >= 0);
135 | 			if (currentQueue->size() < maxSeeds)
136 | 			{
137 | 				currentQueue->emplace(newSeed);
138 | 				numElems++;
139 | 			}
140 | 			else if (AlignmentLengthCompare{}(newSeed, currentQueue->top()))
141 | 			{
142 | 				currentQueue->pop();
143 | 				currentQueue->emplace(newSeed);
144 | 			}
145 | 		}
146 | 	}
147 | 	std::vector<vg::Alignment> writeAlignments;
148 | 	writeAlignments.reserve(numElems);
149 | 	std::vector<vg::Alignment> insertAlns;
150 | 	insertAlns.reserve(maxSeeds);
151 | 	for (auto& pair : alignments)
152 | 	{
153 | 		insertAlns.clear();
154 | 		while (pair.second.size() > 0)
155 | 		{
156 | 			auto aln = createAlignment(pair.first, pair.second.top());
157 | 			insertAlns.push_back(aln);
158 | 			pair.second.pop();
159 | 		}
160 | 		std::reverse(insertAlns.begin(), insertAlns.end());
161 | 		writeAlignments.insert(writeAlignments.end(), insertAlns.begin(), insertAlns.end());
162 | 	}
163 | 	assert(writeAlignments.size() == numElems);
164 | 	std::ofstream alignmentOut { outputFileName, std::ios::out | std::ios::binary };
165 | 	stream::write_buffered(alignmentOut, writeAlignments, 0);
166 | }


--------------------------------------------------------------------------------
/src/Postprocess.cpp:
--------------------------------------------------------------------------------
  1 | #include <unordered_set>
  2 | #include <vector>
  3 | #include <atomic>
  4 | #include <unordered_map>
  5 | #include <fstream>
  6 | #include <concurrentqueue.h> //https://github.com/cameron314/concurrentqueue
  7 | #include "vg.pb.h"
  8 | #include "stream.hpp"
  9 | #include "fastqloader.h"
 10 | #include "CommonUtils.h"
 11 | #include "AlignmentSelection.h"
 12 | 
// Set to true by loadAlignments() once the whole input file has been enqueued.
std::atomic<bool> readingDone;
// NOTE(review): presumably set when the splitting stage has finished; its
// writer is not in this part of the file - confirm.
std::atomic<bool> splittingDone;
// Every alignment allocated by loadAlignments() is also recorded here.
// NOTE(review): presumably so they can be freed at shutdown - confirm.
std::vector<vg::Alignment*> cleanup;

// Run statistics; they are updated outside this part of the file.
size_t allAlnsCount = 0;
size_t selectedAlnCount = 0;
size_t fullLengthAlnCount = 0;
size_t readsWithAnAlnCount = 0;
size_t bpInReads = 0;
size_t bpInSelected = 0;
size_t bpInFull = 0;
 24 | 
 25 | std::unordered_map<std::string, size_t> getReadLengths(std::string readFile)
 26 | {
 27 | 	std::unordered_map<std::string, size_t> result;
 28 | 	auto reads = loadFastqFromFile(readFile);
 29 | 	for (auto read : reads)
 30 | 	{
 31 | 		result[read.seq_id] = read.sequence.size();
 32 | 	}
 33 | 	return result;
 34 | }
 35 | 
 36 | void loadAlignments(std::string filename, moodycamel::ConcurrentQueue<vg::Alignment*>& output)
 37 | {
 38 | 	vg::Alignment* current[100];
 39 | 	size_t countCurrent = 0;
 40 | 	std::ifstream alnFile { filename, std::ios::in | std::ios::binary };
 41 | 	std::function<void(vg::Alignment&)> lambda = [&cleanup, &output, &current, &countCurrent](vg::Alignment& g) {
 42 | 		vg::Alignment* ptr = new vg::Alignment;
 43 | 		*ptr = g;
 44 | 		cleanup.push_back(ptr);
 45 | 		current[countCurrent] = ptr;
 46 | 		countCurrent++;
 47 | 		if (countCurrent == 100)
 48 | 		{
 49 | 			output.enqueue_bulk(current, 100);
 50 | 			countCurrent = 0;
 51 | 		}
 52 | 	};
 53 | 	stream::for_each(alnFile, lambda);
 54 | 	if (countCurrent > 0)
 55 | 	{
 56 | 		output.enqueue_bulk(current, countCurrent);
 57 | 	}
 58 | 
 59 | 	readingDone = true;
 60 | }
 61 | 
 62 | void splitAlignmentsIntoSelectedAndFullLength(const std::unordered_map<std::string, size_t>& readLengths, moodycamel::ConcurrentQueue<vg::Alignment*>& inputAlns, moodycamel::ConcurrentQueue<vg::Alignment*>& outputSelected, moodycamel::ConcurrentQueue<vg::Alignment*>& outputFullLength)
 63 | {
 64 | 	vg::Alignment* alns[100] {};
 65 | 
 66 | 	std::unordered_map<std::string, std::vector<vg::Alignment*>> alnsPerRead;
 67 | 	while (true)
 68 | 	{
 69 | 		size_t gotAlns = inputAlns.try_dequeue_bulk(alns, 100);
 70 | 		if (gotAlns == 0)
 71 | 		{
 72 | 			if (readingDone) break;
 73 | 			std::this_thread::sleep_for(std::chrono::milliseconds(10));
 74 | 			continue;
 75 | 		}
 76 | 		for (size_t i = 0; i < gotAlns; i++)
 77 | 		{
 78 | 			alnsPerRead[alns[i]->name()].push_back(alns[i]);
 79 | 		}
 80 | 	}
 81 | 
 82 | 	AlignmentSelection::SelectionOptions options;
 83 | 	options.method = AlignmentSelection::SelectionMethod::GreedyLength;
 84 | 	for (auto pair : alnsPerRead)
 85 | 	{
 86 | 		auto selected = AlignmentSelection::SelectAlignments(pair.second, options);
 87 | 		outputSelected.enqueue_bulk(selected.data(), selected.size());
 88 | 		allAlnsCount += pair.second.size();
 89 | 		selectedAlnCount += selected.size();
 90 | 		for (auto ptr : selected)
 91 | 		{
 92 | 			bpInSelected += ptr->sequence().size();
 93 | 		}
 94 | 		if (selected[0]->sequence().size() >= readLengths.at(pair.first) - 1)
 95 | 		{
 96 | 			outputFullLength.enqueue(selected[0]);
 97 | 			bpInFull += selected[0]->sequence().size();
 98 | 			fullLengthAlnCount += 1;
 99 | 		}
100 | 	}
101 | 
102 | 	readsWithAnAlnCount = alnsPerRead.size();
103 | 
104 | 	splittingDone = true;
105 | }
106 | 
107 | void writeAlignments(std::string filename, moodycamel::ConcurrentQueue<vg::Alignment*>& inputAlns)
108 | {
109 | 	std::ofstream outfile { filename,  std::ios::out | std::ios::binary };
110 | 
111 | 	std::vector<vg::Alignment*> alns;
112 | 	alns.resize(1000, nullptr);
113 | 
114 | 	while (true)
115 | 	{
116 | 		alns.resize(1000);
117 | 		size_t gotAlns = inputAlns.try_dequeue_bulk(alns.data(), 1000);
118 | 		if (gotAlns == 0)
119 | 		{
120 | 			if (splittingDone) break;
121 | 			std::this_thread::sleep_for(std::chrono::milliseconds(10));
122 | 			continue;
123 | 		}
124 | 		alns.resize(gotAlns);
125 | 		stream::write_buffered_ptr(outfile, alns, 0);
126 | 	}
127 | }
128 | 
129 | int main(int argc, char** argv)
130 | {
131 | 	std::string rawAlnFile { argv[1] };
132 | 	std::string readsFile { argv[2] };
133 | 	std::string outputSelectedAlnFile { argv[3] };
134 | 	std::string outputFullLengthAlnFile { argv[4] };
135 | 	std::string outputSummaryFile { argv[5] };
136 | 
137 | 	readingDone = false;
138 | 	splittingDone = false;
139 | 
140 | 	auto readLengths = getReadLengths(readsFile);
141 | 
142 | 	moodycamel::ConcurrentQueue<vg::Alignment*> readToSplitting;
143 | 	moodycamel::ConcurrentQueue<vg::Alignment*> splitToSelected;
144 | 	moodycamel::ConcurrentQueue<vg::Alignment*> splitToFullLength;
145 | 
146 | 	std::thread readThread {[&rawAlnFile, &readToSplitting](){loadAlignments(rawAlnFile, readToSplitting);}};
147 | 	std::thread splitter {[&readToSplitting, &splitToSelected, &splitToFullLength, &readLengths](){splitAlignmentsIntoSelectedAndFullLength(readLengths, readToSplitting, splitToSelected, splitToFullLength);}};
148 | 	std::thread selectedWriter {[&splitToSelected, &outputSelectedAlnFile](){writeAlignments(outputSelectedAlnFile, splitToSelected);}};
149 | 	std::thread fullLengthWriter {[&splitToFullLength, &outputFullLengthAlnFile](){writeAlignments(outputFullLengthAlnFile, splitToFullLength);}};
150 | 
151 | 	readThread.join();
152 | 	splitter.join();
153 | 	selectedWriter.join();
154 | 	fullLengthWriter.join();
155 | 
156 | 	for (auto aln : cleanup)
157 | 	{
158 | 		delete aln;
159 | 	}
160 | 
161 | 	for (auto pair : readLengths)
162 | 	{
163 | 		bpInReads += pair.second;
164 | 	}
165 | 
166 | 	std::ofstream summary {outputSummaryFile};
167 | 	summary << readLengths.size() << "\tnumber of reads" << std::endl;
168 | 	summary << selectedAlnCount << "\tnumber of selected alignments" << std::endl;
169 | 	summary << fullLengthAlnCount << "\tnumber of full length alignments" << std::endl;
170 | 	summary << readsWithAnAlnCount << "\treads with an alignment" << std::endl;
171 | 	summary << bpInReads << "\tbp in reads" << std::endl;
172 | 	summary << bpInSelected << "\tbp in selected alignments" << std::endl;
173 | 	summary << bpInFull << "\tbp in full length alignments" << std::endl;
174 | }


--------------------------------------------------------------------------------
/src/ReadCorrection.cpp:
--------------------------------------------------------------------------------
 1 | #include "ThreadReadAssertion.h"
 2 | #include "ReadCorrection.h"
 3 | 
 4 | std::string toUpper(std::string seq)
 5 | {
 6 | 	for (auto& c : seq)
 7 | 	{
 8 | 		c = toupper(c);
 9 | 	}
10 | 	return seq;
11 | }
12 | 
// Returns a copy of seq with every character converted to lower case.
std::string toLower(std::string seq)
{
	for (auto& c : seq)
	{
		// tolower requires a value representable as unsigned char (or EOF); passing a
		// negative plain char is undefined behaviour, hence the cast.
		c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
	}
	return seq;
}
21 | 
// Returns the length of the longest suffix of `left` that equals a prefix of `right`,
// considering only overlaps of at most `maxOverlap` characters. Returns 0 if there is none.
// The comparison is case sensitive.
size_t getLongestOverlap(const std::string& left, const std::string& right, size_t maxOverlap)
{
	// An overlap cannot be longer than either string.
	if (left.size() < maxOverlap) maxOverlap = left.size();
	if (right.size() < maxOverlap) maxOverlap = right.size();
	// Try the longest candidate first so the first hit is the answer.
	for (size_t i = maxOverlap; i > 0; i--)
	{
		// The suffix of length i starts at left.size() - i. (Using maxOverlap as the
		// offset here would compare a window that is not a suffix for every i < maxOverlap.)
		if (left.compare(left.size() - i, i, right, 0, i) == 0) return i;
	}
	return 0;
}
37 | 
38 | std::string getCorrected(const std::string& raw, const std::vector<Correction>& corrections, size_t maxOverlap)
39 | {
40 | 	std::string result;
41 | 	size_t currentEnd = 0;
42 | 	for (size_t i = 0; i < corrections.size(); i++)
43 | 	{
44 | 		assert(i == 0 || corrections[i].startIndex >= corrections[i-1].startIndex);
45 | 		if (corrections[i].startIndex < currentEnd)
46 | 		{
47 | 			size_t overlap = getLongestOverlap(result, corrections[i].corrected, maxOverlap);
48 | 			result += toUpper(corrections[i].corrected.substr(overlap));
49 | 		}
50 | 		else if (corrections[i].startIndex > currentEnd)
51 | 		{
52 | 			result += toLower(raw.substr(currentEnd, corrections[i].startIndex - currentEnd));
53 | 			result += toUpper(corrections[i].corrected);
54 | 		}
55 | 		else
56 | 		{
57 | 			assert(corrections[i].startIndex == currentEnd);
58 | 			result += toUpper(corrections[i].corrected);
59 | 		}
60 | 		currentEnd = corrections[i].endIndex;
61 | 	}
62 | 	if (currentEnd < raw.size()) result += toLower(raw.substr(currentEnd));
63 | 	return result;
64 | }
65 | 


--------------------------------------------------------------------------------
/src/ReadCorrection.h:
--------------------------------------------------------------------------------
 1 | #ifndef ReadCorrection_h
 2 | #define ReadCorrection_h
 3 | 
 4 | #include <string>
 5 | #include <vector>
 6 | 
// One corrected stretch of a read, consumed by getCorrected().
struct Correction
{
	// Offset into the raw read where this correction begins.
	size_t startIndex;
	// Offset into the raw read where the uncorrected sequence resumes after this correction.
	size_t endIndex;
	// Sequence that replaces the raw bases of this stretch.
	std::string corrected;
};
13 | 
// Builds the corrected read from `raw` and `corrections` (ordered by non-decreasing startIndex).
// `maxOverlap` bounds the overlap that is searched for when consecutive corrections overlap.
std::string getCorrected(const std::string& raw, const std::vector<Correction>& corrections, size_t maxOverlap);
15 | 
16 | #endif
17 | 


--------------------------------------------------------------------------------
/src/ReverseReads.cpp:
--------------------------------------------------------------------------------
 1 | #include <fstream>
 2 | #include "fastqloader.h"
 3 | 
 4 | int main(int argc, char** argv)
 5 | {
 6 | 	auto reads = loadFastqFromFile(argv[1]);
 7 | 	std::ofstream output {argv[2]};
 8 | 	for (size_t i = 0; i < reads.size(); i++)
 9 | 	{
10 | 		auto reverse = reads[i].reverseComplement();
11 | 		output << ">" << reverse.seq_id << "_Reverse" << "\n";
12 | 		output << reverse.sequence << "\n";
13 | 	}
14 | }


--------------------------------------------------------------------------------
/src/SelectLongestAlignment.cpp:
--------------------------------------------------------------------------------
 1 | #include <fstream>
 2 | #include <string>
 3 | #include <unordered_map>
 4 | #include "CommonUtils.h"
 5 | #include "fastqloader.h"
 6 | #include "stream.hpp"
 7 | 
 8 | int main(int argc, char** argv)
 9 | {
10 | 	std::string alnfile { argv[1] };
11 | 	std::string outfile { argv[2] };
12 | 
13 | 	auto alns = CommonUtils::LoadVGAlignments(alnfile);
14 | 
15 | 	std::unordered_map<std::string, vg::Alignment> result;
16 | 	for (auto aln : alns)
17 | 	{
18 | 		if (result.count(aln.name()) == 0) result[aln.name()] = aln;
19 | 		else if (aln.sequence().size() > result[aln.name()].sequence().size()) result[aln.name()] = aln;
20 | 		else if (aln.sequence().size() == result[aln.name()].sequence().size() && aln.score() < result[aln.name()].score()) result[aln.name()] = aln;
21 | 	}
22 | 
23 | 	std::vector<vg::Alignment> writeAlns;
24 | 	writeAlns.reserve(result.size());
25 | 	for (auto pair : result)
26 | 	{
27 | 		writeAlns.push_back(pair.second);
28 | 	}
29 | 
30 | 	std::ofstream resultFile { outfile, std::ios::out | std::ios::binary };
31 | 	stream::write_buffered(resultFile, writeAlns, 0);
32 | }
33 | 


--------------------------------------------------------------------------------
/src/SupportedSubgraph.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | #include <fstream>
 3 | #include <unordered_set>
 4 | #include "CommonUtils.h"
 5 | #include "vg.pb.h"
 6 | #include "stream.hpp"
 7 | 
 8 | 
 9 | int main(int argc, char** argv)
10 | {
11 | 	std::string graphFile { argv[1] };
12 | 	std::string alnFile { argv[2] };
13 | 	std::string outputGraph { argv[3] };
14 | 
15 | 	vg::Graph graph = CommonUtils::LoadVGGraph(graphFile);
16 | 
17 | 	std::vector<vg::Alignment> alignments;
18 | 	{
19 | 		std::ifstream alignmentfile {alnFile, std::ios::in | std::ios::binary};
20 | 		std::function<void(vg::Alignment&)> lambda = [&alignments](vg::Alignment& g) {
21 | 			alignments.push_back(g);
22 | 		};
23 | 		stream::for_each(alignmentfile, lambda);
24 | 	}
25 | 
26 | 	std::map<int, std::set<int>> existingEdges;
27 | 	for (size_t i = 0; i < graph.edge_size(); i++)
28 | 	{
29 | 		existingEdges[graph.edge(i).from()].insert(graph.edge(i).to());
30 | 	}
31 | 
32 | 	std::map<int, std::set<int>> supportedEdges;
33 | 	std::unordered_set<int> supportedNodes;
34 | 
35 | 	for (size_t i = 0; i < alignments.size(); i++)
36 | 	{
37 | 		std::cout << "alignment " << alignments[i].name() << std::endl;
38 | 		for (size_t j = 0; j < alignments[i].path().mapping_size()-1; j++)
39 | 		{
40 | 			auto from = alignments[i].path().mapping(j).position().node_id();
41 | 			auto to = alignments[i].path().mapping(j+1).position().node_id();
42 | 			supportedNodes.insert(from);
43 | 			supportedNodes.insert(to);
44 | 			if (existingEdges[from].count(to) == 0 && existingEdges[to].count(from) == 0)
45 | 			{
46 | 				std::cout << "nonexistant alignment from " << from << " to " << to << std::endl;
47 | 			}
48 | 			supportedEdges[from].insert(to);
49 | 		}
50 | 	}
51 | 
52 | 	vg::Graph resultGraph;
53 | 	for (int i = 0 ; i < graph.node_size(); i++)
54 | 	{
55 | 		if (supportedNodes.count(graph.node(i).id()) == 0) continue;
56 | 		auto* node = resultGraph.add_node();
57 | 		node->set_sequence(graph.node(i).sequence());
58 | 		node->set_id(graph.node(i).id());
59 | 		node->set_name(graph.node(i).name());
60 | 	}
61 | 	for (int i = 0; i < graph.edge_size(); i++)
62 | 	{
63 | 		auto from = graph.edge(i).from();
64 | 		auto to = graph.edge(i).to();
65 | 		bool foundForward = supportedEdges[from].count(to) == 1;
66 | 		auto foundBackward = supportedEdges[to].count(from) == 1;
67 | 		if (!foundForward && !foundBackward)
68 | 		{
69 | 			continue;
70 | 		}
71 | 		auto* edge = resultGraph.add_edge();
72 | 		edge->set_from(graph.edge(i).from());
73 | 		edge->set_to(graph.edge(i).to());
74 | 		edge->set_from_start(graph.edge(i).from_start());
75 | 		edge->set_to_end(graph.edge(i).to_end());
76 | 		edge->set_overlap(graph.edge(i).overlap());
77 | 	}
78 | 
79 | 	std::ofstream graphOut { outputGraph, std::ios::out | std::ios::binary };
80 | 	std::vector<vg::Graph> writeVector {resultGraph};
81 | 	stream::write_buffered(graphOut, writeVector, 0);
82 | }


--------------------------------------------------------------------------------
/src/ThreadReadAssertion.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <sstream>
 3 | #include <string_view>
 4 | #include "ThreadReadAssertion.h"
 5 | 
 6 | namespace ThreadReadAssertion
 7 | {
 8 | 	thread_local int currentnodeID;
 9 | 	thread_local bool currentreverse;
10 | 	thread_local size_t currentseqPos;
11 | 	thread_local size_t currentmatchLen;
12 | 	thread_local size_t currentnodeOffset;
13 | 	thread_local std::string_view currentRead;
14 | 	void signal(int signal)
15 | 	{
16 | 		std::stringstream msg;
17 | 		msg << "Signal " << signal << ". Read: " << currentRead << ". Seed: " << assertGetSeedInfo();
18 | 		std::cerr << msg.str() << std::endl;
19 | 		std::abort();
20 | 	}
21 | 	void setRead(const std::string& readName)
22 | 	{
23 | 		currentRead = std::string_view(readName.data(), readName.size());
24 | 	}
25 | 	void setSeed(int nodeID, bool reverse, size_t seqPos, size_t matchLen, size_t nodeOffset)
26 | 	{
27 | 		currentnodeID = nodeID;
28 | 		currentreverse = reverse;
29 | 		currentseqPos = seqPos;
30 | 		currentmatchLen = matchLen;
31 | 		currentnodeOffset = nodeOffset;
32 | 	}
33 | 	void assertFailed(const char* expression, const char* file, int line)
34 | 	{
35 | 		std::stringstream msg;
36 | 		msg << file << ":" << line << ": Assertion '" << expression << "' failed. Read: " << currentRead << ". Seed: " << assertGetSeedInfo();
37 | 		std::cerr << msg.str() << std::endl;
38 | 		throw AssertionFailure {};
39 | 	}	
40 | 	std::string assertGetSeedInfo()
41 | 	{
42 | 		return std::to_string(currentnodeID) + (currentreverse ? "-" : "+") + "," + std::to_string(currentseqPos) + "," + std::to_string(currentmatchLen) + "," + std::to_string(currentnodeOffset);
43 | 	}
44 | }
45 | 


--------------------------------------------------------------------------------
/src/ThreadReadAssertion.h:
--------------------------------------------------------------------------------
 1 | #ifndef ThreadReadAssertion_h
 2 | #define ThreadReadAssertion_h
 3 | 
 4 | #include <string>
 5 | 
// Assertion support that records, per thread, which read and seed are being processed.
namespace ThreadReadAssertion
{
	// Thrown by assertFailed() after the failure has been reported.
	class AssertionFailure
	{
	};
	// Records the name of the read the calling thread is working on.
	void setRead(const std::string& readName);
	// Records the seed the calling thread is working on.
	void setSeed(int nodeID, bool reverse, size_t seqPos, size_t matchLen, size_t nodeOffset);
	// Reports a failed assertion (expression, file, line, current read and seed) and throws AssertionFailure.
	void assertFailed(const char* expression, const char* file, int line);
	// Reports the signal number with the current read and seed, then aborts the process.
	void signal(int signal);
	// Returns the recorded seed formatted as text.
	std::string assertGetSeedInfo();
}
17 | 
18 | #endif
19 | 
// This section sits outside the include guard on purpose: every inclusion of this header
// removes whatever `assert` is currently defined and installs the versions below.
#ifdef assert
#undef assert
#endif

#ifndef NDEBUG

// Debug build: a failed assert calls ThreadReadAssertion::assertFailed.
//https://stackoverflow.com/questions/9701229/c-assert-implementation-in-assert-h
#define assert(expression) (void)((expression) || (ThreadReadAssertion::assertFailed(#expression, __FILE__, __LINE__),0))
// Records the read name and the seed for the current thread.
#define assertSetRead(name, nodeid, reverse, seqpos, matchlen, nodeoffset) { ThreadReadAssertion::setRead(name); ThreadReadAssertion::setSeed(nodeid, reverse, seqpos, matchlen, nodeoffset); }
// Records the read name and resets the seed to all zeros.
#define assertSetNoRead(name) { ThreadReadAssertion::setRead(name); ThreadReadAssertion::setSeed(0, 0, 0, 0, 0); }

#else

// Release build (NDEBUG): all three macros expand to no-ops and do not evaluate their arguments.
#define assert(ignore) ((void)0)
#define assertSetRead(name, nodeid, reverse, seqpos, matchlen, nodeoffset) ((void)0)
#define assertSetNoRead(name) ((void)0)

#endif
38 | 


--------------------------------------------------------------------------------
/src/UnitigifyDBG.cpp:
--------------------------------------------------------------------------------
  1 | #include <cassert>
  2 | #include <string>
  3 | #include <unordered_map>
  4 | #include "CommonUtils.h"
  5 | #include "GfaGraph.h"
  6 | 
  7 | GfaGraph unitigify(const GfaGraph& graph)
  8 | {
  9 | 	GfaGraph result;
 10 | 	result.edgeOverlap = graph.edgeOverlap;
 11 | 	std::unordered_map<NodePos, int> belongsInUnitig;
 12 | 	std::unordered_set<int> nodesHandled;
 13 | 	std::unordered_map<int, NodePos> unitigLeft;
 14 | 	std::unordered_map<int, NodePos> unitigRight;
 15 | 	std::vector<std::vector<NodePos>> nodesInUnitig;
 16 | 
 17 | 	for (auto node : graph.nodes)
 18 | 	{
 19 | 		if (nodesHandled.count(node.first) == 1) continue;
 20 | 		NodePos left { node.first, false };		
 21 | 		NodePos right { node.first, true };
 22 | 		bool leftBreaks = true;
 23 | 		bool rightBreaks = true;
 24 | 		if (graph.edges.count(left) == 1 && graph.edges.at(left).size() == 1)
 25 | 		{
 26 | 			auto neighbor = graph.edges.at(left)[0];
 27 | 			assert(graph.edges.count(neighbor.Reverse()) == 1);
 28 | 			if (graph.edges.at(neighbor.Reverse()).size() == 1) leftBreaks = false;
 29 | 		}
 30 | 		if (graph.edges.count(right) == 1 && graph.edges.at(right).size() == 1)
 31 | 		{
 32 | 			auto neighbor = graph.edges.at(right)[0];
 33 | 			assert(graph.edges.count(neighbor.Reverse()) == 1);
 34 | 			if (graph.edges.at(neighbor.Reverse()).size() == 1) rightBreaks = false;
 35 | 		}
 36 | 		if (leftBreaks && rightBreaks)
 37 | 		{
 38 | 			assert(nodesHandled.count(node.first) == 0);
 39 | 			nodesHandled.insert(node.first);
 40 | 			unitigLeft[nodesInUnitig.size()] = NodePos { node.first, true };
 41 | 			unitigRight[nodesInUnitig.size()] = NodePos { node.first, true };
 42 | 			assert(belongsInUnitig.count(right) == 0);
 43 | 			belongsInUnitig[right] = nodesInUnitig.size();
 44 | 			nodesInUnitig.emplace_back();
 45 | 			nodesInUnitig.back().emplace_back(node.first, true);
 46 | 			continue;
 47 | 		}
 48 | 		if (!leftBreaks && !rightBreaks)
 49 | 		{
 50 | 			continue;
 51 | 		}
 52 | 		assert((leftBreaks && !rightBreaks) || (rightBreaks && !leftBreaks));
 53 | 		NodePos start;
 54 | 		int id = nodesInUnitig.size();
 55 | 		nodesInUnitig.emplace_back();
 56 | 		start.id = node.first;
 57 | 		start.end = leftBreaks;
 58 | 		assert(belongsInUnitig.count(start) == 0);
 59 | 		assert(belongsInUnitig.count(start.Reverse()) == 0);
 60 | 		assert(nodesHandled.count(start.id) == 0);
 61 | 		unitigLeft[id] = start;
 62 | 		unitigRight[id] = start;
 63 | 		nodesHandled.insert(start.id);
 64 | 		belongsInUnitig[start] = id;
 65 | 		nodesInUnitig.back().push_back(start);
 66 | 		assert(graph.edges.count(start) == 1 && graph.edges.at(start).size() == 1);
 67 | 		while (graph.edges.count(start) == 1 && graph.edges.at(start).size() == 1)
 68 | 		{
 69 | 			start = graph.edges.at(start)[0];
 70 | 			assert(graph.edges.count(start.Reverse()) == 1);
 71 | 			if (graph.edges.at(start.Reverse()).size() != 1) break;
 72 | 			assert(belongsInUnitig.count(start) == 0);
 73 | 			assert(belongsInUnitig.count(start.Reverse()) == 0);
 74 | 			assert(nodesHandled.count(start.id) == 0);
 75 | 			unitigRight[id] = start;
 76 | 			nodesHandled.insert(start.id);
 77 | 			belongsInUnitig[start] = id;
 78 | 			nodesInUnitig.back().push_back(start);
 79 | 		}
 80 | 	}
 81 | 	//circular separate components
 82 | 	for (auto node : graph.nodes)
 83 | 	{
 84 | 		if (nodesHandled.count(node.first) == 1) continue;
 85 | 		NodePos left { node.first, false };		
 86 | 		NodePos right { node.first, true };
 87 | 		assert(graph.edges.count(left) == 1 && graph.edges.at(left).size() == 1);
 88 | 		assert(graph.edges.count(right) == 1 && graph.edges.at(right).size() == 1);
 89 | 		NodePos start = right;
 90 | 		int id = nodesInUnitig.size();
 91 | 		nodesInUnitig.emplace_back();
 92 | 		unitigLeft[id] = start;
 93 | 		unitigRight[id] = start;
 94 | 		do
 95 | 		{
 96 | 			nodesHandled.insert(node.first);
 97 | 			belongsInUnitig[start] = id;
 98 | 			nodesInUnitig.back().push_back(start);
 99 | 			assert(graph.edges.count(start) == 1 && graph.edges.at(start).size() == 1);
100 | 			assert(graph.edges.count(start.Reverse()) == 1 && graph.edges.at(start.Reverse()).size() == 1);
101 | 			start = graph.edges.at(start)[0];
102 | 		} while (start.id != node.first);
103 | 		result.edges[NodePos { id, true }].emplace_back(id, true);
104 | 	}
105 | 	assert(nodesHandled.size() == graph.nodes.size());
106 | 	assert(belongsInUnitig.size() == graph.nodes.size());
107 | 	for (size_t i = 0; i < nodesInUnitig.size(); i++)
108 | 	{
109 | 		std::string seq;
110 | 		assert(nodesInUnitig[i].size() > 0);
111 | 		seq = graph.nodes.at(nodesInUnitig[i][0].id);
112 | 		if (!nodesInUnitig[i][0].end) seq = CommonUtils::ReverseComplement(seq);
113 | 		seq = seq.substr(0, graph.edgeOverlap);
114 | 		for (auto node : nodesInUnitig[i])
115 | 		{
116 | 			std::string add;
117 | 			add = graph.nodes.at(node.id);
118 | 			if (!node.end) add = CommonUtils::ReverseComplement(add);
119 | 			add = add.substr(graph.edgeOverlap);
120 | 			seq += add;
121 | 		}
122 | 		result.nodes[i] = seq;
123 | 	}
124 | 	for (auto edge : graph.edges)
125 | 	{
126 | 		NodePos src = edge.first;
127 | 		NodePos from;
128 | 		assert(belongsInUnitig.count(src) == 1 || belongsInUnitig.count(src.Reverse()) == 1);
129 | 		if (belongsInUnitig.count(src) == 1)
130 | 		{
131 | 			assert(belongsInUnitig.count(src) == 1);
132 | 			assert(belongsInUnitig.count(src.Reverse()) == 0);
133 | 			if (unitigRight[belongsInUnitig[src]] != src) continue;
134 | 			from = NodePos { belongsInUnitig[src], true };
135 | 		}
136 | 		else
137 | 		{
138 | 			assert(belongsInUnitig.count(src) == 0);
139 | 			assert(belongsInUnitig.count(src.Reverse()) == 1);
140 | 			if (unitigLeft[belongsInUnitig[src.Reverse()]] != src.Reverse()) continue;
141 | 			from = NodePos { belongsInUnitig[src.Reverse()], false };
142 | 		}
143 | 		for (auto dst : edge.second)
144 | 		{
145 | 			NodePos to;
146 | 			assert(belongsInUnitig.count(dst) == 1 || belongsInUnitig.count(dst.Reverse()) == 1);
147 | 			if (belongsInUnitig.count(dst) == 1)
148 | 			{
149 | 				assert(belongsInUnitig.count(dst) == 1);
150 | 				assert(belongsInUnitig.count(dst.Reverse()) == 0);
151 | 				if (unitigLeft[belongsInUnitig[dst]] != dst) continue;
152 | 				to = NodePos { belongsInUnitig[dst], true };
153 | 			}
154 | 			else
155 | 			{
156 | 				assert(belongsInUnitig.count(dst) == 0);
157 | 				assert(belongsInUnitig.count(dst.Reverse()) == 1);
158 | 				if (unitigRight[belongsInUnitig[dst.Reverse()]] != dst.Reverse()) continue;
159 | 				to = NodePos { belongsInUnitig[dst.Reverse()], false };
160 | 			}
161 | 			result.edges[from].push_back(to);
162 | 		}
163 | 	}
164 | 	return result;
165 | }
166 | 
167 | int main(int argc, char** argv)
168 | {
169 | 	std::string inputGraph { argv[1] };
170 | 	std::string outputGraph { argv[2] };
171 | 
172 | 	auto graph = GfaGraph::LoadFromFile(inputGraph);
173 | 	graph.confirmDoublesidedEdges();
174 | 	auto result = unitigify(graph);
175 | 	result.SaveToFile(outputGraph);
176 | }


--------------------------------------------------------------------------------
/src/UntipRelative.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <cassert>
  3 | #include <unordered_set>
  4 | #include <unordered_map>
  5 | #include <limits>
  6 | #include <algorithm>
  7 | #include "GfaGraph.h"
  8 | 
  9 | std::unordered_map<NodePos, size_t> getNodeMapping(const GfaGraph& graph)
 10 | {
 11 | 	std::unordered_map<NodePos, size_t> result;
 12 | 	for (auto node : graph.nodes)
 13 | 	{
 14 | 		size_t id = result.size();
 15 | 		result[NodePos { node.first, true }] = id;
 16 | 		id = result.size();
 17 | 		result[NodePos { node.first, false }] = id;
 18 | 	}
 19 | 	return result;
 20 | }
 21 | 
 22 | std::vector<size_t> getLengths(const std::unordered_map<NodePos, size_t>& nodeMapping, const GfaGraph& graph)
 23 | {
 24 | 	std::vector<size_t> result;
 25 | 	result.resize(nodeMapping.size(), 0);
 26 | 	for (auto node : graph.nodes)
 27 | 	{
 28 | 		result[nodeMapping.at(NodePos{ node.first, true })] = node.second.size() - graph.edgeOverlap;
 29 | 		result[nodeMapping.at(NodePos{ node.first, false })] = node.second.size() - graph.edgeOverlap;
 30 | 	}
 31 | 	return result;
 32 | }
 33 | 
 34 | std::vector<std::vector<size_t>> getOutEdges(const std::unordered_map<NodePos, size_t>& nodeMapping, const GfaGraph& graph)
 35 | {
 36 | 	std::vector<std::vector<size_t>> result;
 37 | 	result.resize(nodeMapping.size());
 38 | 	for (auto edge : graph.edges)
 39 | 	{
 40 | 		NodePos source = edge.first;
 41 | 		NodePos revSource = source.Reverse();
 42 | 		for (auto target : edge.second)
 43 | 		{
 44 | 			NodePos revTarget = target.Reverse();
 45 | 			assert(nodeMapping.at(source) < result.size());
 46 | 			assert(nodeMapping.at(target) < result.size());
 47 | 			assert(nodeMapping.at(revSource) < result.size());
 48 | 			assert(nodeMapping.at(revTarget) < result.size());
 49 | 			result[nodeMapping.at(source)].push_back(nodeMapping.at(target));
 50 | 			result[nodeMapping.at(revTarget)].push_back(nodeMapping.at(revSource));
 51 | 		}
 52 | 	}
 53 | 	return result;
 54 | }
 55 | 
 56 | std::vector<size_t> getNodeDepths(const std::vector<std::vector<size_t>>& componentNodes, const std::vector<size_t>& nodeLengths, const std::vector<std::vector<size_t>>& edges)
 57 | {
 58 | 	std::vector<size_t> result;
 59 | 	result.resize(nodeLengths.size(), 0);
 60 | 	for (size_t i = componentNodes.size()-1; i < componentNodes.size(); i--)
 61 | 	{
 62 | 		if (componentNodes[i].size() > 1)
 63 | 		{
 64 | 			for (auto node : componentNodes[i])
 65 | 			{
 66 | 				result[node] = std::numeric_limits<size_t>::max();
 67 | 			}
 68 | 		}
 69 | 		else
 70 | 		{
 71 | 			auto node = componentNodes[i][0];
 72 | 			result[node] = nodeLengths[node];
 73 | 			for (auto neighbor : edges[node])
 74 | 			{
 75 | 				if (result[neighbor] == std::numeric_limits<size_t>::max())
 76 | 				{
 77 | 					result[node] = std::numeric_limits<size_t>::max();
 78 | 					break;
 79 | 				}
 80 | 				if (neighbor == node)
 81 | 				{
 82 | 					result[node] = std::numeric_limits<size_t>::max();
 83 | 					break;
 84 | 				}
 85 | 				result[node] = std::max(result[node], result[neighbor] + nodeLengths[node]);
 86 | 			}
 87 | 		}
 88 | 	}
 89 | 	return result;
 90 | }
 91 | 
 92 | void removeRec(std::vector<bool>& keepers, size_t pos, const std::vector<std::vector<size_t>>& edges)
 93 | {
 94 | 	if (!keepers[pos]) return;
 95 | 	keepers[pos] = false;
 96 | 	for (auto neighbor : edges[pos])
 97 | 	{
 98 | 		removeRec(keepers, neighbor, edges);
 99 | 	}
100 | }
101 | 
102 | std::vector<bool> getKeepers(const std::vector<size_t>& depths, const std::vector<std::vector<size_t>>& edges, const size_t maxRemovableLen, const size_t minSafeLen, const double fraction)
103 | {
104 | 	std::vector<bool> result;
105 | 	result.resize(depths.size(), true);
106 | 	for (size_t i = 0; i < depths.size(); i++)
107 | 	{
108 | 		if (!result[i]) continue;
109 | 		size_t bigLength = 0;
110 | 		for (auto neighbor : edges[i])
111 | 		{
112 | 			bigLength = std::max(bigLength, depths[neighbor]);
113 | 		}
114 | 		if (bigLength < minSafeLen) continue;
115 | 		size_t removableLen = bigLength * fraction;
116 | 		removableLen = std::min(removableLen, maxRemovableLen);
117 | 		for (auto neighbor : edges[i])
118 | 		{
119 | 			if (depths[neighbor] <= removableLen)
120 | 			{
121 | 				removeRec(result, neighbor, edges);
122 | 			}
123 | 		}
124 | 	}
125 | 	return result;
126 | }
127 | 
// Iterative (explicit-stack) form of the "strongconnect" step of Tarjan's strongly connected
// components algorithm, started from `node`.
//  i        next visit index to hand out; incremented for every newly visited node
//  index    visit index per node; size_t(-1) means "not visited yet"
//  lowlink  smallest visit index known to be reachable from the node
//  onStack  whether a node is currently on the component stack S
//  S        component stack shared between calls
//  result   every completed component is appended here
//  edges    adjacency lists
// Each work item on `stack` is (state, node, neighborI):
//  state 0 = first visit of node, state 2 = resume after the child edges[node][neighborI]
//  has been fully processed. States 1 and 3 are only reached by fall-through / goto.
void strongConnectIterative(size_t node, size_t& i, std::vector<size_t>& index, std::vector<size_t>& lowlink, std::vector<bool>& onStack, std::vector<size_t>& S, std::vector<std::vector<size_t>>& result, const std::vector<std::vector<size_t>>& edges)
{
	std::vector<std::tuple<int, size_t, size_t>> stack;
	stack.emplace_back(0, node, 0);
	while (stack.size() > 0)
	{
		auto top = stack.back();
		// Shadows the parameter: the node of the work item being processed.
		size_t node = std::get<1>(top);
		size_t neighborI = std::get<2>(top);
		stack.pop_back();
		switch(std::get<0>(top))
		{
			case 0:
				// First visit: assign index/lowlink and push the node on the component stack.
				assert(!onStack[node]);
				assert(index[node] == -1);
				assert(lowlink[node] == -1);
				index[node] = i;
				lowlink[node] = i;
				i++;
				S.push_back(node);
				onStack[node] = true;
			START_LOOP:
			case 1:
				// Examine the neighbours starting from position neighborI.
				if (neighborI < edges[node].size())
				{
					auto neighbor = edges[node][neighborI];
					if (index[neighbor] == -1)
					{
						// Unvisited neighbour: schedule the resume item first, then the child,
						// so the child is popped and processed before this node continues.
						stack.emplace_back(2, node, neighborI);
						stack.emplace_back(0, edges[node][neighborI], 0);
						continue;
					}
					else if (onStack[neighbor])
					{
						// Neighbour belongs to the component currently being built.
						assert(index[neighbor] != -1);
						lowlink[node] = std::min(lowlink[node], index[neighbor]);
					}
					neighborI++;
				}
				if (neighborI < edges[node].size()) goto START_LOOP;
				goto END_LOOP;
			case 2:
				{
					// The child has been processed: fold its lowlink into this node's and move on.
					auto neighbor = edges[node][neighborI];
					assert(lowlink[neighbor] != -1);
					lowlink[node] = std::min(lowlink[node], lowlink[neighbor]);
					neighborI++;
					goto START_LOOP;
				}
			END_LOOP:
			case 3:
				// All neighbours done. If this node is the root of its component, pop the
				// component off S and record it.
				assert(lowlink[node] != -1);
				assert(index[node] != -1);
				if (lowlink[node] == index[node])
				{
					result.emplace_back();
					size_t stacknode;
					do
					{
						assert(S.size() > 0);
						stacknode = S.back();
						S.pop_back();
						assert(onStack[stacknode]);
						onStack[stacknode] = false;
						result.back().push_back(stacknode);
					} while (stacknode != node);
				}
		}
	}
}
198 | 
199 | std::vector<std::vector<size_t>> topologicalSort(const std::vector<std::vector<size_t>>& edges)
200 | {
201 | 	std::vector<size_t> index;
202 | 	std::vector<size_t> lowlink;
203 | 	std::vector<bool> onStack;
204 | 	index.resize(edges.size(), -1);
205 | 	lowlink.resize(edges.size(), -1);
206 | 	onStack.resize(edges.size(), false);
207 | 	std::vector<size_t> S;
208 | 	std::vector<std::vector<size_t>> result;
209 | 	size_t i = 0;
210 | 	for (size_t node = 0; node < edges.size(); node++)
211 | 	{
212 | 		if (index[node] == -1) strongConnectIterative(node, i, index, lowlink, onStack, S, result, edges);
213 | 		assert(S.size() == 0);
214 | 	}
215 | 	assert(i == edges.size());
216 | 	std::reverse(result.begin(), result.end());
217 | 	std::vector<size_t> belongsToComponent;
218 | 	belongsToComponent.resize(edges.size(), -1);
219 | 	for (size_t i = 0; i < result.size(); i++)
220 | 	{
221 | 		for (auto node : result[i])
222 | 		{
223 | 			belongsToComponent[node] = i;
224 | 		}
225 | 	}
226 | 	for (size_t i = 0; i < edges.size(); i++)
227 | 	{
228 | 		assert(belongsToComponent[i] != -1);
229 | 		for (auto edge : edges[i])
230 | 		{
231 | 			assert(belongsToComponent[edge] != -1);
232 | 			assert(belongsToComponent[edge] >= belongsToComponent[i]);
233 | 		}
234 | 	}
235 | 	return result;
236 | }
237 | 
238 | std::unordered_set<int> filterNodes(const GfaGraph& graph, const int maxRemovableLen, const int minSafeLen, const double fraction)
239 | {
240 | 	auto nodeMapping = getNodeMapping(graph);
241 | 	auto lengths = getLengths(nodeMapping, graph);
242 | 	auto edges = getOutEdges(nodeMapping, graph);
243 | 	auto order = topologicalSort(edges);
244 | 	auto depths = getNodeDepths(order, lengths, edges);
245 | 	auto keepers = getKeepers(depths, edges, maxRemovableLen, minSafeLen, fraction);
246 | 	std::unordered_set<int> result;
247 | 	for (auto node : graph.nodes)
248 | 	{
249 | 		if (keepers[nodeMapping[NodePos { node.first, true }]] && keepers[nodeMapping[NodePos { node.first, false }]])
250 | 		{
251 | 			result.emplace(node.first);
252 | 		}
253 | 	}
254 | 	return result;
255 | }
256 | 
257 | int main(int argc, char** argv)
258 | {
259 | 	int maxRemovableLen = std::stoi(argv[1]);
260 | 	int minSafeLen = std::stoi(argv[2]);
261 | 	double fraction = std::stod(argv[3]);
262 | 	auto graph = GfaGraph::LoadFromStream(std::cin);
263 | 	//write to cout
264 | 
265 | 	auto keptNodes = filterNodes(graph, maxRemovableLen, minSafeLen, fraction);
266 | 	auto filteredGraph = graph.GetSubgraph(keptNodes);
267 | 	filteredGraph.SaveToStream(std::cout);
268 | }


--------------------------------------------------------------------------------
/src/VisualizeAlignment.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <iostream>
  3 | #include <fstream>
  4 | #include <string>
  5 | #include "GfaGraph.h"
  6 | #include "AlignmentCorrectnessEstimation.h"
  7 | #include "CommonUtils.h"
  8 | #include "GraphAlignerWrapper.h"
  9 | 
 10 | void pad(std::string& str, size_t size)
 11 | {
 12 | 	assert(str.size() <= size);
 13 | 	while (str.size() < size)
 14 | 	{
 15 | 		str += " ";
 16 | 	}
 17 | }
 18 | 
 19 | std::vector<AlignmentResult::TraceItem> loadTrace(std::string filename)
 20 | {
 21 | 	std::ifstream file { filename };
 22 | 	std::vector<AlignmentResult::TraceItem> result;
 23 | 	while (file.good())
 24 | 	{
 25 | 		int nodeid, offset, reverse, readpos, type;
 26 | 		char graphChar, readChar;
 27 | 		file >> nodeid >> offset >> reverse >> readpos >> type >> graphChar >> readChar;
 28 | 		if (!file.good()) break;
 29 | 		result.emplace_back();
 30 | 		result.back().nodeID = nodeid;
 31 | 		result.back().offset = offset;
 32 | 		result.back().reverse = reverse == 1;
 33 | 		result.back().readpos = readpos;
 34 | 		result.back().type = (AlignmentResult::TraceMatchType)type;
 35 | 		result.back().graphChar = graphChar;
 36 | 		result.back().readChar = readChar;
 37 | 	}
 38 | 	return result;
 39 | }
 40 | 
 41 | std::string getCharwiseCorrectInfo(const std::vector<AlignmentResult::TraceItem>& trace)
 42 | {
 43 | 	std::string result;
 44 | 	AlignmentCorrectnessEstimationState charwiseCorrect;
 45 | 	std::vector<bool> charwiseCorrectCorrectTrace;
 46 | 	std::vector<bool> charwiseCorrectFalseTrace;
 47 | 	for (size_t i = 0; i < trace.size(); i++)
 48 | 	{
 49 | 		auto type = trace[i].type;
 50 | 		if (type == AlignmentResult::TraceMatchType::MATCH)
 51 | 		{
 52 | 			charwiseCorrect = charwiseCorrect.NextState(0, 1);
 53 | 			charwiseCorrectCorrectTrace.push_back(charwiseCorrect.CorrectFromCorrect());
 54 | 			charwiseCorrectFalseTrace.push_back(charwiseCorrect.FalseFromCorrect());
 55 | 		}
 56 | 		else if (type == AlignmentResult::TraceMatchType::FORWARDBACKWARDSPLIT)
 57 | 		{
 58 | 			bool oldCorrect = charwiseCorrect.CurrentlyCorrect();
 59 | 			charwiseCorrect = AlignmentCorrectnessEstimationState {};
 60 | 			charwiseCorrectCorrectTrace.push_back(oldCorrect);
 61 | 			charwiseCorrectFalseTrace.push_back(oldCorrect);
 62 | 		}
 63 | 		else
 64 | 		{
 65 | 			charwiseCorrect = charwiseCorrect.NextState(1, 1);
 66 | 			charwiseCorrectCorrectTrace.push_back(charwiseCorrect.CorrectFromCorrect());
 67 | 			charwiseCorrectFalseTrace.push_back(charwiseCorrect.FalseFromCorrect());
 68 | 		}
 69 | 	}
 70 | 	bool charwiseCurrentlyCorrect = charwiseCorrect.CurrentlyCorrect();
 71 | 	std::string charwiseCorrectInfo = "";
 72 | 	for (size_t i = charwiseCorrectCorrectTrace.size()-1; i < charwiseCorrectCorrectTrace.size(); i--)
 73 | 	{
 74 | 		if (charwiseCurrentlyCorrect)
 75 | 		{
 76 | 			charwiseCorrectInfo += "#";
 77 | 			charwiseCurrentlyCorrect = charwiseCorrectCorrectTrace[i];
 78 | 		}
 79 | 		else
 80 | 		{
 81 | 			charwiseCorrectInfo += " ";
 82 | 			charwiseCurrentlyCorrect = charwiseCorrectFalseTrace[i];
 83 | 		}
 84 | 	}
 85 | 	std::reverse(charwiseCorrectInfo.begin(), charwiseCorrectInfo.end());
 86 | 	return charwiseCorrectInfo;
 87 | }
 88 | 
 89 | std::string getSlicewiseCorrectInfo(const std::vector<AlignmentResult::TraceItem>& trace)
 90 | {
 91 | 	int readcharsUntilSlicewiseCheck = 64;
 92 | 	int mismatches = 0;
 93 | 	AlignmentCorrectnessEstimationState slicewiseCorrect;
 94 | 	std::string slicewiseCorrectInfo;
 95 | 	for (size_t i = 0; i < trace.size(); i++)
 96 | 	{
 97 | 		switch(trace[i].type)
 98 | 		{
 99 | 			case AlignmentResult::TraceMatchType::MATCH:
100 | 				readcharsUntilSlicewiseCheck--;
101 | 				break;
102 | 			case AlignmentResult::TraceMatchType::MISMATCH:
103 | 				mismatches++;
104 | 				readcharsUntilSlicewiseCheck--;
105 | 				break;
106 | 			case AlignmentResult::TraceMatchType::INSERTION:
107 | 				mismatches++;
108 | 				readcharsUntilSlicewiseCheck--;
109 | 				break;
110 | 			case AlignmentResult::TraceMatchType::DELETION:
111 | 				mismatches++;
112 | 				break;
113 | 			case AlignmentResult::TraceMatchType::FORWARDBACKWARDSPLIT:
114 | 				break;
115 | 		}
116 | 		if (readcharsUntilSlicewiseCheck == 0)
117 | 		{
118 | 			slicewiseCorrect = slicewiseCorrect.NextState(mismatches, 64);
119 | 			char addchar = slicewiseCorrect.CurrentlyCorrect() ? '#' : ' ';
120 | 			for (int i = 0; i < 64; i++)
121 | 			{
122 | 				slicewiseCorrectInfo += addchar;
123 | 			}
124 | 			mismatches = 0;
125 | 			readcharsUntilSlicewiseCheck = 64;
126 | 		}
127 | 	}
128 | 	pad(slicewiseCorrectInfo, trace.size());
129 | 	return slicewiseCorrectInfo;
130 | }
131 | 
132 | int main(int argc, char** argv)
133 | {
134 | 	std::string tracefile { argv[1] };
135 | 
136 | 	std::vector<AlignmentResult::TraceItem> trace = loadTrace(tracefile);
137 | 
138 | 	std::string graphinfo;
139 | 	std::string graphpath;
140 | 	std::string alignmentinfo;
141 | 	std::string readinfo;
142 | 	std::string readpath;
143 | 	int oldNodeId = trace[0].nodeID;
144 | 	bool oldReverse = trace[0].reverse;
145 | 	int oldReadPos = trace[0].readpos;
146 | 	size_t splitIndex = 0;
147 | 	for (int i = 0; i < trace.size(); i++)
148 | 	{
149 | 		auto type = trace[i].type;
150 | 		char readChar = trace[i].readChar;
151 | 		char graphChar = trace[i].graphChar;
152 | 		if (i == 0)
153 | 		{
154 | 			graphinfo += "v";
155 | 			readinfo += "^";
156 | 		}
157 | 		if ((i > 0 && (trace[i].nodeID != trace[i-1].nodeID)) || type == AlignmentResult::TraceMatchType::FORWARDBACKWARDSPLIT)
158 | 		{
159 | 			int nodeidInfoLength = std::to_string(oldNodeId).size() + 1;
160 | 			if (i > graphinfo.size() + nodeidInfoLength)
161 | 			{
162 | 				graphinfo += std::to_string(oldNodeId);
163 | 				if (oldReverse) graphinfo += "-"; else graphinfo += "+";
164 | 			}
165 | 			int readSizeInfoLength = std::to_string(oldReadPos).size();
166 | 			if (i > readinfo.size() + readSizeInfoLength)
167 | 			{
168 | 				readinfo += std::to_string(oldReadPos);
169 | 			}
170 | 			pad(graphinfo, i);
171 | 			pad(readinfo, i);
172 | 			graphinfo += "v";
173 | 			readinfo += "^";
174 | 			oldNodeId = trace[i].nodeID;
175 | 			oldReverse = trace[i].reverse;
176 | 			oldReadPos = trace[i].readpos;
177 | 		}
178 | 
179 | 		switch(type)
180 | 		{
181 | 			case AlignmentResult::TraceMatchType::MATCH:
182 | 				graphpath += graphChar;
183 | 				readpath += readChar;
184 | 				alignmentinfo += "|";
185 | 				assert(graphChar == readChar);
186 | 				break;
187 | 			case AlignmentResult::TraceMatchType::MISMATCH:
188 | 				graphpath += graphChar;
189 | 				readpath += readChar;
190 | 				alignmentinfo += " ";
191 | 				assert(graphChar != readChar);
192 | 				break;
193 | 			case AlignmentResult::TraceMatchType::INSERTION:
194 | 				graphpath += ' ';
195 | 				readpath += readChar;
196 | 				alignmentinfo += " ";
197 | 				break;
198 | 			case AlignmentResult::TraceMatchType::DELETION:
199 | 				graphpath += graphChar;
200 | 				alignmentinfo += " ";
201 | 				readpath += ' ';
202 | 				break;
203 | 			case AlignmentResult::TraceMatchType::FORWARDBACKWARDSPLIT:
204 | 				graphpath += graphChar;
205 | 				readpath += readChar;
206 | 				alignmentinfo += graphChar == readChar ? '|' : ' ';
207 | 				splitIndex = i;
208 | 				break;
209 | 		}
210 | 	}
211 | 
212 | 	std::cerr << "splitIndex " << splitIndex << std::endl;
213 | 
214 | 	std::string charwiseCorrectInfo;
215 | 	std::string slicewiseCorrectInfo;
216 | 	{
217 | 		std::vector<AlignmentResult::TraceItem> backwards;
218 | 		backwards.insert(backwards.end(), trace.begin(), trace.begin()+splitIndex);
219 | 		std::reverse(backwards.begin(), backwards.end());
220 | 		auto bw = getCharwiseCorrectInfo(backwards);
221 | 		auto slicewise = getSlicewiseCorrectInfo(backwards);
222 | 		std::reverse(bw.begin(), bw.end());
223 | 		std::reverse(slicewise.begin(), slicewise.end());
224 | 		charwiseCorrectInfo += bw;
225 | 		slicewiseCorrectInfo += slicewise;
226 | 	}
227 | 	{
228 | 		std::vector<AlignmentResult::TraceItem> forwards;
229 | 		forwards.insert(forwards.end(), trace.begin()+splitIndex, trace.end());
230 | 		slicewiseCorrectInfo += getSlicewiseCorrectInfo(forwards);
231 | 		charwiseCorrectInfo += getCharwiseCorrectInfo(forwards);
232 | 	}
233 | 	std::cout << "       " << graphinfo << std::endl;
234 | 	std::cout << "GRAPH: " << graphpath << std::endl;
235 | 	std::cout << "       " << alignmentinfo << std::endl;
236 | 	std::cout << "READ:  " << readpath << std::endl;
237 | 	std::cout << "       " << readinfo << std::endl;
238 | 	std::cout << "       " << charwiseCorrectInfo << std::endl;
239 | 	std::cout << "       " << slicewiseCorrectInfo << std::endl;
240 | }


--------------------------------------------------------------------------------
/src/fastqloader.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | #include <fstream>
 3 | #include "fastqloader.h"
 4 | #include "CommonUtils.h"
 5 | 
 6 | std::vector<FastQ> loadFastqFromFile(std::string filename, bool includeQuality)
 7 | {
 8 | 	std::vector<FastQ> result;
 9 | 	FastQ::streamFastqFromFile(filename, includeQuality, [&result](FastQ& fq) {
10 | 		result.emplace_back(std::move(fq));
11 | 	});
12 | 	return result;
13 | }
14 | 
15 | FastQ FastQ::reverseComplement() const
16 | {
17 | 	FastQ result;
18 | 	result.sequence = CommonUtils::ReverseComplement(sequence);
19 | 	result.seq_id = seq_id;
20 | 	result.quality = quality;
21 | 	std::reverse(result.quality.begin(), result.quality.end());
22 | 	return result;
23 | }
24 | 


--------------------------------------------------------------------------------
/src/fastqloader.h:
--------------------------------------------------------------------------------
  1 | #ifndef FastqLoader_H
  2 | #define FastqLoader_H
  3 | 
  4 | #include <string>
  5 | #include <vector>
  6 | #include <zstr.hpp> //https://github.com/mateidavid/zstr
  7 | 
  8 | class FastQ {
  9 | public:
 10 | 	template <typename F>
 11 | 	static void streamFastqFastqFromStream(std::istream& file, bool includeQuality, F f)
 12 | 	{
 13 | 		do
 14 | 		{
 15 | 			std::string line;
 16 | 			std::getline(file, line);
 17 | 			if (!file.good()) break;
 18 | 			if (line.size() == 0) continue;
 19 | 			if (line[0] != '@') continue;
 20 | 			FastQ newread;
 21 | 			if (line.back() == '\r') line.pop_back();
 22 | 			newread.seq_id = line.substr(1);
 23 | 			std::getline(file, line);
 24 | 			if (line.back() == '\r') line.pop_back();
 25 | 			newread.sequence = line;
 26 | 			std::getline(file, line);
 27 | 			std::getline(file, line);
 28 | 			if (line.back() == '\r') line.pop_back();
 29 | 			if (includeQuality) newread.quality = line;
 30 | 			f(newread);
 31 | 		} while (file.good());
 32 | 	}
 33 | 	template <typename F>
 34 | 	static void streamFastqFastaFromStream(std::istream& file, bool includeQuality, F f)
 35 | 	{
 36 | 		std::string line;
 37 | 		std::getline(file, line);
 38 | 		do
 39 | 		{
 40 | 			if (line.size() == 0)
 41 | 			{
 42 | 				std::getline(file, line);
 43 | 				continue;
 44 | 			}
 45 | 			if (line[0] != '>')
 46 | 			{
 47 | 				std::getline(file, line);
 48 | 				continue;
 49 | 			}
 50 | 			FastQ newread;
 51 | 			if (line.back() == '\r') line.pop_back();
 52 | 			newread.seq_id = line.substr(1);
 53 | 			newread.sequence = "";
 54 | 			do
 55 | 			{
 56 | 				std::getline(file, line);
 57 | 				if (!file.good()) break;
 58 | 				if (line.size() == 0) continue;
 59 | 				if (line[0] == '>') break;
 60 | 				if (line.back() == '\r') line.pop_back();
 61 | 				newread.sequence += line;
 62 | 			} while (file.good());
 63 | 			if (includeQuality)
 64 | 			{
 65 | 				for (size_t i = 0; i < newread.sequence.size(); i++)
 66 | 				{
 67 | 					newread.quality += '!';
 68 | 				}
 69 | 			}
 70 | 			f(newread);
 71 | 		} while (file.good());
 72 | 	}
 73 | 	template <typename F>
 74 | 	static void streamFastqFastqFromFile(std::string filename, bool includeQuality, F f)
 75 | 	{
 76 | 		std::ifstream file {filename};
 77 | 		streamFastqFastqFromStream(file, includeQuality, f);
 78 | 	}
 79 | 	template <typename F>
 80 | 	static void streamFastqFastaFromFile(std::string filename, bool includeQuality, F f)
 81 | 	{
 82 | 		std::ifstream file {filename};
 83 | 		streamFastqFastaFromStream(file, includeQuality, f);
 84 | 	}
 85 | 	template <typename F>
 86 | 	static void streamFastqFastqFromGzippedFile(std::string filename, bool includeQuality, F f)
 87 | 	{
 88 | 		zstr::ifstream file { filename };
 89 | 		streamFastqFastqFromStream(file, includeQuality, f);
 90 | 	}
 91 | 	template <typename F>
 92 | 	static void streamFastqFastaFromGzippedFile(std::string filename, bool includeQuality, F f)
 93 | 	{
 94 | 		zstr::ifstream file { filename };
 95 | 		streamFastqFastaFromStream(file, includeQuality, f);
 96 | 	}
 97 | 	template <typename F>
 98 | 	static void streamFastqFromFile(std::string filename, bool includeQuality, F f)
 99 | 	{
100 | 		bool gzipped = false;
101 | 		std::string originalFilename = filename;
102 | 		if (filename.size() > 3 && filename.substr(filename.size()-3) == ".gz")
103 | 		{
104 | 			gzipped = true;
105 | 			filename = filename.substr(0, filename.size()-3);
106 | 		}
107 | 		bool fastq = false;
108 | 		bool fasta = false;
109 | 		if (filename.size() > 6 && filename.substr(filename.size()-6) == ".fastq") fastq = true;
110 | 		if (filename.size() > 3 && filename.substr(filename.size()-3) == ".fq") fastq = true;
111 | 		if (filename.size() > 6 && filename.substr(filename.size()-6) == ".fasta") fasta = true;
112 | 		if (filename.size() > 3 && filename.substr(filename.size()-3) == ".fa") fasta = true;
113 | 		if (fasta)
114 | 		{
115 | 			if (gzipped)
116 | 			{
117 | 				streamFastqFastaFromGzippedFile(originalFilename, includeQuality, f);
118 | 				return;
119 | 			}
120 | 			else
121 | 			{
122 | 				streamFastqFastaFromFile(originalFilename, includeQuality, f);
123 | 				return;
124 | 			}
125 | 		}
126 | 		if (fastq)
127 | 		{
128 | 			if (gzipped)
129 | 			{
130 | 				streamFastqFastqFromGzippedFile(originalFilename, includeQuality, f);
131 | 				return;
132 | 			}
133 | 			else
134 | 			{
135 | 				streamFastqFastqFromFile(originalFilename, includeQuality, f);
136 | 				return;
137 | 			}
138 | 		}
139 | 	}
140 | 	FastQ reverseComplement() const;
141 | 	std::string seq_id;
142 | 	std::string sequence;
143 | 	std::string quality;
144 | };
145 | 
146 | std::vector<FastQ> loadFastqFromFile(std::string filename, bool includeQuality = true);
147 | 
148 | #endif
149 | 


--------------------------------------------------------------------------------
/src/stream.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef STREAM_H
  2 | #define STREAM_H
  3 | 
  4 | // from http://www.mail-archive.com/protobuf@googlegroups.com/msg03417.html
  5 | 
#include <cassert>
#include <fstream>
#include <functional>
#include <iostream>
#include <list>
#include <memory>
#include <vector>
#include "google/protobuf/stubs/common.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/io/gzip_stream.h"
#include "google/protobuf/io/coded_stream.h"
 17 | 
 18 | namespace stream {
 19 | 
 20 | // write objects
 21 | // count should be equal to the number of objects to write
 22 | // but if it is 0, it is not written
 23 | // if not all objects are written, return false, otherwise true
 24 | template <typename T>
 25 | bool write(std::ostream& out, uint64_t count, std::function<T(uint64_t)>& lambda) {
 26 | 
 27 |     ::google::protobuf::io::ZeroCopyOutputStream *raw_out =
 28 |           new ::google::protobuf::io::OstreamOutputStream(&out);
 29 |     ::google::protobuf::io::GzipOutputStream *gzip_out =
 30 |           new ::google::protobuf::io::GzipOutputStream(raw_out);
 31 |     ::google::protobuf::io::CodedOutputStream *coded_out =
 32 |           new ::google::protobuf::io::CodedOutputStream(gzip_out);
 33 | 
 34 |     // prefix the chunk with the number of objects
 35 |     coded_out->WriteVarint64(count);
 36 | 
 37 |     std::string s;
 38 |     uint64_t written = 0;
 39 |     for (uint64_t n = 0; n < count; ++n, ++written) {
 40 |         lambda(n).SerializeToString(&s);
 41 |         // and prefix each object with its size
 42 |         coded_out->WriteVarint32(s.size());
 43 |         coded_out->WriteRaw(s.data(), s.size());
 44 |     }
 45 | 
 46 |     delete coded_out;
 47 |     delete gzip_out;
 48 |     delete raw_out;
 49 | 
 50 |     return !count || written == count;
 51 | }
 52 | 
 53 | template <typename T>
 54 | bool write_buffered(std::ostream& out, std::vector<T>& buffer, uint64_t buffer_limit) {
 55 |     bool wrote = false;
 56 |     if (buffer.size() >= buffer_limit) {
 57 |         std::function<T(uint64_t)> lambda = [&buffer](uint64_t n) { return buffer.at(n); };
 58 | #pragma omp critical (stream_out)
 59 |         wrote = write(out, buffer.size(), lambda);
 60 |         buffer.clear();
 61 |     }
 62 |     return wrote;
 63 | }
 64 | 
 65 | template <typename T>
 66 | bool write_buffered_ptr(std::ostream& out, std::vector<T*>& buffer, uint64_t buffer_limit) {
 67 |     bool wrote = false;
 68 |     if (buffer.size() >= buffer_limit) {
 69 |         std::function<T(uint64_t)> lambda = [&buffer](uint64_t n) { return *buffer.at(n); };
 70 | #pragma omp critical (stream_out)
 71 |         wrote = write(out, buffer.size(), lambda);
 72 |         buffer.clear();
 73 |     }
 74 |     return wrote;
 75 | }
 76 | 
 77 | // deserialize the input stream into the objects
 78 | // count containts the count read
 79 | // takes a callback function to be called on the objects
 80 | 
 81 | template <typename T>
 82 | bool for_each(std::istream& in,
 83 |               std::function<void(T&)>& lambda,
 84 |               std::function<void(uint64_t)>& handle_count) {
 85 | 
 86 |     ::google::protobuf::io::ZeroCopyInputStream *raw_in =
 87 |           new ::google::protobuf::io::IstreamInputStream(&in);
 88 |     ::google::protobuf::io::GzipInputStream *gzip_in =
 89 |           new ::google::protobuf::io::GzipInputStream(raw_in);
 90 |     ::google::protobuf::io::CodedInputStream *coded_in =
 91 |           new ::google::protobuf::io::CodedInputStream(gzip_in);
 92 | 
 93 |     uint64_t count;
 94 |     coded_in->ReadVarint64((::google::protobuf::uint64*) &count);
 95 |     // this loop handles a chunked file with many pieces
 96 |     // such as we might write in a multithreaded process
 97 |     if (!count) return !count;
 98 |     do {
 99 | 
100 |         handle_count(count);
101 | 
102 |         std::string s;
103 |         for (uint64_t i = 0; i < count; ++i) {
104 |             uint32_t msgSize = 0;
105 |             delete coded_in;
106 |             coded_in = new ::google::protobuf::io::CodedInputStream(gzip_in);
107 |             // the messages are prefixed by their size
108 |             coded_in->ReadVarint32(&msgSize);
109 |             if ((msgSize > 0) &&
110 |                 (coded_in->ReadString(&s, msgSize))) {
111 |                 T object;
112 |                 object.ParseFromString(s);
113 |                 lambda(object);
114 |             }
115 |         }
116 |     } while (coded_in->ReadVarint64((::google::protobuf::uint64*) &count));
117 | 
118 |     delete coded_in;
119 |     delete gzip_in;
120 |     delete raw_in;
121 | 
122 |     return !count;
123 | }
124 | 
// Convenience overload of for_each that ignores the per-chunk object counts.
template <typename T>
bool for_each(std::istream& in,
              std::function<void(T&)>& lambda) {
    std::function<void(uint64_t)> ignore_count = [](uint64_t) { };
    return for_each(in, lambda, ignore_count);
}
131 | 
132 | template <typename T>
133 | bool for_each_parallel(std::istream& in,
134 |                        std::function<void(T&)>& lambda,
135 |                        std::function<void(uint64_t)>& handle_count) {
136 | 
137 |     ::google::protobuf::io::ZeroCopyInputStream *raw_in =
138 |           new ::google::protobuf::io::IstreamInputStream(&in);
139 |     ::google::protobuf::io::GzipInputStream *gzip_in =
140 |           new ::google::protobuf::io::GzipInputStream(raw_in);
141 |     ::google::protobuf::io::CodedInputStream *coded_in =
142 |           new ::google::protobuf::io::CodedInputStream(gzip_in);
143 | 
144 |     uint64_t count;
145 |     bool more_input = coded_in->ReadVarint64((::google::protobuf::uint64*) &count);
146 |     bool more_objects = false;
147 |     // this loop handles a chunked file with many pieces
148 |     // such as we might write in a multithreaded process
149 |     std::list<T> objects;
150 |     int64_t object_count = 0;
151 |     int64_t read_threshold = 5000;
152 |     if (!count) return !count;
153 | #pragma omp parallel shared(more_input, more_objects, objects, count, in, lambda, handle_count, raw_in, gzip_in, coded_in)
154 |     do {
155 | 
156 |         bool has_object = false;
157 |         T object;
158 | #pragma omp critical (objects)
159 |         {
160 |             if (!objects.empty()) {
161 |                 object = objects.back();
162 |                 objects.pop_back();
163 |                 --object_count;
164 |                 has_object = true;
165 |             }
166 |         }
167 |         if (has_object) {
168 |             lambda(object);
169 |         }
170 | 
171 | #pragma omp master
172 |         {
173 |             while (more_input && object_count < read_threshold) {
174 |                 handle_count(count);
175 |                 std::string s;
176 |                 for (uint64_t i = 0; i < count; ++i) {
177 |                     uint32_t msgSize = 0;
178 |                     // the messages are prefixed by their size
179 |                     delete coded_in;
180 |                     coded_in = new ::google::protobuf::io::CodedInputStream(gzip_in);
181 |                     coded_in->ReadVarint32(&msgSize);
182 |                     if ((msgSize > 0) &&
183 |                         (coded_in->ReadString(&s, msgSize))) {
184 |                         T object;
185 |                         object.ParseFromString(s);
186 | #pragma omp critical (objects)
187 |                         {
188 |                             objects.push_front(object);
189 |                             ++object_count;
190 |                         }
191 |                     }
192 |                 }
193 |                 more_input = coded_in->ReadVarint64((::google::protobuf::uint64*) &count);
194 |             }
195 |         }
196 | #pragma omp critical (objects)
197 |         more_objects = (object_count > 0);
198 | 
199 |     } while (more_input || more_objects);
200 | 
201 |     delete coded_in;
202 |     delete gzip_in;
203 |     delete raw_in;
204 | 
205 |     return !count;
206 | }
207 | 
// Convenience overload of for_each_parallel that ignores the per-chunk object counts.
template <typename T>
bool for_each_parallel(std::istream& in,
              std::function<void(T&)>& lambda) {
    std::function<void(uint64_t)> ignore_count = [](uint64_t) { };
    return for_each_parallel(in, lambda, ignore_count);
}
214 | 
215 | }
216 | 
217 | #endif
218 | 


--------------------------------------------------------------------------------
/test/graph.gfa:
--------------------------------------------------------------------------------
1 | S	1	ACGTCATGCAGTCGTAACGTAGTCGTCACAGTCAGTCGTAGCTA
2 | S	2	A
3 | S	3	T
4 | S	4	GTAGCGTCAGTCAGTCAGTCGTAGCGTAACGTCGTAGTCAGT
5 | L	1	+	2	+	0M
6 | L	1	+	3	+	0M
7 | L	2	+	4	+	0M
8 | L	3	+	4	+	0M
9 | 


--------------------------------------------------------------------------------
/test/read.fa:
--------------------------------------------------------------------------------
1 | >read
2 | TCATCCACGTCGTAACGTAGTCGTCACAGTCAGTCGTAGCTAAGTACGTCAAGTCAGACAGTCGTAGCGTA
3 | 


--------------------------------------------------------------------------------