├── .gitignore ├── .gitmodules ├── Makefile ├── README.md ├── SRFAligner ├── SRFChainer ├── docs └── workflow.png ├── efg-ahocorasickAligner ├── efg-memsAligner ├── experiments ├── aligner-evaluation │ ├── README.md │ ├── ahocorasick │ │ └── runexp.sh │ ├── chaining │ │ └── runexp.sh │ ├── environment.yml │ ├── final │ │ └── runexp.sh │ ├── input │ │ ├── .gitkeep │ │ └── covid19_100_acc.txt │ ├── mems │ │ └── runexp.sh │ ├── scripts │ │ ├── .env │ │ ├── compute_metrics.py │ │ ├── compute_summary.py │ │ ├── generate_sim_reads.py │ │ ├── run_experiment.py │ │ └── vg_pb2.py │ ├── semi-repeat-free │ │ └── runexp.sh │ ├── vg-comparison │ │ └── runexp.sh │ └── vg-unchop │ │ └── runexp.sh ├── graph-statistics │ ├── README.md │ ├── run.sh │ └── scripts │ │ ├── compute-N.sh │ │ ├── compute-bps.sh │ │ ├── compute-branching-factor.sh │ │ ├── compute-branching-nodes.sh │ │ ├── compute-choices.sh │ │ ├── compute-edges.sh │ │ ├── compute-efg-H.sh │ │ ├── compute-longest-node.sh │ │ ├── compute-nodes.sh │ │ ├── compute-paths.sh │ │ └── compute-width.sh ├── msa-validation │ ├── README.md │ └── validate.sh ├── short-read-exact-match │ ├── README.md │ ├── input │ │ └── .gitkeep │ └── runexp.sh └── vcf-to-hapl-to-efg │ ├── README.md │ └── sample-and-build-efg-heuristic.sh ├── test ├── graph1.gfa ├── graph2.gfa ├── graph3.gfa ├── read1.fastq ├── read2.fastq └── read3.fastq └── tools ├── ChainX-block-graph ├── Makefile ├── README.md ├── chaining.hpp ├── chaining_hpp_license.txt ├── chainx-block-graph.cpp ├── chainx-block-graph.hpp ├── command-line-parsing │ ├── cmdline.c │ ├── cmdline.h │ └── config.ggo ├── efg.hpp └── test │ ├── correctoutput │ ├── anchors-1-global.gaf │ └── anchors-1-semi-global.gaf │ ├── input │ ├── anchors-1.gaf │ └── graph.gfa │ └── test.sh ├── efg-ahocorasick ├── Makefile └── src │ ├── efg-ahocorasick.rs │ ├── efg-locate.hpp │ ├── efg.hpp │ └── extractor.cpp ├── efg-gaf-splitter ├── Makefile ├── README.md ├── command-line-parsing │ ├── cmdline.c │ ├── cmdline.h │ 
└── config.ggo ├── efg-gaf-splitter.cpp └── efg.hpp ├── efg-locate ├── Makefile ├── README.md ├── algo.cpp ├── command-line-parsing │ ├── cmdline.c │ ├── cmdline.h │ └── config.ggo ├── efg-locate.cpp ├── efg-locate.hpp ├── efg.hpp └── test │ ├── inputs │ ├── indels.gfa │ ├── indels_five_nodes.fasta │ ├── tcs_fig_5.gfa │ ├── tcs_fig_5_approximate.fasta │ ├── tcs_fig_5_edge.fasta │ ├── tcs_fig_5_four_nodes.fasta │ └── tcs_fig_5_three_nodes.fasta │ ├── outputs │ ├── indels_five_nodes.gfa │ ├── tcs_fig_5_approximate.fasta │ ├── tcs_fig_5_edge.gfa │ ├── tcs_fig_5_four_nodes.gfa │ └── tcs_fig_5_three_nodes.gfa │ └── test.sh └── efg-simplify ├── Makefile ├── README.md ├── command-line-parsing ├── cmdline.c ├── cmdline.h └── config.ggo └── efg-simplify.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | experiments/vcf-to-hapl-to-efg/output 2 | experiments/vcf-to-hapl-to-efg/chr22_uppercase.fasta 3 | experiments/vcf-to-hapl-to-efg/1KGP.CHM13v2.0.chr22.recalibrated.snp_indel.pass.phased.vcf.gz 4 | experiments/vcf-to-hapl-to-efg/1KGP.CHM13v2.0.chr22.recalibrated.snp_indel.pass.phased.vcf.gz.tbi 5 | experiments/vcf-to-hapl-to-efg/chm13v2.0.fa.gz 6 | experiments/vcf-to-hapl-to-efg/phased_T2T_panel.tar 7 | experiments/vcf-to-hapl-to-efg/chr22_iEFG.gfa 8 | 9 | tools/efg-locate/efg-locate 10 | tools/efg-locate/test/output-* 11 | 12 | tools/ChainX-block-graph/chainx-block-graph 13 | tools/ChainX-block-graph/test/output-* 14 | 15 | tools/efg-gaf-splitter/efg-gaf-splitter 16 | 17 | tools/efg-ahocorasick/efg-ahocorasick 18 | tools/efg-ahocorasick/extractor 19 | 20 | tools/efg-simplify/efg-simplify 21 | 22 | test/*.gaf 23 | 24 | experiments/aligner-evaluation/scripts/__pycache__ 25 | experiments/aligner-evaluation/input/* 26 | experiments/aligner-evaluation/semi-repeat-free/output 27 | experiments/aligner-evaluation/chaining/output 28 | experiments/aligner-evaluation/final/output 29 | experiments/aligner-evaluation/mems/output 30 | 
experiments/aligner-evaluation/vg-comparison/output 31 | experiments/aligner-evaluation/ahocorasick/output 32 | 33 | experiments/msa-validation/tmp* 34 | experiments/msa-validation/output_stats.txt 35 | 36 | experiments/short-read-exact-match/input/chm13v2.0.fa.gz 37 | experiments/short-read-exact-match/input/chr22_iEFG.gfa 38 | experiments/short-read-exact-match/input/chr22_uppercase.fasta 39 | experiments/short-read-exact-match/input/ERR1025645_sample05_1.fq.gz 40 | experiments/short-read-exact-match/output 41 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tools/founderblockgraphs"] 2 | path = tools/founderblockgraphs 3 | url = https://github.com/algbio/founderblockgraphs.git 4 | [submodule "tools/vcf2multialign"] 5 | path = tools/vcf2multialign 6 | url = https://github.com/tsnorri/vcf2multialign 7 | [submodule "tools/concurrentqueue"] 8 | path = tools/concurrentqueue 9 | url = https://github.com/cameron314/concurrentqueue 10 | [submodule "tools/sdsl-lite-v3"] 11 | path = tools/sdsl-lite-v3 12 | url = https://github.com/xxsds/sdsl-lite 13 | [submodule "tools/Badread"] 14 | path = tools/Badread 15 | url = https://github.com/rrwick/Badread 16 | [submodule "tools/GraphChainer"] 17 | path = tools/GraphChainer 18 | url = https://github.com/algbio/GraphChainer 19 | [submodule "tools/minichain"] 20 | path = tools/minichain 21 | url = https://github.com/at-cg/minichain 22 | ignore = untracked 23 | [submodule "tools/minigraph"] 24 | path = tools/minigraph 25 | url = https://github.com/lh3/minigraph 26 | [submodule "tools/efg-mems"] 27 | path = tools/efg-mems 28 | url = https://github.com/algbio/efg-mems 29 | branch = mapped-output 30 | [submodule "tools/GraphAligner"] 31 | path = tools/GraphAligner 32 | url = https://github.com/maickrau/GraphAligner 33 | [submodule "tools/edlib"] 34 | path = tools/edlib 35 | url = 
https://github.com/Martinsos/edlib 36 | [submodule "tools/seqtk"] 37 | path = tools/seqtk 38 | url = https://github.com/lh3/seqtk 39 | ignore = untracked 40 | [submodule "tools/daachorse"] 41 | path = tools/daachorse 42 | url = https://github.com/daac-tools/daachorse.git 43 | [submodule "tools/bwa"] 44 | path = tools/bwa 45 | url = https://github.com/lh3/bwa 46 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: prerequisites 2 | prerequisites: 3 | make --directory=tools/efg-locate 4 | make --directory=tools/ChainX-block-graph 5 | make --directory=tools/efg-gaf-splitter 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # seed-chain-extend on indexable Elastic Founder Graphs 2 | `SRFAligner` and `SRFChainer` are long-read alignment tools based on indexable Elastic Founder Graphs (iEFGs). iEFGs can be obtained from FASTA multiple sequence alignments using [`founderblockgraph`](https://github.com/algbio/founderblockgraphs), or from a VCF file with the pipeline implemented in `experiments/vcf-to-hapl-to-efg` using `founderblockgraph` and [`vcf2multialign`](https://github.com/tsnorri/vcf2multialign/). The graphs used in the experiments can be found at [doi.org/10.5281/zenodo.14012881](https://doi.org/10.5281/zenodo.14012881). 3 | 4 | ![Workflow to build iEFGs from a VCF file and to perform seed-chain-extend alignment](docs/workflow.png) 5 | 6 | ## getting started 7 | `SRFAligner`, `SRFChainer` and the other prototype aligners are Bash scripts and they depend on `efg-locate`, `chainx-block-graph` (tested on GCC >= 13), and `GraphAligner` (tested on GitHub version >= 1.0.19). 
To clone this repository and compile the first two: 8 | ```console 9 | git clone https://github.com/algbio/SRFAligner && cd SRFAligner 10 | git submodule update --init tools/{sdsl-lite-v3,concurrentqueue} 11 | make 12 | ``` 13 | and `GraphAligner`'s executable is expected to be found in folder `tools/GraphAligner/bin`, so you can run command `git submodule update --init --recursive tools/GraphAligner` and follow its [compilation instructions](https://github.com/maickrau/GraphAligner?tab=readme-ov-file#compilation). If `GraphAligner` is already installed in your system, you can just modify the relative line in `SRFAligner` and `SRFChainer`: 14 | ```console 15 | sed --in-place '7s/.*/graphaligner=GraphAligner/' SRFAligner SRFChainer efg-memsAligner efg-ahocorasickAligner 16 | ``` 17 | Test the aligners with commands 18 | ```console 19 | ./SRFAligner -g test/graph1.gfa -f test/read1.fastq -a test/aln1.gaf 20 | ./SRFChainer -g test/graph2.gfa -f test/read2.fastq -a test/aln2.gaf 21 | ``` 22 | 23 | ## prototype aligners 24 | To use MEM seeds computed by `efg-mems`, `efg-memsAligner` expects `efg-mems`'s executable to be in `tools/efg-mems/efg-mems` and [`seqtk`](https://github.com/lh3/seqtk) to be in `tools/seqtk`: 25 | ```console 26 | git submodule update --init --recursive {tools/efg-mems,tools/seqtk} 27 | make -C tools/seqtk 28 | cd tools/efg-mems/sdsl-lite 29 | ./install.sh . 30 | cd .. 31 | cmake . 
32 | make 33 | ``` 34 | 35 | Analogously, to use full node seeds computed by [`daachorse`](https://github.com/daac-tools/daachorse) (Aho-Corasick automaton of the node labels, requires Rust >= 1.61), `efg-ahocorasickAligner` expects `efg-ahocorasick` and `extractor` to be in `tools/efg-ahocorasick`, and [`seqtk`](https://github.com/lh3/seqtk) to be in `tools/seqtk`: 36 | 37 | ```console 38 | git submodule update --init --recursive {tools/daachorse,tools/seqtk} 39 | make -C tools/seqtk 40 | make -C tools/efg-ahocorasick 41 | ``` 42 | -------------------------------------------------------------------------------- /SRFAligner: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 5 | 6 | # executables' absolute paths/commands (make sure they work!) 7 | graphaligner=$thisfolder/tools/GraphAligner/bin/GraphAligner 8 | efglocate=$thisfolder/tools/efg-locate/efg-locate 9 | 10 | # default params 11 | workingfolder="." 
12 | threads=8 # threads 13 | edgemincount=0 # semi-repeat-free seeds only 14 | edgelongestcount=0 15 | extendoptions="--max-cluster-extend 5 -b 10" # GraphAligner default extend options 16 | discardoption="(substr(\$16,6) > 0.90) && ((\$4-\$3)*100/\$2 >= 50)" # using -c option, discard alignments with identity <= 90% or read coverage < 50% 17 | 18 | print_help() 19 | { 20 | echo "Pipeline to align long reads to indexable Elastic Founder Graphs based on semi-repeat-free seeds" 21 | echo "usage: SRFAligner -g graph.gfa -f reads.fastq -a alignments.gaf" 22 | echo " -h --help: show this screen" 23 | echo " -g graph.gfa: semi-repeat-free EFG in xGFA format" 24 | echo " -f reads.fastq: reads in FASTQ format" 25 | echo " -a alignmentsout.gaf: output alignments in GAF format" 26 | echo " -t threads: # of threads" 27 | echo " -i IGNORECHARS : ignore the following characters for indexability/seed finding" 28 | echo " -c : discard bad alignments (identity <= 0.9 or read coverage < 50%) and run GraphAligner on unaligned reads" 29 | echo " -p : disable pipeline mode and save the seeds in working folder" 30 | echo " -e : make GraphAligner consider each single seed (cluster) for extension" 31 | echo " -m edgemincount: heuristic parameter for seed computation (see efg-locate)" 32 | echo " -o edgelongestcount: heuristic parameter for seed computation (see efg-locate)" 33 | echo " -w path: working folder for output and temporary files" 34 | } 35 | 36 | # https://stackoverflow.com/questions/12022592/how-can-i-use-long-options-with-the-bash-getopts-builtin 37 | for arg in "$@"; do 38 | shift 39 | case "$arg" in 40 | '--help') set -- "$@" '-h' ;; 41 | *) set -- "$@" "$arg" ;; 42 | esac 43 | done 44 | 45 | while getopts "hepcg:f:a:t:w:m:i:o:" option; do 46 | case $option in 47 | h) # display help 48 | print_help 49 | exit;; 50 | g) # graph 51 | argg=true 52 | graph="$OPTARG" ;; 53 | f) # fastq reads 54 | argf=true 55 | reads="$OPTARG" ;; 56 | a) # output 57 | arga=true 58 | 
alignmentsout="$OPTARG" ;; 59 | t) # threads 60 | argt=true 61 | threads="$OPTARG" ;; 62 | i) # ignorechars 63 | argi=true 64 | ignorechars="$OPTARG" ;; 65 | w) # working folder 66 | argw=true 67 | workingfolder="$OPTARG" ;; 68 | c) # discard short alignment heuristic flag 69 | argc=true ;; 70 | p) # disable pipeline mode flag 71 | argp=true ;; 72 | e) # extend all clusters 73 | arge=true ;; 74 | m) # edgemincount parameter 75 | argm=true 76 | edgemincount="$OPTARG" ;; 77 | o) # edgelongestcount parameter 78 | argo=true 79 | edgelongestcount="$OPTARG" ;; 80 | \?) # invalid option 81 | echo "Error: Invalid option" 82 | print_help 83 | exit;; 84 | esac 85 | done 86 | if [[ "$argg" != true ]] || [[ "$argf" != true ]] ; then 87 | print_help 88 | exit 89 | fi 90 | 91 | ignorecharsarg="" 92 | if [[ "$argi" = true ]] ; then 93 | ignorecharsarg="--ignore-chars=$ignorechars" 94 | fi 95 | 96 | if [[ "$arge" = true ]] ; then 97 | extendoptions="--max-cluster-extend -1 --multimap-score-fraction 0.00 -b 10" 98 | fi 99 | 100 | # move to working folder 101 | if [[ "$argw" = true ]] ; then 102 | workingfolder="${workingfolder%/}" 103 | else 104 | workingfolder="." 
105 | fi 106 | 107 | if [[ "$argp" = true ]] ; then 108 | # find semi-repeat-free seeds 109 | $efglocate --approximate --split-output-matches-graphaligner --reverse-complement --overwrite \ 110 | $ignorecharsarg \ 111 | --threads $threads \ 112 | --approximate-edge-match-min-count $edgemincount \ 113 | --approximate-edge-match-longest $edgelongestcount \ 114 | $graph \ 115 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 116 | "$workingfolder/$(basename $reads)_srf_seeds.gaf" 117 | 118 | # GraphAligner extend 119 | $graphaligner $extendoptions \ 120 | -t $threads \ 121 | -g $graph \ 122 | -f $reads \ 123 | --realign "$workingfolder/$(basename $reads)_srf_seeds.gaf" \ 124 | -a $alignmentsout 125 | 126 | if [[ "$argc" = true ]] ; then 127 | echo -n "Filtering alignments with identity <= 90% or read coverage < 50%..." 128 | awk "{if ($discardoption) {print}}" "$alignmentsout" > "$workingfolder/filtered_alignments_$$.gaf" 129 | mv "$workingfolder/filtered_alignments_$$.gaf" "$alignmentsout" 130 | echo " done." 
131 | 132 | unalignedreads=$({ grep -v \ 133 | -f <(cut -f1 $alignmentsout | uniq) \ 134 | <(awk 'NR % 4 == 1' $reads | cut -d' ' -f1 | tr -d "@") || true; }) 135 | unalignedreadscount=$(echo -n "$unalignedreads" | wc -l) 136 | if [[ "$unalignedreadscount" -gt "0" ]] ; then 137 | grep -A 1 --no-group-separator \ 138 | -f <(echo "$unalignedreads") \ 139 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 140 | > "$workingfolder/$(basename $reads)_unaligned_reads.fasta" 141 | 142 | $graphaligner -x "vg" \ 143 | -t $threads \ 144 | -g $graph \ 145 | -f "$workingfolder/$(basename $reads)_unaligned_reads.fasta" \ 146 | -a "$workingfolder/$(basename $reads)_unaligned_reads.gaf" 147 | cat "$workingfolder/$(basename $reads)_unaligned_reads.gaf" >> $alignmentsout 148 | else 149 | echo "There are no unaligned reads to realign" 150 | fi 151 | fi 152 | else 153 | # pipeline of above commands 154 | 155 | if [[ "$argc" = true ]] ; then 156 | # set up final GraphAligner call 157 | fastapipe=$(mktemp -u --suffix ".fasta") 158 | mkfifo -m 600 $fastapipe 159 | trap '{ rm -f -- "$fastapipe"; }' EXIT 160 | 161 | # TODO find a way to not store as a separate file the unaligned alignments and directly append them 162 | $graphaligner -x "vg" \ 163 | -t $threads \ 164 | -g $graph \ 165 | -f $fastapipe \ 166 | -a "$workingfolder/unaligned_reads_$$.gaf" & 167 | # give GraphAligner a dummy "file", see https://github.com/maickrau/GraphAligner/issues/105 168 | set +eo pipefail ; { echo > $fastapipe & } ; set -eo pipefail 169 | fi 170 | 171 | $efglocate --approximate --split-output-matches-graphaligner --reverse-complement --overwrite \ 172 | $ignorecharsarg \ 173 | --threads $threads \ 174 | --approximate-edge-match-min-count $edgemincount \ 175 | --approximate-edge-match-longest $edgelongestcount \ 176 | $graph \ 177 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 178 | /dev/stdout | \ 179 | $graphaligner $extendoptions \ 180 | -t 
$threads \ 181 | -g $graph \ 182 | -f $reads \ 183 | --realign /dev/stdin \ 184 | -a $alignmentsout 185 | 186 | if [[ "$argc" = true ]] ; then 187 | echo -n "Filtering alignments with identity <= 90% or read coverage < 50%..." 188 | awk "{if ($discardoption) {print}}" "$alignmentsout" > "$workingfolder/filtered_alignments_$$.gaf" 189 | mv "$workingfolder/filtered_alignments_$$.gaf" "$alignmentsout" 190 | echo " done." 191 | 192 | unalignedreads=$({ grep -v \ 193 | -f <(cut -f1 $alignmentsout | uniq) \ 194 | <(awk 'NR % 4 == 1' $reads | cut -d' ' -f1 | tr -d "@") || true; }) 195 | unalignedreadscount=$(echo -n "$unalignedreads" | wc -l) 196 | if [[ "$unalignedreadscount" -gt "0" ]] ; then 197 | echo "There are the following unaligned reads:" 198 | echo "$unalignedreads" 199 | grep -A 1 --no-group-separator \ 200 | -f <(echo "$unalignedreads") \ 201 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 202 | > $fastapipe 203 | wait $(jobs -p) 204 | else 205 | echo "There are no unaligned reads to realign" 206 | set +eo pipefail ; { kill $(jobs -p) & } ; set -eo pipefail 207 | fi 208 | 209 | cat "$workingfolder/unaligned_reads_$$.gaf" >> $alignmentsout 210 | rm "$workingfolder/unaligned_reads_$$.gaf" 211 | fi 212 | fi 213 | -------------------------------------------------------------------------------- /SRFChainer: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 5 | 6 | # executables' absolute paths/commands (make sure they work!) 
7 | graphaligner=$thisfolder/tools/GraphAligner/bin/GraphAligner 8 | efglocate=$thisfolder/tools/efg-locate/efg-locate 9 | chainxblockgraph=$thisfolder/tools/ChainX-block-graph/chainx-block-graph 10 | 11 | # default params 12 | workingfolder="." 13 | threads=8 # threads 14 | edgemincount=0 # semi-repeat-free seeds only 15 | edgelongestcount=0 16 | alternativechains=0 17 | chainingguess="--initial-guess-coverage=0.5 --ramp-up-factor=1.5" 18 | extendoptions="--max-cluster-extend 5 -b 10" # GraphAligner default extend options 19 | discardoption="(substr(\$16,6) > 0.90) && ((\$4-\$3)*100/\$2 >= 50)" # using -c option, discard alignments with identity <= 90% or read coverage < 50% 20 | 21 | # load balancing 22 | locatework=(32 32 22 14 14 11 10 9 8 7 6) 23 | chainwork=(8 25 32 32 32 32 32 32 32 32 32) 24 | 25 | print_help() 26 | { 27 | echo "Pipeline to align long reads to indexable Elastic Founder Graphs based on chained semi-repeat-free seeds" 28 | echo "usage: SRFChainer -g graph.gfa -f reads.fastq -a alignments.gaf" 29 | echo " -h --help: show this screen" 30 | echo " -g graph.gfa: semi-repeat-free EFG in xGFA format" 31 | echo " -f reads.fastq: reads in FASTQ format" 32 | echo " -a alignmentsout.gaf: output alignments in GAF format" 33 | echo " -t threads: # of threads" 34 | echo " -i IGNORECHARS : ignore the following characters for indexability/seed finding" 35 | echo " -c : discard bad alignments (identity <= 0.9 or read coverage < 50%) and run GraphAligner on unaligned reads" 36 | echo " -p : disable pipeline mode and save the seeds in working folder" 37 | echo " -e : make GraphAligner consider each single seed (cluster) for extension" 38 | echo " -m edgemincount: heuristic parameter for seed computation (see efg-locate)" 39 | echo " -o edgelongestcount: heuristic parameter for seed computation (see efg-locate)" 40 | echo " -n altchains : heuristic parameter to chain an additional n times for each strand" 41 | echo " -w path: working folder for output and 
temporary files" 42 | } 43 | 44 | load_balance() 45 | { 46 | m=$2 47 | if (( $m >= ${#locatework[@]} )) 48 | then 49 | m=$(( ${#locatework[@]} - 1 )) 50 | fi 51 | 52 | 53 | threads1=$(( $1 * ${chainwork[$m]} / (${chainwork[$m]} + ${locatework[$m]}))) 54 | threads1=$(( threads1 >= $1 ? $1 - 1 : threads1 )) 55 | threads1=$(( threads1 < 1 ? 1 : threads1 )) 56 | 57 | echo "$(($1 - $threads1))" 58 | } 59 | 60 | # https://stackoverflow.com/questions/12022592/how-can-i-use-long-options-with-the-bash-getopts-builtin 61 | for arg in "$@"; do 62 | shift 63 | case "$arg" in 64 | '--help') set -- "$@" '-h' ;; 65 | *) set -- "$@" "$arg" ;; 66 | esac 67 | done 68 | 69 | while getopts "hepcg:f:a:t:w:m:i:n:o:" option; do 70 | case $option in 71 | h) # display help 72 | print_help 73 | exit;; 74 | g) # graph 75 | argg=true 76 | graph="$OPTARG" ;; 77 | f) # fastq reads 78 | argf=true 79 | reads="$OPTARG" ;; 80 | a) # output 81 | arga=true 82 | alignmentsout="$OPTARG" ;; 83 | t) # threads 84 | argt=true 85 | threads="$OPTARG" ;; 86 | i) # ignorechars 87 | argi=true 88 | ignorechars="$OPTARG" ;; 89 | w) # working folder 90 | argw=true 91 | workingfolder="$OPTARG" ;; 92 | c) # discard short alignment heuristic flag 93 | argc=true ;; 94 | p) # disable pipeline mode flag 95 | argp=true ;; 96 | e) # extend all clusters 97 | arge=true ;; 98 | m) # edgemincount parameter 99 | argm=true 100 | edgemincount="$OPTARG" ;; 101 | o) # edgelongestcount parameter 102 | argo=true 103 | edgelongestcount="$OPTARG" ;; 104 | n) # alternative chains parameter 105 | argn=true 106 | alternativechains="$OPTARG" ;; 107 | \?) # invalid option 108 | echo "Error: Invalid option" 109 | print_help 110 | exit;; 111 | esac 112 | done 113 | if [[ "$argg" != true ]] || [[ "$argf" != true ]] ; then 114 | print_help 115 | exit 116 | fi 117 | 118 | if [[ "$argp" != true ]] && [[ "$threads" -lt 2 ]] ; then 119 | echo "Please pick at least 2 threads in pipeline mode!" 
120 | exit 1 121 | fi 122 | 123 | ignorecharsarg="" 124 | if [[ "$argi" = true ]] ; then 125 | ignorecharsarg="--ignore-chars=$ignorechars" 126 | fi 127 | 128 | if [[ "$arge" = true ]] ; then 129 | extendoptions="--max-cluster-extend -1 --multimap-score-fraction 0.00 -b 10" 130 | fi 131 | 132 | # move to working folder 133 | if [[ "$argw" = true ]] ; then 134 | workingfolder="${workingfolder%/}" 135 | else 136 | workingfolder="." 137 | fi 138 | 139 | if [[ "$argp" = true ]] ; then 140 | # find semi-repeat-free seeds 141 | $efglocate --approximate --split-output-matches --reverse-complement --rename-reverse-complement --overwrite \ 142 | $ignorecharsarg \ 143 | --threads $threads \ 144 | --approximate-edge-match-min-count $edgemincount \ 145 | --approximate-edge-match-longest $edgelongestcount \ 146 | $graph \ 147 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 148 | "$workingfolder/$(basename $reads)_srfchain_seeds.gaf" 149 | 150 | # chainx-block-graph chain 151 | $chainxblockgraph --semi-global --split-output-matches-graphaligner --overwrite $chainingguess \ 152 | --alternative-chains $alternativechains \ 153 | --threads $threads \ 154 | $graph \ 155 | "$workingfolder/$(basename $reads)_srfchain_seeds.gaf" \ 156 | "$workingfolder/$(basename $reads)_srfchain_chain.gaf" 157 | 158 | # GraphAligner extend 159 | $graphaligner $extendoptions \ 160 | -t $threads \ 161 | -g $graph \ 162 | -f $reads \ 163 | --realign "$workingfolder/$(basename $reads)_srfchain_chain.gaf" \ 164 | -a $alignmentsout 165 | 166 | if [[ "$argc" = true ]] ; then 167 | echo -n "Filtering alignments with identity <= 90% or read coverage < 50%..." 168 | awk "{if ($discardoption) {print}}" "$alignmentsout" > "$workingfolder/filtered_alignments_$$.gaf" 169 | mv "$workingfolder/filtered_alignments_$$.gaf" "$alignmentsout" 170 | echo " done." 
171 | 172 | unalignedreads=$({ grep -v \ 173 | -f <(cut -f1 $alignmentsout | uniq) \ 174 | <(awk 'NR % 4 == 1' $reads | cut -d' ' -f1 | tr -d "@") || true; }) 175 | unalignedreadscount=$(echo -n "$unalignedreads" | wc -l) 176 | if [[ "$unalignedreadscount" -gt "0" ]] ; then 177 | grep -A 1 --no-group-separator \ 178 | -f <(echo "$unalignedreads") \ 179 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 180 | > "$workingfolder/$(basename $reads)_unaligned_reads.fasta" 181 | 182 | $graphaligner -x "vg" \ 183 | -t $threads \ 184 | -g $graph \ 185 | -f "$workingfolder/$(basename $reads)_unaligned_reads.fasta" \ 186 | -a "$workingfolder/$(basename $reads)_unaligned_reads.gaf" 187 | cat "$workingfolder/$(basename $reads)_unaligned_reads.gaf" >> $alignmentsout 188 | else 189 | echo "There are no unaligned reads to realign" 190 | fi 191 | fi 192 | else 193 | # pipeline of above commands 194 | 195 | if [[ "$argc" = true ]] ; then 196 | # set up final GraphAligner call 197 | fastapipe=$(mktemp -u --suffix ".fasta") 198 | mkfifo -m 600 $fastapipe 199 | trap '{ rm -f -- "$fastapipe"; }' EXIT 200 | 201 | # TODO find a way to not store on disk the unaligned alignments and directly append them 202 | $graphaligner -x "vg" \ 203 | -t $threads \ 204 | -g $graph \ 205 | -f $fastapipe \ 206 | -a "$workingfolder/unaligned_reads_$$.gaf" & 207 | # give GraphAligner a dummy "file", see https://github.com/maickrau/GraphAligner/issues/105 208 | set +eo pipefail ; { echo > $fastapipe & } ; set -eo pipefail 209 | fi 210 | 211 | efglocatethreads=$(( $threads / 2 )) 212 | chainxthreads=$(( $threads - $efglocatethreads )) 213 | echo "load balance: $efglocatethreads for locate, $chainxthreads for chaining" 214 | $efglocate --approximate --split-output-matches --reverse-complement --rename-reverse-complement --overwrite \ 215 | --threads $efglocatethreads \ 216 | --approximate-edge-match-min-count $edgemincount \ 217 | --approximate-edge-match-longest $edgelongestcount 
\ 218 | $graph \ 219 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 220 | /dev/stdout | \ 221 | $chainxblockgraph --semi-global --split-output-matches-graphaligner --overwrite $chainingguess \ 222 | --threads $chainxthreads \ 223 | --alternative-chains $alternativechains \ 224 | $graph \ 225 | /dev/stdin \ 226 | /dev/stdout | \ 227 | $graphaligner $extendoptions \ 228 | -t $threads \ 229 | -g $graph \ 230 | -f $reads \ 231 | --realign /dev/stdin \ 232 | -a $alignmentsout 233 | 234 | if [[ "$argc" = true ]] ; then 235 | echo -n "Filtering alignments with identity <= 90% or read coverage < 50%..." 236 | awk "{if ($discardoption) {print}}" "$alignmentsout" > "$workingfolder/filtered_alignments_$$.gaf" 237 | mv "$workingfolder/filtered_alignments_$$.gaf" "$alignmentsout" 238 | echo " done." 239 | 240 | unalignedreads=$({ grep -v \ 241 | -f <(cut -f1 $alignmentsout | uniq) \ 242 | <(awk 'NR % 4 == 1' $reads | cut -d' ' -f1 | tr -d "@") || true; }) 243 | unalignedreadscount=$(echo -n "$unalignedreads" | wc -l) 244 | if [[ "$unalignedreadscount" -gt "0" ]] ; then 245 | echo "There are the following unaligned reads:" 246 | echo "$unalignedreads" 247 | grep -A 1 --no-group-separator \ 248 | -f <(echo "$unalignedreads") \ 249 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 250 | > $fastapipe 251 | wait $(jobs -p) 252 | else 253 | echo "There are no unaligned reads to realign" 254 | set +eo pipefail ; { kill $(jobs -p) & } ; set -eo pipefail 255 | fi 256 | 257 | cat "$workingfolder/unaligned_reads_$$.gaf" >> $alignmentsout 258 | rm "$workingfolder/unaligned_reads_$$.gaf" 259 | fi 260 | fi 261 | -------------------------------------------------------------------------------- /docs/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algbio/SRFAligner/c698b10ce1b3f5ec20ef5f218f6d0259eebf7911/docs/workflow.png 
-------------------------------------------------------------------------------- /efg-ahocorasickAligner: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 5 | 6 | # executables' absolute paths/commands (make sure they work!) 7 | graphaligner=$thisfolder/tools/GraphAligner/bin/GraphAligner 8 | ahocorasick=$thisfolder/tools/efg-ahocorasick/efg-ahocorasick 9 | extractor=$thisfolder/tools/efg-ahocorasick/extractor 10 | efggafsplitter=$thisfolder/tools/efg-gaf-splitter/efg-gaf-splitter 11 | seqtk=$thisfolder/tools/seqtk/seqtk 12 | 13 | # default params 14 | workingfolder="." 15 | extendoptions="--max-cluster-extend 5 -b 10" # GraphAligner default extend options 16 | 17 | print_help() 18 | { 19 | echo "Pipeline to align long reads to indexable Elastic Founder Graphs, based on srf seeds" 20 | echo "Currently, only 1 compute thread is supported, and all intermediate seeds" 21 | echo " are saved in the working folder" 22 | echo " -h --help: show this screen" 23 | echo " -g graph.gfa: semi-repeat-free EFG in xGFA format" 24 | echo " -f reads.fastq: reads in FASTQ format" 25 | echo " -a alignmentsout.gaf: output alignments in GAF format" 26 | echo " -w path: working folder for output and temporary files" 27 | } 28 | 29 | # https://stackoverflow.com/questions/12022592/how-can-i-use-long-options-with-the-bash-getopts-builtin 30 | for arg in "$@"; do 31 | shift 32 | case "$arg" in 33 | '--help') set -- "$@" '-h' ;; 34 | *) set -- "$@" "$arg" ;; 35 | esac 36 | done 37 | 38 | while getopts "hg:f:a:w:" option; do 39 | case $option in 40 | h) # display help 41 | print_help 42 | exit;; 43 | g) # graph 44 | argg=true 45 | graph="$OPTARG" ;; 46 | f) # fastq reads 47 | argf=true 48 | reads="$OPTARG" ;; 
49 | a) # output 50 | arga=true 51 | alignmentsout="$OPTARG" ;; 52 | w) # working folder 53 | argw=true 54 | workingfolder="$OPTARG" ;; 55 | \?) # invalid option 56 | echo "Error: Invalid option" 57 | print_help 58 | exit;; 59 | esac 60 | done 61 | 62 | if [[ "$argg" != true ]] || [[ "$argf" != true ]] ; then 63 | print_help 64 | exit 65 | fi 66 | 67 | # move to working folder 68 | if [[ "$argw" = true ]] ; then 69 | workingfolder="${workingfolder%/}" 70 | else 71 | workingfolder="." 72 | fi 73 | 74 | # extract nodes and ids 75 | $extractor $graph > "$workingfolder/nodes.txt" 2> "$workingfolder/nodeids.txt" 76 | 77 | # find efg-mems seeds 78 | # ahocorasick node_labels node_ids fasta_reads fasta_ids 79 | $ahocorasick \ 80 | "$workingfolder/nodes.txt" \ 81 | "$workingfolder/nodeids.txt" \ 82 | <(cat \ 83 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 84 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1 | sed 's/^>/>rev_/g' | $seqtk seq -r) \ 85 | | grep -v "^>") \ 86 | <(cat \ 87 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 88 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1 | sed 's/^>/>rev_/g' | $seqtk seq -r) \ 89 | | grep "^>" | tr -d ">") \ 90 | > "$workingfolder/ahocorasick_seeds_pre_split.gaf" 91 | 92 | # split seeds and reverse if needed 93 | $efggafsplitter \ 94 | $graph \ 95 | "$workingfolder/ahocorasick_seeds_pre_split.gaf" \ 96 | > "$workingfolder/ahocorasick_seeds.gaf" 97 | 98 | # GraphAligner extend 99 | $graphaligner $extendoptions \ 100 | -t 1 \ 101 | -g $graph \ 102 | -f $reads \ 103 | --realign "$workingfolder/ahocorasick_seeds.gaf" \ 104 | -a $alignmentsout 105 | -------------------------------------------------------------------------------- /efg-memsAligner: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | thisfolder=$( cd -- "$( dirname -- 
"${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 5 | 6 | # executables' absolute paths/commands (make sure they work!) 7 | graphaligner=$thisfolder/tools/GraphAligner/bin/GraphAligner 8 | efgmems=$thisfolder/tools/efg-mems/efg-mems 9 | efggafsplitter=$thisfolder/tools/efg-gaf-splitter/efg-gaf-splitter 10 | seqtk=$thisfolder/tools/seqtk/seqtk 11 | 12 | # default params 13 | workingfolder="." 14 | extendoptions="--max-cluster-extend 5 -b 10" # GraphAligner default extend options 15 | 16 | print_help() 17 | { 18 | echo "Pipeline to align long reads to indexable Elastic Founder Graphs, based on MEMs" 19 | echo "Currently, only 1 compute thread is supported, and all intermediate seeds" 20 | echo " are saved in the working folder" 21 | echo " -h --help: show this screen" 22 | echo " -g graph.gfa: semi-repeat-free EFG in xGFA format" 23 | echo " -f reads.fastq: reads in FASTQ format" 24 | echo " -a alignmentsout.gaf: output alignments in GAF format" 25 | echo " -w path: working folder for output and temporary files" 26 | } 27 | 28 | # https://stackoverflow.com/questions/12022592/how-can-i-use-long-options-with-the-bash-getopts-builtin 29 | for arg in "$@"; do 30 | shift 31 | case "$arg" in 32 | '--help') set -- "$@" '-h' ;; 33 | *) set -- "$@" "$arg" ;; 34 | esac 35 | done 36 | 37 | while getopts "hg:f:a:w:" option; do 38 | case $option in 39 | h) # display help 40 | print_help 41 | exit;; 42 | g) # graph 43 | argg=true 44 | graph="$OPTARG" ;; 45 | f) # fastq reads 46 | argf=true 47 | reads="$OPTARG" ;; 48 | a) # output 49 | arga=true 50 | alignmentsout="$OPTARG" ;; 51 | w) # working folder 52 | argw=true 53 | workingfolder="$OPTARG" ;; 54 | \?) 
# invalid option 55 | echo "Error: Invalid option" 56 | print_help 57 | exit;; 58 | esac 59 | done 60 | 61 | if [[ "$argg" != true ]] || [[ "$argf" != true ]] ; then 62 | print_help 63 | exit 64 | fi 65 | 66 | # move to working folder 67 | if [[ "$argw" = true ]] ; then 68 | workingfolder="${workingfolder%/}" 69 | else 70 | workingfolder="." 71 | fi 72 | 73 | # find efg-mems seeds 74 | $efgmems -a "ACGTXN#0" -k 20 --indexing --bdbwt \ 75 | -o "$workingfolder/efgmems_seeds_pre_split.gaf" \ 76 | <(cat <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1 | sed 's/^>/>rev_/g' | $seqtk seq -r) | tr "N" "X") \ 77 | $graph 78 | 79 | # split seeds and reverse if needed 80 | $efggafsplitter \ 81 | $graph \ 82 | "$workingfolder/efgmems_seeds_pre_split.gaf" \ 83 | > "$workingfolder/efgmems_seeds.gaf" 84 | 85 | # GraphAligner extend 86 | $graphaligner $extendoptions \ 87 | -t 1 \ 88 | -g $graph \ 89 | -f $reads \ 90 | --realign "$workingfolder/efgmems_seeds.gaf" \ 91 | -a $alignmentsout 92 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation of sequence-to-graph aligners on simulated reads 2 | Based on the [GraphChainer evaluation pipeline](https://github.com/algbio/GraphChainer-scripts). 3 | 4 | ## Prerequisites 5 | Install [miniconda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html). Then, set up the environment `aligner-evaluation` defined in `environment.yml`: 6 | ``` 7 | conda env create -f environment.yml 8 | ``` 9 | The pipeline expects to find `GraphChainer` and `badread-runner.py` in folders `tools/GraphChainer/bin` and `tools/Badread` from the root of the repository, so download them and compile `GraphChainer` if you have not. 
10 | ``` 11 | git submodule update --init --recursive ../../tools/Badread 12 | git submodule update --init --recursive ../../tools/GraphChainer 13 | # following GraphChainer compile instructions 14 | cd ../../tools/GraphChainer 15 | conda env create -f CondaEnvironment.yml 16 | conda activate GraphChainer 17 | make bin/GraphChainer 18 | 19 | cd ../../experiments/aligner-evaluation 20 | conda activate aligner-evaluation 21 | ``` 22 | The scripts `runexp.sh` expect `/usr/bin/time`, `openssl`, and `awk` to be installed (and the `aligner-evaluation` conda environment to be active). 23 | 24 | ## Datasets 25 | | Graph | Construction | Input | 26 | |-----------------------------|----------------------------------------|-------| 27 | | chr22\_iEFG | `vcf2multialign` + `founderblockgraph` | [T2T-CHM13v2.0](https://github.com/marbl/CHM13) + [Phased T2T 1KGP panel](https://zenodo.org/records/7612953#.Y-8VD3bMJPY) 28 | | chr22\_vg | `vg` | same as above | 29 | | chr22_vg_msa | `vcf2multialign` + `vg -M` | same as above | 30 | | covid19_100_iEFG_simplified | `founderblockgraph` + `efg-simplify` | [MSA from efg-mems experiments](https://github.com/algbio/efg-mems) 31 | 32 | All experiments except for `vg-comparison` and `mems` use the chromosome 22 iEFG built with the `vcf-to-hapl-to-efg` pipeline as file `input/chr22_iEFG.gfa`. 
You can get the graph from [zenodo](https://doi.org/10.5281/zenodo.14012882) as follows: 33 | ``` 34 | wget https://zenodo.org/records/14012882/files/chr22_iEFG.gfa.gz?download=1 --output-document=input/chr22_iEFG.gfa.gz 35 | gunzip input/chr22_iEFG.gfa.gz 36 | ``` 37 | The `mems` experiment uses a SARS-CoV-2 MSA of 100 strains (NCBI accession numbers in `input/covid19_100_acc.txt`) aligned with [ViralMSA](https://github.com/niemasd/ViralMSA): 38 | ``` 39 | wget https://zenodo.org/records/14012882/files/covid19_100_iEFG_simplified.gfa.gz?download=1 --output-document=input/covid19_100_iEFG_simplified.gfa.gz 40 | gunzip input/covid19_100_iEFG_simplified.gfa.gz 41 | ``` 42 | Finally, the `vg-comparison` experiment additionally uses the `vg` graphs built from the 1KGP phased VCF file or the MSA obtained with `vcf2multialign` from the same data: 43 | ``` 44 | wget https://zenodo.org/records/14012882/files/chr22_vg.gfa.gz?download=1 --output-document=input/chr22_vg.gfa.gz 45 | gunzip input/chr22_vg.gfa.gz 46 | wget https://zenodo.org/records/14012882/files/chr22_vg_msa.gfa.gz?download=1 --output-document=input/chr22_vg_msa.gfa.gz 47 | gunzip input/chr22_vg_msa.gfa.gz 48 | ``` 49 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/ahocorasick/runexp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # comparison of SRFAligner and efg-ahocorasickAligner 3 | # remember to run `conda activate aligner-evaluation` before execution 4 | 5 | set -e 6 | set -o pipefail 7 | 8 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 9 | cd $thisfolder 10 | 11 | # executable's absolute paths/commands (make sure they work!) 
12 | graphaligner=$thisfolder/../../../tools/GraphAligner/bin/GraphAligner 13 | srfaligner=$thisfolder/../../../SRFAligner 14 | ahocorasickaligner=$thisfolder/../../../efg-ahocorasickAligner 15 | usrbintime=/usr/bin/time 16 | 17 | # params 18 | inputgraph=$thisfolder/../input/chr22_iEFG.gfa 19 | coverage=30 20 | threads=1 21 | 22 | # 0. setup 23 | mkdir output 24 | echo -n > output/runexp_log.txt 25 | set -a 26 | set +a 27 | 28 | # 1. simulate path and reads 29 | # uncomment the following 3 lines and comment the following ones to use the reads in semi-repeat-free 30 | #ln -s $thisfolder/../semi-repeat-free/output/sim_reads.fastq output 31 | #ln -s $thisfolder/../semi-repeat-free/output/sim_reads_path.fasta output 32 | #ln -s $thisfolder/../semi-repeat-free/output/sim_reads_path.nodes output 33 | python3 ../scripts/generate_sim_reads.py \ 34 | --graph $inputgraph \ 35 | --fastq output/sim_reads.fastq \ 36 | --seed $(openssl enc -aes-256-ctr -pass pass:"semirepeatfree" -nosalt /dev/null | shuf -i0-4294967295 -n 1 --random-source=/dev/stdin) \ 37 | --path output/sim_reads_path.nodes \ 38 | --fasta output/sim_reads_path.fasta \ 39 | --coverage $coverage \ 40 | 2>> output/runexp_log.txt >> output/runexp_log.txt 41 | 42 | # 2. run the aligners 43 | $usrbintime $srfaligner \ 44 | -t $threads \ 45 | -g $inputgraph \ 46 | -m 0 \ 47 | -p \ 48 | -f output/sim_reads.fastq \ 49 | -a output/semi_repeat_free_alignments.gaf \ 50 | -w output \ 51 | 2>> output/runexp_log.txt >> output/runexp_log.txt 52 | 53 | $usrbintime $ahocorasickaligner \ 54 | -g $inputgraph \ 55 | -f output/sim_reads.fastq \ 56 | -a output/ahocorasick_alignments.gaf \ 57 | -w output \ 58 | 2>> output/runexp_log.txt >> output/runexp_log.txt 59 | 60 | # 3. 
pick first alignment 61 | for alignment in "ahocorasick_alignments.gaf" "semi_repeat_free_alignments.gaf" 62 | do 63 | awk '{if (found[$1] == "1") \ 64 | {} \ 65 | else 66 | {found[$1]="1"; print}}' \ 67 | output/$alignment > output/best_$alignment 68 | done 69 | 70 | # 4. validate and plot results 71 | for alignment in semi_repeat_free ahocorasick 72 | do 73 | python3 ../scripts/compute_summary.py \ 74 | -t 3 \ 75 | --graph $inputgraph \ 76 | --fastq output/sim_reads.fastq \ 77 | --path output/sim_reads_path.nodes \ 78 | --fasta output/sim_reads_path.fasta \ 79 | --alignments output/best_${alignment}_alignments.gaf \ 80 | --metrics output/metrics_${alignment}.mts & 81 | done 82 | wait $(jobs -p) 83 | 84 | python3 ../scripts/compute_metrics.py \ 85 | --output-name output/results \ 86 | --summaries output/*.mts \ 87 | --summaries-names $(basename -s ".mts" -a output/*.mts | sed 's/metrics_//') 88 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/chaining/runexp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # remember to run `conda activate aligner-evaluation` 3 | set -e 4 | set -o pipefail 5 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 6 | cd $thisfolder 7 | 8 | # executable's absolute paths/commands (make sure they work!) 9 | BADREAD=$thisfolder/../../../tools/Badread/badread-runner.py 10 | GRAPHCHAINER=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 11 | srfchainer=$thisfolder/../../../SRFChainer 12 | usrbintime=/usr/bin/time 13 | 14 | # params 15 | inputgraph=$thisfolder/../input/chr22_iEFG.gfa 16 | coverage=30 17 | threads=64 18 | 19 | # 0. setup 20 | mkdir output 21 | echo -n > output/runexp_log.txt 22 | set -a 23 | set +a 24 | 25 | # 1. 
simulate path and reads 26 | # comment the following 3 lines and uncomment the following ones to generate the reads (again) 27 | ln -s $thisfolder/../semi-repeat-free/output/sim_reads.fastq output 28 | ln -s $thisfolder/../semi-repeat-free/output/sim_reads_path.fasta output 29 | ln -s $thisfolder/../semi-repeat-free/output/sim_reads_path.nodes output 30 | #python3 ../scripts/generate_sim_reads.py \ 31 | # --graph $inputgraph \ 32 | # --fastq output/sim_reads.fastq \ 33 | # --seed $(openssl enc -aes-256-ctr -pass pass:"semirepeatfree" -nosalt /dev/null | shuf -i0-4294967295 -n 1 --random-source=/dev/stdin) \ 34 | # --path output/sim_reads_path.nodes \ 35 | # --fasta output/sim_reads_path.fasta \ 36 | # --coverage $coverage \ 37 | # 2>> output/runexp_log.txt >> output/runexp_log.txt 38 | 39 | # 2. run the aligners 40 | $usrbintime $srfchainer \ 41 | -t $threads \ 42 | -g $inputgraph \ 43 | -f output/sim_reads.fastq \ 44 | -a output/semi_repeat_free_chain_alignments.gaf \ 45 | 2>> output/runexp_log.txt >> output/runexp_log.txt 46 | 47 | for o in 1 5 10 50 100 48 | do 49 | $usrbintime $srfchainer \ 50 | -t $threads \ 51 | -g $inputgraph \ 52 | -o $o \ 53 | -f output/sim_reads.fastq \ 54 | -a output/srf_edge_longest_${o}_chain_alignments.gaf \ 55 | 2>> output/runexp_log.txt >> output/runexp_log.txt 56 | done 57 | 58 | # 3. pick first alignment 59 | for alignment in "semi_repeat_free_chain_alignments.gaf" 60 | do 61 | awk '{if (found[$1] == "1") \ 62 | {} \ 63 | else 64 | {found[$1]="1"; print}}' \ 65 | output/$alignment > output/best_$alignment 66 | done 67 | for o in 1 5 10 50 100 68 | do 69 | awk '{if (found[$1] == "1") \ 70 | {} \ 71 | else 72 | {found[$1]="1"; print}}' \ 73 | output/srf_edge_longest_${o}_chain_alignments.gaf > output/best_srf_edge_longest_${o}_chain_alignments.gaf 74 | done 75 | 76 | # 4. 
validate and plot results 77 | for alignment in semi_repeat_free_chain 78 | do 79 | python3 ../scripts/compute_summary.py \ 80 | -t 3 \ 81 | --graph $inputgraph \ 82 | --fastq output/sim_reads.fastq \ 83 | --path output/sim_reads_path.nodes \ 84 | --fasta output/sim_reads_path.fasta \ 85 | --alignments output/best_${alignment}_alignments.gaf \ 86 | --metrics output/metrics_${alignment}.mts 87 | done 88 | for o in 1 5 10 50 100 89 | do 90 | python3 ../scripts/compute_summary.py \ 91 | -t 3 \ 92 | --graph $inputgraph \ 93 | --fastq output/sim_reads.fastq \ 94 | --path output/sim_reads_path.nodes \ 95 | --fasta output/sim_reads_path.fasta \ 96 | --alignments output/best_srf_edge_longest_${o}_chain_alignments.gaf \ 97 | --metrics output/metrics_srf_edge_longest_${o}_chain.mts 98 | done 99 | wait $(jobs -p) 100 | 101 | python3 ../scripts/compute_metrics.py \ 102 | --output-name output/results \ 103 | --summaries output/*.mts \ 104 | --summaries-names $(basename -s ".mts" -a output/*.mts | sed 's/metrics_//') 105 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/environment.yml: -------------------------------------------------------------------------------- 1 | name: aligner-evaluation 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - https://conda.anaconda.org/gurobi 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=main 9 | - _openmp_mutex=5.1=1_gnu 10 | - brotli=1.0.9=h166bdaf_7 11 | - brotli-bin=1.0.9=h166bdaf_7 12 | - bzip2=1.0.8=h7b6447c_0 13 | - ca-certificates=2022.12.7=ha878542_0 14 | - certifi=2022.12.7=pyhd8ed1ab_0 15 | - click=8.1.3=unix_pyhd8ed1ab_2 16 | - contourpy=1.0.5=py310hdb19cb5_0 17 | - cycler=0.11.0=pyhd8ed1ab_0 18 | - dbus=1.13.18=hb2f20db_0 19 | - expat=2.2.10=h9c3ff4c_0 20 | - fontconfig=2.14.1=hef1e5e3_0 21 | - fonttools=4.25.0=pyhd3eb1b0_0 22 | - freetype=2.10.4=h0708190_1 23 | - giflib=5.2.1=h36c2ea0_2 24 | - glib=2.69.1=h4ff587b_1 25 | - 
gst-plugins-base=1.14.1=h6a678d5_1 26 | - gstreamer=1.14.1=h5eee18b_1 27 | - icu=58.2=hf484d3e_1000 28 | - joblib=1.1.1=py310h06a4308_0 29 | - jpeg=9e=h166bdaf_1 30 | - keyutils=1.6.1=h166bdaf_0 31 | - kiwisolver=1.4.4=py310h6a678d5_0 32 | - krb5=1.19.3=h3790be6_0 33 | - lcms2=2.12=h3be6417_0 34 | - ld_impl_linux-64=2.38=h1181459_1 35 | - lerc=3.0=h295c915_0 36 | - libblas=3.9.0=15_linux64_openblas 37 | - libbrotlicommon=1.0.9=h166bdaf_7 38 | - libbrotlidec=1.0.9=h166bdaf_7 39 | - libbrotlienc=1.0.9=h166bdaf_7 40 | - libcblas=3.9.0=15_linux64_openblas 41 | - libclang=10.0.1=default_hb85057a_2 42 | - libdeflate=1.17=h5eee18b_0 43 | - libedit=3.1.20191231=he28a2e2_2 44 | - libevent=2.1.12=h8f2d780_0 45 | - libffi=3.3=he6710b0_2 46 | - libgcc-ng=11.2.0=h1234567_1 47 | - libgfortran-ng=12.2.0=h69a702a_19 48 | - libgfortran5=12.2.0=h337968e_19 49 | - libgomp=11.2.0=h1234567_1 50 | - liblapack=3.9.0=15_linux64_openblas 51 | - libllvm10=10.0.1=he513fc3_3 52 | - libopenblas=0.3.20=pthreads_h78a6416_0 53 | - libpng=1.6.39=h5eee18b_0 54 | - libpq=12.9=h16c4e8d_3 55 | - libprotobuf=3.20.3=he621ea3_0 56 | - libstdcxx-ng=11.2.0=h1234567_1 57 | - libtiff=4.5.0=h6a678d5_2 58 | - libuuid=1.41.5=h5eee18b_0 59 | - libwebp=1.2.4=h11a3e52_1 60 | - libwebp-base=1.2.4=h5eee18b_1 61 | - libxcb=1.15=h7f8727e_0 62 | - libxkbcommon=1.0.1=hfa300c1_0 63 | - libxml2=2.9.14=h74e7548_0 64 | - libxslt=1.1.35=h4e12654_0 65 | - lz4-c=1.9.3=h9c3ff4c_1 66 | - matplotlib=3.7.0=py310h06a4308_0 67 | - matplotlib-base=3.7.0=py310h1128e8f_0 68 | - munkres=1.1.4=pyh9f0ad1d_0 69 | - ncurses=6.4=h6a678d5_0 70 | - nspr=4.33=h295c915_0 71 | - nss=3.74=h0370c37_0 72 | - openssl=1.1.1s=h7f8727e_0 73 | - packaging=23.0=pyhd8ed1ab_0 74 | - pcre=8.45=h9c3ff4c_0 75 | - pillow=9.4.0=py310h6a678d5_0 76 | - pip=23.0.1=py310h06a4308_0 77 | - ply=3.11=py_1 78 | - protobuf=3.20.3=py310h6a678d5_0 79 | - pyparsing=3.0.9=pyhd8ed1ab_0 80 | - pyqt=5.15.7=py310h6a678d5_1 81 | - python=3.10.0=h12debd9_5 82 | - 
python-dateutil=2.8.2=pyhd8ed1ab_0 83 | - python-dotenv=1.0.0=pyhd8ed1ab_0 84 | - python_abi=3.10=2_cp310 85 | - qt-main=5.15.2=h327a75a_7 86 | - qt-webengine=5.15.9=hd2b0992_4 87 | - qtwebkit=5.212=h4eab89a_4 88 | - readline=8.2=h5eee18b_0 89 | - setuptools=65.6.3=py310h06a4308_0 90 | - sip=6.6.2=py310h6a678d5_0 91 | - six=1.16.0=pyh6c4a22f_0 92 | - sqlite=3.40.1=h5082296_0 93 | - tk=8.6.12=h1ccaba5_0 94 | - toml=0.10.2=pyhd8ed1ab_0 95 | - tornado=6.1=py310h5764c6d_3 96 | - tzdata=2022g=h04d1e81_0 97 | - wheel=0.38.4=py310h06a4308_0 98 | - xz=5.2.10=h5eee18b_1 99 | - zlib=1.2.13=h5eee18b_0 100 | - zstd=1.5.2=ha4553b6_0 101 | - pip: 102 | - edlib==1.3.9 103 | - numpy==1.24.2 104 | - pyqt5-sip==12.11.0 105 | - scipy==1.10.1 106 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/final/runexp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # remember to run `conda activate aligner-evaluation` 3 | set -e 4 | set -o pipefail 5 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 6 | cd $thisfolder 7 | 8 | # executables' absolute paths/commands (make sure they work!) 9 | BADREAD=$thisfolder/../../../tools/Badread/badread-runner.py 10 | GRAPHCHAINER=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 11 | srfaligner=$thisfolder/../../../SRFAligner 12 | srfchainer=$thisfolder/../../../SRFChainer 13 | graphchainer=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 14 | minigraph=$thisfolder/../../../tools/minigraph/minigraph 15 | minichain=$thisfolder/../../../tools/minichain/minichain 16 | usrbintime=/usr/bin/time 17 | 18 | # params 19 | inputgraph=$thisfolder/../input/chr22_iEFG.gfa 20 | coverage=30 21 | threads=64 22 | 23 | # 0. 
setup 24 | mkdir output 25 | echo -n > output/runexp_log.txt 26 | set -a 27 | set +a 28 | 29 | # 1. simulate path and reads 30 | # comment the following 3 lines and uncomment the following ones to generate the reads (again) 31 | #ln -s $thisfolder/../semi-repeat-free/output/sim_reads.fastq output 32 | #ln -s $thisfolder/../semi-repeat-free/output/sim_reads_path.fasta output 33 | #ln -s $thisfolder/../semi-repeat-free/output/sim_reads_path.nodes output 34 | python3 ../scripts/generate_sim_reads.py \ 35 | --graph $inputgraph \ 36 | --fastq output/sim_reads.fastq \ 37 | --seed $(openssl enc -aes-256-ctr -pass pass:"semirepeatfree" -nosalt /dev/null | shuf -i0-4294967295 -n 1 --random-source=/dev/stdin) \ 38 | --path output/sim_reads_path.nodes \ 39 | --fasta output/sim_reads_path.fasta \ 40 | --coverage $coverage \ 41 | 2>> output/runexp_log.txt >> output/runexp_log.txt 42 | 43 | # 2. run the aligners 44 | $usrbintime $srfaligner \ 45 | -t $threads \ 46 | -g $inputgraph \ 47 | -c \ 48 | -f output/sim_reads.fastq \ 49 | -a output/srfaligner_alignments.gaf \ 50 | 2>> output/runexp_log.txt >> output/runexp_log.txt 51 | 52 | $usrbintime $srfchainer \ 53 | -t $threads \ 54 | -g $inputgraph \ 55 | -c \ 56 | -f output/sim_reads.fastq \ 57 | -a output/srfchainer_alignments.gaf \ 58 | 2>> output/runexp_log.txt >> output/runexp_log.txt 59 | 60 | cat output/sim_reads.fastq | cut -d' ' -f1 > output/sim_reads_fixed_header.fastq 61 | $usrbintime $graphchainer \ 62 | -t $threads \ 63 | -g $inputgraph \ 64 | -f output/sim_reads_fixed_header.fastq \ 65 | -a output/graphchainer_alignments.gaf \ 66 | 2>> output/runexp_log.txt >> output/runexp_log.txt 67 | 68 | $usrbintime $minigraph \ 69 | -t $threads \ 70 | -c \ 71 | -x lr \ 72 | $inputgraph \ 73 | output/sim_reads.fastq \ 74 | -o output/minigraph_alignments.gaf \ 75 | 2>> output/runexp_log.txt >> output/runexp_log.txt 76 | 77 | $usrbintime $minichain \ 78 | -t $threads \ 79 | -c $inputgraph \ 80 | output/sim_reads.fastq \ 81 | -o 
output/minichain_alignments.gaf \ 82 | 2>> output/runexp_log.txt >> output/runexp_log.txt 83 | 84 | # 3. pick first alignment 85 | for alignment in "srfaligner_alignments.gaf" "srfchainer_alignments.gaf" "graphchainer_alignments.gaf" "minigraph_alignments.gaf" "minichain_alignments.gaf" 86 | do 87 | awk '{if (found[$1] == "1") \ 88 | {} \ 89 | else 90 | {found[$1]="1"; print}}' \ 91 | output/$alignment > output/best_$alignment 92 | done 93 | 94 | # 4. validate and plot results 95 | for alignment in best_srfaligner best_srfchainer best_graphchainer minigraph minichain 96 | do 97 | python3 ../scripts/compute_summary.py \ 98 | -t 3 \ 99 | --graph $inputgraph \ 100 | --fastq output/sim_reads.fastq \ 101 | --path output/sim_reads_path.nodes \ 102 | --fasta output/sim_reads_path.fasta \ 103 | --alignments output/${alignment}_alignments.gaf \ 104 | --metrics output/metrics_${alignment}.mts 105 | done 106 | wait $(jobs -p) 107 | 108 | python3 ../scripts/compute_metrics.py \ 109 | --output-name output/results \ 110 | --summaries output/*.mts \ 111 | --summaries-names $(basename -s ".mts" -a output/*.mts | sed 's/metrics_//') 112 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/input/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algbio/SRFAligner/c698b10ce1b3f5ec20ef5f218f6d0259eebf7911/experiments/aligner-evaluation/input/.gitkeep -------------------------------------------------------------------------------- /experiments/aligner-evaluation/input/covid19_100_acc.txt: -------------------------------------------------------------------------------- 1 | MT370921 2 | MT358688 3 | NC_045512 4 | MT370999 5 | MT251977 6 | MT345836 7 | MT263424 8 | MT334572 9 | MT246477 10 | MN997409 11 | MT370973 12 | MT344945 13 | MT345880 14 | MT246478 15 | MT114416 16 | MT345801 17 | MT328033 18 | LC542809 19 | MT293218 20 | MT370906 21 | MT259248 22 
| MT334531 23 | MT350263 24 | MT371017 25 | MT350263 26 | MT322407 27 | LC528232 28 | MT246459 29 | MT291832 30 | MT263406 31 | MT358732 32 | MT293207 33 | MT326065 34 | MT326104 35 | MT325580 36 | MT350263 37 | MT345841 38 | MT325571 39 | MN996527 40 | MT370949 41 | MT358734 42 | MT192759 43 | MT370879 44 | MT259228 45 | MT370872 46 | MT358401 47 | MT291835 48 | MT370948 49 | MT358675 50 | MT358680 51 | NC_045512 52 | MT322399 53 | MT326174 54 | MT039887 55 | MT259236 56 | MT246461 57 | MT370886 58 | MT370880 59 | MT334565 60 | MT114419 61 | MT370943 62 | MT358648 63 | MT344959 64 | MT344961 65 | MT370899 66 | MT358652 67 | MT322419 68 | MT325580 69 | MT322404 70 | MT370988 71 | MT259277 72 | MT370911 73 | MT370862 74 | MT370983 75 | MT263423 76 | MT322400 77 | MT370990 78 | MT345803 79 | MT334536 80 | MT263437 81 | MT334573 82 | MT358716 83 | MT344955 84 | MN938384 85 | MT292575 86 | MT358637 87 | MT334557 88 | MT345870 89 | MT350249 90 | MT263391 91 | MT358656 92 | MT350263 93 | MT350236 94 | MT370889 95 | MT308696 96 | MT256924 97 | MT370846 98 | MN988713 99 | MT350237 100 | MT322415 101 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/mems/runexp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # remember to run `conda activate aligner-evaluation` 3 | set -e 4 | set -o pipefail 5 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 6 | cd $thisfolder 7 | 8 | # executables' absolute paths/commands (make sure they work!) 
9 | BADREAD=$thisfolder/../../../tools/Badread/badread-runner.py 10 | GRAPHCHAINER=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 11 | graphaligner=$thisfolder/../../../tools/GraphAligner/bin/GraphAligner 12 | srfaligner=$thisfolder/../../../SRFAligner 13 | efgmemsaligner=$thisfolder/../../../efg-memsAligner 14 | usrbintime=/usr/bin/time 15 | 16 | # params 17 | inputgraph=$thisfolder/../input/covid19_100_iEFG_simplified.gfa 18 | coverage=1000 19 | threads=1 20 | 21 | # 0. setup 22 | mkdir output 23 | echo -n > output/runexp_log.txt 24 | set -a 25 | set +a 26 | 27 | # 1. simulate path and reads 28 | python3 ../scripts/generate_sim_reads.py \ 29 | --graph $inputgraph \ 30 | --fastq output/sim_reads.fastq \ 31 | --seed $(openssl enc -aes-256-ctr -pass pass:"semirepeatfree" -nosalt /dev/null | shuf -i0-4294967295 -n 1 --random-source=/dev/stdin) \ 32 | --path output/sim_reads_path.nodes \ 33 | --fasta output/sim_reads_path.fasta \ 34 | --coverage $coverage \ 35 | 2>> output/runexp_log.txt >> output/runexp_log.txt 36 | 37 | # 2. run the aligners 38 | $usrbintime $srfaligner \ 39 | -t $threads \ 40 | -g $inputgraph \ 41 | -i "N" \ 42 | -f output/sim_reads.fastq \ 43 | -a output/semi_repeat_free_alignments.gaf \ 44 | 2>> output/runexp_log.txt >> output/runexp_log.txt 45 | 46 | $usrbintime $graphaligner \ 47 | -t $threads \ 48 | -x "vg" \ 49 | -g $inputgraph \ 50 | -f output/sim_reads.fastq \ 51 | -a output/graphaligner_alignments.gaf \ 52 | 2>> output/runexp_log.txt >> output/runexp_log.txt 53 | 54 | $usrbintime $efgmemsaligner \ 55 | -g $inputgraph \ 56 | -f output/sim_reads.fastq \ 57 | -a output/efg_mems_alignments.gaf \ 58 | -w output \ 59 | 2>> output/runexp_log.txt >> output/runexp_log.txt 60 | 61 | # 3. 
pick first alignment 62 | for alignment in "semi_repeat_free_alignments.gaf" "graphaligner_alignments.gaf" "efg_mems_alignments.gaf" 63 | do 64 | awk '{if (found[$1] == "1") \ 65 | {} \ 66 | else 67 | {found[$1]="1"; print}}' \ 68 | output/$alignment > output/best_$alignment 69 | done 70 | 71 | # 4. validate and plot results 72 | for alignment in semi_repeat_free graphaligner efg_mems 73 | do 74 | python3 ../scripts/compute_summary.py \ 75 | -t 3 \ 76 | --graph $inputgraph \ 77 | --fastq output/sim_reads.fastq \ 78 | --path output/sim_reads_path.nodes \ 79 | --fasta output/sim_reads_path.fasta \ 80 | --alignments output/best_${alignment}_alignments.gaf \ 81 | --metrics output/metrics_${alignment}.mts 82 | done 83 | wait $(jobs -p) 84 | 85 | python3 ../scripts/compute_metrics.py \ 86 | --output-name output/results \ 87 | --summaries output/*.mts \ 88 | --summaries-names $(basename -s ".mts" -a output/*.mts | sed 's/metrics_//') 89 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/scripts/.env: -------------------------------------------------------------------------------- 1 | BADREAD=../../../tools/Badread/badread-runner.py 2 | GRAPHCHAINER=../../../tools/GraphChainer/bin/GraphChainer 3 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/scripts/compute_metrics.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser, RawTextHelpFormatter 2 | from numpy import linspace 3 | from matplotlib.pyplot import subplots, savefig 4 | 5 | sigma_range = list(linspace(0, 1, 1001)) 6 | 7 | 8 | def populate_metric_dicts(lines, col_idx, accuracy_dict, length_dict, distance=True): 9 | 10 | for sigma in sigma_range: 11 | accuracy_dict[sigma] = 0 12 | length_dict[sigma] = 0 13 | 14 | total_reads = 0 15 | total_length = 0 16 | 17 | for i, line in enumerate(lines): 18 | if i == 0: 19 | 
continue 20 | values = line.strip().split(',') 21 | edit_distance = float(values[col_idx]) 22 | read_len = float(values[-1]) 23 | truth_len = float(values[-2]) 24 | 25 | used_len = read_len if col_idx == 2 else truth_len 26 | total_reads += 1 27 | total_length += read_len 28 | for sigma in sigma_range: 29 | if used_len > 0: 30 | if distance: 31 | if edit_distance / used_len <= sigma: 32 | accuracy_dict[sigma] += 1 33 | length_dict[sigma] += read_len 34 | else: 35 | if edit_distance / used_len >= sigma: 36 | accuracy_dict[sigma] += 1 37 | length_dict[sigma] += read_len 38 | 39 | # converting counts to percentages 40 | for sigma in sigma_range: 41 | accuracy_dict[sigma] = float(accuracy_dict[sigma]) / total_reads * 100 42 | length_dict[sigma] = float(length_dict[sigma]) / total_length * 100 43 | 44 | 45 | def compute_metrics(args): 46 | 47 | # Metrics 48 | metrics = dict() 49 | for aligner, csv in zip(args.summaries_names, args.summaries): 50 | 51 | metrics[aligner] = { 52 | 'overlap': { 53 | 'accuracy': dict(), 54 | 'length': dict() 55 | }, 56 | 'truth': { 57 | 'accuracy': dict(), 58 | 'length': dict() 59 | }, 60 | 'read': { 61 | 'accuracy': dict(), 62 | 'length': dict() 63 | } 64 | } 65 | 66 | lines = open(csv).read().split('\n')[:-1] 67 | populate_metric_dicts( 68 | lines, 4, metrics[aligner]['overlap']['accuracy'], metrics[aligner]['overlap']['length'], distance=False 69 | ) 70 | populate_metric_dicts(lines, 3, metrics[aligner]['truth']['accuracy'], metrics[aligner]['truth']['length']) 71 | populate_metric_dicts(lines, 2, metrics[aligner]['read']['accuracy'], metrics[aligner]['read']['length']) 72 | 73 | # Values for tables (Hardcoded for now) 74 | first_delta = 0.1 75 | second_delta = 0.9500000000000001 76 | first_sigma = 0.1 77 | 78 | # Tables accuracy 79 | 80 | # Compute lines for each aligner 81 | aligners_lines_overlap_accuracy = "" 82 | aligners_lines_truth_read_accuracy = "" 83 | for aligner in args.summaries_names: 84 | aligners_lines_overlap_accuracy += 
f" & {aligner} & {round(metrics[aligner]['overlap']['accuracy'][first_delta], 2)}\\% & {round(metrics[aligner]['overlap']['accuracy'][second_delta], 2)}\\% \\\\\n" 85 | aligners_lines_truth_read_accuracy += f" & {aligner} & {round(metrics[aligner]['truth']['accuracy'][first_sigma], 2)}\\% & {round(metrics[aligner]['read']['accuracy'][first_sigma], 2)}\\% \\\\\n" 86 | 87 | tables_accuracy = f""" 88 | \\begin{{tabular}}{{|c | l | l | l |}} 89 | \\hline 90 | Dataset & \\multicolumn{{1}}{{c|}}{{Aligner}} & \\multicolumn{{2}}{{c|}}{{Correctly aligned}} \\\\ 91 | \\cline{{3-4}} & & \\multicolumn{{1}}{{c|}}{{$\\delta = {round(first_delta, 2)}$}} & \\multicolumn{{1}}{{c|}}{{$\\delta = {round(second_delta, 2)}$}} \\\\ 92 | \\hline\\hline 93 | \\multirow{{{len(args.summaries_names)}}}{{*}}{{{args.output_name}}}{aligners_lines_overlap_accuracy}\\hline 94 | \\end{{tabular}} 95 | 96 | \\begin{{tabular}}{{|c | l | l | l |}} 97 | \\hline 98 | Dataset & \\multicolumn{{1}}{{c|}}{{Aligner}} & \\multicolumn{{2}}{{c|}}{{Correctly aligned}} \\\\ 99 | \\cline{{3-4}} & & \\multicolumn{{1}}{{c|}}{{$\\sigma_{{truth}} = {round(first_sigma, 2)}$}} & \\multicolumn{{1}}{{c|}}{{$\\sigma_{{read}} = {round(first_sigma, 2)}$}} \\\\ 100 | \\hline\\hline 101 | \\multirow{{{len(args.summaries_names)}}}{{*}}{{{args.output_name}}}{aligners_lines_truth_read_accuracy}\\hline 102 | \\end{{tabular}} 103 | """ 104 | 105 | tex_file = open(f'{args.output_name}_accuracy.tex', 'w') 106 | tex_file.write(tables_accuracy) 107 | tex_file.close() 108 | 109 | # Plot accuracy 110 | fig, (overlap, truth, read) = subplots(3, 1) 111 | 112 | overlap.set_xlabel('Overlap criterion ($\delta$)') 113 | overlap.set_ylabel('% Correctly aligned') 114 | truth.set_xlabel('Edit distance criterion ($\sigma_{truth}$)') 115 | truth.set_ylabel('% Correctly aligned') 116 | read.set_xlabel('Edit distance criterion ($\sigma_{read}$)') 117 | read.set_ylabel('% Correctly aligned') 118 | 119 | for aligner in args.summaries_names: 120 | 
overlap.plot(sigma_range, metrics[aligner]['overlap']['accuracy'].values(), label=aligner) 121 | truth.plot(sigma_range, metrics[aligner]['truth']['accuracy'].values(), label=aligner) 122 | read.plot(sigma_range, metrics[aligner]['read']['accuracy'].values(), label=aligner) 123 | 124 | overlap.grid() 125 | overlap.legend() 126 | truth.grid() 127 | truth.legend() 128 | read.grid() 129 | read.legend() 130 | 131 | 132 | fig.set_figwidth(3.87) 133 | fig.set_figheight(11.5) 134 | fig.tight_layout(pad=1.15) 135 | 136 | savefig(f'{args.output_name}_accuracy.pdf') 137 | 138 | 139 | # Tables length 140 | 141 | # Compute lines for each aligner 142 | aligners_lines_overlap_length = "" 143 | aligners_lines_truth_read_length = "" 144 | for aligner in args.summaries_names: 145 | aligners_lines_overlap_length += f" & {aligner} & {round(metrics[aligner]['overlap']['length'][first_delta], 2)}\\% & {round(metrics[aligner]['overlap']['length'][second_delta], 2)}\\% \\\\\n" 146 | aligners_lines_truth_read_length += f" & {aligner} & {round(metrics[aligner]['truth']['length'][first_sigma], 2)}\\% & {round(metrics[aligner]['read']['length'][first_sigma], 2)}\\% \\\\\n" 147 | 148 | tables_length = f""" 149 | \\begin{{tabular}}{{|c | l | l | l |}} 150 | \\hline 151 | Dataset & \\multicolumn{{1}}{{c|}}{{Aligner}} & \\multicolumn{{2}}{{c|}}{{Good length}} \\\\ 152 | \\cline{{3-4}} & & \\multicolumn{{1}}{{c|}}{{$\\delta = {round(first_delta, 2)}$}} & \\multicolumn{{1}}{{c|}}{{$\\delta = {round(second_delta, 2)}$}} \\\\ 153 | \\hline\\hline 154 | \\multirow{{{len(args.summaries_names)}}}{{*}}{{{args.output_name}}}{aligners_lines_overlap_length}\\hline 155 | \\end{{tabular}} 156 | 157 | \\begin{{tabular}}{{|c | l | l | l |}} 158 | \\hline 159 | Dataset & \\multicolumn{{1}}{{c|}}{{Aligner}} & \\multicolumn{{2}}{{c|}}{{Good length}} \\\\ 160 | \\cline{{3-4}} & & \\multicolumn{{1}}{{c|}}{{$\\sigma_{{truth}} = {round(first_sigma, 2)}$}} & \\multicolumn{{1}}{{c|}}{{$\\sigma_{{read}} = 
{round(first_sigma, 2)}$}} \\\\ 161 | \\hline\\hline 162 | \\multirow{{{len(args.summaries_names)}}}{{*}}{{{args.output_name}}}{aligners_lines_truth_read_length}\\hline 163 | \\end{{tabular}} 164 | """ 165 | 166 | tex_file = open(f'{args.output_name}_length.tex', 'w') 167 | tex_file.write(tables_length) 168 | tex_file.close() 169 | 170 | 171 | 172 | # Plot length 173 | fig, (overlap, truth, read) = subplots(3, 1) 174 | 175 | overlap.set_xlabel('Overlap criterion ($\delta$)') 176 | overlap.set_ylabel('% Good length') 177 | truth.set_xlabel('Edit distance criterion ($\sigma_{truth}$)') 178 | truth.set_ylabel('% Good length') 179 | read.set_xlabel('Edit distance criterion ($\sigma_{read}$)') 180 | read.set_ylabel('% Good length') 181 | 182 | for aligner in args.summaries_names: 183 | overlap.plot(sigma_range, metrics[aligner]['overlap']['length'].values(), label=aligner) 184 | truth.plot(sigma_range, metrics[aligner]['truth']['length'].values(), label=aligner) 185 | read.plot(sigma_range, metrics[aligner]['read']['length'].values(), label=aligner) 186 | 187 | overlap.grid() 188 | overlap.legend() 189 | truth.grid() 190 | truth.legend() 191 | read.grid() 192 | read.legend() 193 | 194 | fig.set_figwidth(3.87) 195 | fig.set_figheight(11.5) 196 | fig.tight_layout(pad=1.15) 197 | 198 | savefig(f'{args.output_name}_length.pdf') 199 | 200 | 201 | if __name__ == '__main__': 202 | 203 | parser = ArgumentParser( 204 | description=''' 205 | Computes distance and overlap metrics based on summary files. 206 | Outputs plots and tables. 
207 | ''', 208 | formatter_class=RawTextHelpFormatter 209 | ) 210 | 211 | requiredNamed = parser.add_argument_group('required arguments') 212 | requiredNamed.add_argument('-n', '--output-name', type=str, help='Output base name', required=True) 213 | requiredNamed.add_argument( 214 | '-s', '--summaries', type=str, 215 | help='Input summary (csv( files used to compute metrics, one per aligner/configuration', 216 | required=True, nargs='+' 217 | ) 218 | requiredNamed.add_argument( 219 | '-sn', '--summaries-names', type=str, help='Name of aligners/configurations, used in plot/table legends', 220 | required=True, nargs='+' 221 | ) 222 | 223 | compute_metrics(parser.parse_args()) 224 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/scripts/compute_summary.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser, RawTextHelpFormatter 2 | from gzip import GzipFile 3 | from google.protobuf.internal.decoder import _DecodeVarint32 4 | from vg_pb2 import Alignment 5 | from numpy import cumsum 6 | from joblib import Parallel, delayed 7 | from edlib import align 8 | from bisect import bisect 9 | from re import split 10 | 11 | 12 | def load_graph(gfa_graph): 13 | 14 | vertex_labels, edges = dict(), dict() 15 | 16 | for line in open(gfa_graph).read().split('\n')[:-1]: 17 | if line[0] == 'S': 18 | str_id, label = line[1:].strip().split() 19 | vertex_labels[str_id] = label 20 | if line[0] == 'L': 21 | tail_str_id, _, head_str_id, _, _ = line[1:].strip().split() 22 | tail_id, head_id = tail_str_id, head_str_id 23 | if tail_id not in edges: 24 | edges[tail_id] = list() 25 | edges[tail_id].append(head_id) 26 | 27 | return vertex_labels, edges 28 | 29 | 30 | def get_read_info(read_label, read_header): 31 | 32 | s, t = map(int, read_header.split()[1].split(',')[-1].split('-')) 33 | is_reverse_comp = '-strand' in read_header.split()[1].split(',') 34 | 35 | 
return read_label, is_reverse_comp, s, t 36 | 37 | 38 | def load_reads_and_ref(fastq, fasta, path): 39 | 40 | fastq_lines = open(fastq).read().split('\n')[:-1] 41 | 42 | ref = open(fasta).readlines()[-1].strip() if fasta else '' 43 | ref_rev_comp = ''.join({'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}[b] for b in ref[::-1]) 44 | path = [s.strip().split()[1] for s in open(path).readlines()] if path else list() 45 | 46 | if path: 47 | reads = { 48 | read_header.strip().split()[0][1:]: get_read_info(read_label.strip(), read_header.strip()) 49 | for read_header, read_label in zip(fastq_lines[::4], fastq_lines[1::4]) 50 | } 51 | else: 52 | reads = { 53 | read_header.strip().split()[0][1:]: (read_label.strip(), False, -1, -1) 54 | for read_header, read_label in zip(fastq_lines[::4], fastq_lines[1::4]) 55 | } 56 | 57 | return reads, ref, ref_rev_comp, path 58 | 59 | 60 | def read_gam(gam_filename): 61 | 62 | with open(gam_filename, 'rb') as f: 63 | buf = GzipFile(fileobj=f).read() 64 | n = 0 65 | while n < len(buf): 66 | an, n = _DecodeVarint32(buf, n) 67 | for i in range(an): 68 | msg_len, n = _DecodeVarint32(buf, n) 69 | msg_buf = buf[n:n + msg_len] 70 | n += msg_len 71 | aln = Alignment() 72 | aln.ParseFromString(msg_buf) 73 | yield aln 74 | 75 | 76 | def parse_gam(raw_gam, vertex_labels): 77 | 78 | name = raw_gam.name.split()[0] 79 | rev_cnt = 0 80 | path = list() 81 | idx, n = 0, len(raw_gam.path.mapping) 82 | first_node_off, last_node_off = 0, 0 83 | 84 | seqs = list() 85 | 86 | for x in raw_gam.path.mapping: 87 | 88 | node_name = x.position.name 89 | if node_name =='': 90 | node_name = str(x.position.node_id) 91 | ll = vertex_labels[node_name] 92 | original_len = len(ll) 93 | 94 | if x.position.is_reverse: 95 | rev_cnt += 1 96 | if idx == 0: 97 | if rev_cnt > 0: 98 | first_node_off = original_len - x.position.offset 99 | else: 100 | first_node_off = x.position.offset 101 | 102 | if idx == n - 1: 103 | suma = sum(i.from_length for i in x.edit) 104 | if rev_cnt 
> 0: 105 | last_node_off = original_len - suma - (x.position.offset if idx == 0 else 0) 106 | else: 107 | last_node_off = suma + (x.position.offset if idx == 0 else 0) 108 | 109 | if idx == 0 and idx == n - 1: 110 | if rev_cnt > 0: 111 | ll = ll[last_node_off:first_node_off] 112 | else: 113 | ll = ll[first_node_off:last_node_off] 114 | elif idx == 0: 115 | if rev_cnt > 0: 116 | ll = ll[:first_node_off] 117 | else: 118 | ll = ll[first_node_off:] 119 | elif idx == n - 1: 120 | if rev_cnt > 0: 121 | ll = ll[last_node_off:] 122 | else: 123 | ll = ll[:last_node_off] 124 | 125 | if rev_cnt > 0: 126 | ll = ''.join({'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}[b] for b in ll[::-1]) 127 | 128 | path.append(node_name) 129 | seqs.append(ll) 130 | idx += 1 131 | 132 | seq = ''.join(seqs) 133 | 134 | return name, seq, len(path), rev_cnt, len(seq), path, first_node_off, last_node_off 135 | 136 | 137 | def load_gam(gam_filename, vertex_labels): 138 | 139 | ret = dict() 140 | 141 | for raw_gam in read_gam(gam_filename): 142 | a = parse_gam(raw_gam, vertex_labels) 143 | if a[0] not in ret: 144 | ret[a[0]] = a 145 | else: 146 | # take the longer one when multiple alignments 147 | if a[-4] > ret[a[0]][-4]: 148 | ret[a[0]] = a 149 | 150 | return ret 151 | 152 | 153 | def read_gaf(gaf_filename): 154 | 155 | for line in open(gaf_filename, 'r').read().split('\n')[:-1]: 156 | items = line.split('\t') 157 | raw_path = items[5] 158 | path = split('<|>', raw_path)[1:] 159 | rev = '<' in raw_path 160 | 161 | yield items[0], path, rev, int(items[7]), int(items[8]) 162 | 163 | 164 | def parse_gaf(raw_gaf, vertex_labels): 165 | 166 | id, path, rev, f_o, l_o = raw_gaf 167 | 168 | if rev: 169 | rev_cnt = len(path) 170 | first_node_off, last_node_off = len(vertex_labels[path[0]]) - f_o, len(vertex_labels[path[-1]]) - l_o 171 | else: 172 | rev_cnt = 0 173 | first_node_off, last_node_off = f_o, l_o 174 | 175 | 176 | seqs = list() 177 | n = len(path) 178 | 179 | for idx, node_id in 
enumerate(path): 180 | 181 | ll = vertex_labels[node_id] 182 | original_length = len(ll) 183 | 184 | if idx == 0 and idx == n - 1: 185 | if rev_cnt > 0: 186 | ll = ll[last_node_off:first_node_off] 187 | else: 188 | ll = ll[first_node_off:last_node_off] 189 | elif idx == 0: 190 | if rev_cnt > 0: 191 | ll = ll[:first_node_off] 192 | else: 193 | ll = ll[first_node_off:] 194 | elif idx == n - 1: 195 | if rev_cnt > 0: 196 | ll = ll[last_node_off:] 197 | else: 198 | ll = ll[:last_node_off] 199 | 200 | if rev_cnt > 0: 201 | ll = ''.join({'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}[b] for b in ll[::-1]) 202 | if idx < n - 1: 203 | last_node_off += original_length 204 | else: 205 | if idx < n - 1: 206 | last_node_off -= original_length 207 | 208 | seqs.append(ll) 209 | 210 | seq = ''.join(seqs) 211 | 212 | return id, seq, n, rev_cnt, len(seq), path, first_node_off, last_node_off 213 | 214 | 215 | def load_gaf(gaf_filename, vertex_labels): 216 | 217 | ret = dict() 218 | 219 | for raw_gaf in read_gaf(gaf_filename): 220 | a = parse_gaf(raw_gaf, vertex_labels) 221 | if a[0] not in ret: 222 | ret[a[0]] = a 223 | else: 224 | # take the longer one when multiple alignments 225 | if a[-4] > ret[a[0]][-4]: 226 | ret[a[0]] = a 227 | 228 | return ret 229 | 230 | 231 | def compute_overlap(read, node): 232 | return max(0, min(read[1], node[1]) - max(read[0], node[0])) 233 | 234 | 235 | def compute_summary(args): 236 | 237 | vertex_labels, edges = load_graph(args.graph) 238 | reads, ref_seq, ref_seq_rev_comp, ref_path = load_reads_and_ref(args.fastq, args.fasta, args.path) 239 | node_limits = list(cumsum([len(vertex_labels[v]) for v in ref_path])) 240 | 241 | for alignment_filename, metric in zip(args.alignments, args.metrics): 242 | if alignment_filename.endswith('.gam'): 243 | alignments = load_gam(alignment_filename, vertex_labels) 244 | elif alignment_filename.endswith('.gaf'): 245 | alignments = load_gaf(alignment_filename, vertex_labels) 246 | 247 | 248 | csv = open(metric, 
'w') 249 | csv.write('id,len_aln,ed_read,ed_true,overlap,len_truth,len_read\n') 250 | 251 | def compute_edit(read_id): 252 | 253 | read_label, reverse_read, s, t = reads[read_id] 254 | t = min(t, len(ref_seq)) 255 | true_seq = '' 256 | 257 | if s > 0 and t > 0: # SIMULATED READ 258 | 259 | if reverse_read: 260 | true_seq = ref_seq_rev_comp[s: t] 261 | s, t = len(ref_seq) - t, len(ref_seq) - s ## Transform to original coordinates 262 | else: 263 | true_seq = ref_seq[s: t] 264 | 265 | first_node_index, last_node_index = bisect(node_limits, s), bisect(node_limits, t - 1) 266 | read_path = ref_path[first_node_index:last_node_index + 1] 267 | read_path_intervals = [[0, len(vertex_labels[n])] for n in read_path] 268 | read_path_intervals[0][0] = s - (0 if first_node_index == 0 else node_limits[first_node_index - 1]) 269 | read_path_intervals[-1][-1] = t - (0 if last_node_index == 0 else node_limits[last_node_index - 1]) 270 | read_path_nodes = {read_path[i]: read_path_intervals[i] for i in range(len(read_path))} 271 | 272 | aln_seq = '' 273 | overlap = 0 274 | 275 | if read_id in alignments: 276 | 277 | a = alignments[read_id] 278 | aln_seq = a[1] 279 | 280 | if s > 0 and t > 0: 281 | 282 | first_node_off = a[-2] 283 | last_node_off = a[-1] 284 | reverse_alignment = a[3] > 0 285 | 286 | if reverse_alignment == reverse_read: # Only consider overlap if same direction paths 287 | 288 | alignment_path = a[5] 289 | alignment_path_intervals = [[0, len(vertex_labels[n])] for n in alignment_path] 290 | if reverse_alignment: 291 | alignment_path_intervals[0][-1] = first_node_off 292 | alignment_path_intervals[-1][0] = last_node_off 293 | else: 294 | alignment_path_intervals[0][0] = first_node_off 295 | alignment_path_intervals[-1][-1] = last_node_off 296 | 297 | 298 | alignment_path_nodes = {alignment_path[i]: alignment_path_intervals[i] for i in 299 | range(len(alignment_path))} 300 | overlap = sum(compute_overlap(read_path_nodes[x], alignment_path_nodes[x]) for x in a[5] if 301 
| x in read_path_nodes) 302 | 303 | row = list() 304 | row.append(read_id) 305 | row.append(len(aln_seq)) 306 | row.append(align(read_label, aln_seq, mode='NW')['editDistance']) 307 | row.append(align(true_seq, aln_seq, mode='NW')['editDistance']) 308 | row.append(overlap) 309 | row.append(len(true_seq)) 310 | row.append(len(read_label)) 311 | 312 | return row 313 | 314 | reads_ids = [id for id in reads] 315 | reads_n = len(reads_ids) 316 | block_size = 200 317 | n_blocks = reads_n // block_size + 1 318 | 319 | def compute_edit_kernel(tid): 320 | l, r = tid * reads_n // n_blocks, (tid + 1) * reads_n // n_blocks 321 | tmp = [] 322 | for i in range(l, r): 323 | tmp.append(compute_edit(reads_ids[i])) 324 | return tmp 325 | 326 | tmp_list = Parallel(n_jobs=args.threads, prefer="threads")( 327 | delayed(compute_edit_kernel)(t_idx) for t_idx in range(n_blocks)) 328 | processed_list = list() 329 | for c in tmp_list: 330 | processed_list += c 331 | 332 | for row in processed_list: 333 | csv.write(','.join(list(map(str, row))) + '\n') 334 | csv.close() 335 | 336 | 337 | if __name__ == '__main__': 338 | 339 | parser = ArgumentParser( 340 | description=''' 341 | Computes distances between alignments and input reads. 342 | If read are simulated it also computes overlaps and distances to truth. 
343 | ''', 344 | formatter_class=RawTextHelpFormatter 345 | ) 346 | 347 | requiredNamed = parser.add_argument_group('required arguments') 348 | requiredNamed.add_argument('-g', '--graph', type=str, help='Input gfa file', required=True) 349 | requiredNamed.add_argument('-fq', '--fastq', type=str, help='Input fastq file', required=True) 350 | requiredNamed.add_argument( 351 | '-als', '--alignments', type=str, help='Output gam/gaf files (with extension, each)', required=True, nargs='+' 352 | ) 353 | requiredNamed.add_argument( 354 | '-mts', '--metrics', type=str, help='Output csv files with metrics', required=True, nargs='+' 355 | ) 356 | 357 | parser.add_argument('-p', '--path', type=str, help='Output path file (node ids of selected path)') 358 | parser.add_argument('-fa', '--fasta', type=str, help='Output fasta of original path') 359 | parser.add_argument('-t', '--threads', type=int, help='Number of threads', default=30) 360 | 361 | compute_summary(parser.parse_args()) 362 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/scripts/generate_sim_reads.py: -------------------------------------------------------------------------------- 1 | from os import getenv 2 | from dotenv import load_dotenv 3 | from subprocess import run 4 | from argparse import ArgumentParser, RawTextHelpFormatter 5 | 6 | 7 | def generate_sim_reads(args): 8 | 9 | load_dotenv() 10 | badread = getenv('BADREAD') 11 | graphchainer = getenv('GRAPHCHAINER') 12 | 13 | run( 14 | f'{graphchainer} --generate-path --generate-path-seed {args.seed} -g {args.graph} -f {args.fasta} -a tmp.gam' 15 | .split() 16 | ) 17 | 18 | run(f'mv tmp.gam {args.path}'.split()) 19 | 20 | run( 21 | f'python {badread} simulate --identity 95,99,2.5 --seed {args.seed} --reference {args.fasta} ' 22 | f'--quantity {args.coverage}x --length 15000,13000 --error_model nanopore2023 --qscore_model nanopore2023 --junk_reads 0 --random_reads 0 ' 23 | f'--chimeras 0'.split(), 
24 | stdout=open(args.fastq, 'wb') 25 | ) 26 | 27 | 28 | if __name__ == '__main__': 29 | 30 | parser = ArgumentParser( 31 | description=''' 32 | Generates simulated reads from a random path of an input vg/gfa file using the Badread simulator. 33 | Badread parameters are fixed to --identity 95,99,2.5 --length 15000,13000 --error_model nanopore2023 34 | --qscore_model nanopore2023 --junk_reads 0 --random_reads 0 --chimeras 0. 35 | ''', 36 | formatter_class=RawTextHelpFormatter 37 | ) 38 | 39 | requiredNamed = parser.add_argument_group('required arguments') 40 | requiredNamed.add_argument('-g', '--graph', type=str, help='Input vg/gfa file', required=True) 41 | requiredNamed.add_argument('-fq', '--fastq', type=str, help='Output fastq file', required=True) 42 | 43 | parser.add_argument('-s', '--seed', type=int, help='Seed for random path generator and Badread', default=0) 44 | parser.add_argument( 45 | '-p', '--path', type=str, help='Output path file (node ids of selected path)', default='tmp.path' 46 | ) 47 | parser.add_argument('-fa', '--fasta', type=str, help='Output fasta of original path', default='tmp.fasta') 48 | parser.add_argument('-c', '--coverage', type=int, help='Coverage value given to Badread', default=15) 49 | 50 | generate_sim_reads(parser.parse_args()) 51 | 52 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/scripts/run_experiment.py: -------------------------------------------------------------------------------- 1 | from os import getenv 2 | from dotenv import load_dotenv 3 | from subprocess import run 4 | from argparse import ArgumentParser, RawTextHelpFormatter 5 | 6 | 7 | def run_experiment(args): 8 | load_dotenv() 9 | 10 | srfaligner = getenv('SRFALIGNER') 11 | run( 12 | f'{srfaligner} -t {args.threads} -f {args.fastq} -g {args.graph} ' 13 | f'-a {args.alignment}_srfaligner.gaf -w output'.split() 14 | ) 15 | 16 | # graphaligner = getenv('GRAPHALIGNER') 17 | # run( 18 | # 
f'{graphaligner} -t {args.threads} -x vg -f {args.fastq} -g {args.graph} --verbose ' 19 | # f'-a {args.alignment}_graphaligner.gaf'.split() 20 | # ) 21 | 22 | # minigraph = getenv('MINIGRAPH') 23 | # run( 24 | # f'{minigraph} -t {args.threads} -c {args.graph} {args.fastq}'.split(), 25 | # stdout=open(f'{args.alignment}_minigraph.gaf', 'wb') 26 | # ) 27 | 28 | # srfchainer = getenv('SRFCHAINER') 29 | # run( 30 | # f'{srfchainer} -t {args.threads} -f {args.fastq} -g {args.graph} ' 31 | # f'-a {args.alignment}_srfchainer.gaf -w output'.split() 32 | # ) 33 | 34 | # graphchainer = getenv('GRAPHCHAINER') 35 | # run( 36 | # f'{graphchainer} -t {args.threads} -f {args.fastq} -g {args.graph} ' 37 | # f'-a {args.alignment}_graphchainer.gam '.split() 38 | # ) 39 | 40 | # minichain = getenv('MINICHAIN') 41 | # run( 42 | # f'{minichain} -t {args.threads} -c {args.graph} {args.fastq}'.split(), 43 | # stdout=open(f'{args.alignment}_minichain.gaf', 'wb') 44 | # ) 45 | 46 | 47 | if __name__ == '__main__': 48 | 49 | parser = ArgumentParser( 50 | description=''' 51 | Run aligners GraphAligner, minigraph, and srfaligner on the vg(?)/gfa and fastq files specified. 
52 | ''', 53 | formatter_class=RawTextHelpFormatter 54 | ) 55 | 56 | requiredNamed = parser.add_argument_group('required arguments') 57 | requiredNamed.add_argument('-g', '--graph', type=str, help='Input vg/gfa file', required=True) 58 | requiredNamed.add_argument('-fq', '--fastq', type=str, help='Input fastq file', required=True) 59 | requiredNamed.add_argument( 60 | '-a', '--alignment', type=str, help='Output gam/gaf files (without extension)', required=True 61 | ) 62 | 63 | parser.add_argument('-t', '--threads', type=int, help='Number of threads', default=30) 64 | 65 | run_experiment(parser.parse_args()) 66 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/semi-repeat-free/runexp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # remember to run `conda activate aligner-evaluation` 3 | set -e 4 | set -o pipefail 5 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 6 | cd $thisfolder 7 | 8 | # executable's absolute paths/commands (make sure they work!) 9 | BADREAD=$thisfolder/../../../tools/Badread/badread-runner.py 10 | GRAPHCHAINER=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 11 | graphaligner=$thisfolder/../../../tools/GraphAligner/bin/GraphAligner 12 | srfaligner=$thisfolder/../../../SRFAligner 13 | usrbintime=/usr/bin/time 14 | 15 | # params 16 | inputgraph=$thisfolder/../input/chr22_iEFG.gfa 17 | coverage=30 18 | threads=64 19 | 20 | # 0. setup 21 | mkdir output 22 | echo -n > output/runexp_log.txt 23 | set -a 24 | set +a 25 | 26 | # 1. 
simulate path and reads 27 | python3 ../scripts/generate_sim_reads.py \ 28 | --graph $inputgraph \ 29 | --fastq output/sim_reads.fastq \ 30 | --seed $(openssl enc -aes-256-ctr -pass pass:"semirepeatfree" -nosalt /dev/null | shuf -i0-4294967295 -n 1 --random-source=/dev/stdin) \ 31 | --path output/sim_reads_path.nodes \ 32 | --fasta output/sim_reads_path.fasta \ 33 | --coverage $coverage \ 34 | 2>> output/runexp_log.txt >> output/runexp_log.txt 35 | 36 | # 2. run the aligners 37 | $usrbintime $srfaligner \ 38 | -t $threads \ 39 | -g $inputgraph \ 40 | -f output/sim_reads.fastq \ 41 | -a output/semi_repeat_free_alignments.gaf \ 42 | 2>> output/runexp_log.txt >> output/runexp_log.txt 43 | 44 | $usrbintime $graphaligner \ 45 | -t $threads \ 46 | -x "vg" \ 47 | -g $inputgraph \ 48 | -f output/sim_reads.fastq \ 49 | -a output/graphaligner_alignments.gaf \ 50 | 2>> output/runexp_log.txt >> output/runexp_log.txt 51 | 52 | for o in 1 5 10 50 100 53 | do 54 | $usrbintime $srfaligner \ 55 | -t $threads \ 56 | -g $inputgraph \ 57 | -o $o \ 58 | -f output/sim_reads.fastq \ 59 | -a output/srf_edge_longest_${o}_alignments.gaf \ 60 | 2>> output/runexp_log.txt >> output/runexp_log.txt 61 | done 62 | 63 | # 3. pick first alignment 64 | for alignment in "semi_repeat_free_alignments.gaf" "graphaligner_alignments.gaf" 65 | do 66 | awk '{if (found[$1] == "1") \ 67 | {} \ 68 | else 69 | {found[$1]="1"; print}}' \ 70 | output/$alignment > output/best_$alignment 71 | done 72 | for o in 1 5 10 50 100 73 | do 74 | awk '{if (found[$1] == "1") \ 75 | {} \ 76 | else 77 | {found[$1]="1"; print}}' \ 78 | output/srf_edge_longest_${o}_alignments.gaf > output/best_srf_edge_longest_${o}_alignments.gaf 79 | done 80 | 81 | # 4. 
validate and plot results 82 | for alignment in semi_repeat_free graphaligner 83 | do 84 | python3 ../scripts/compute_summary.py \ 85 | -t 3 \ 86 | --graph $inputgraph \ 87 | --fastq output/sim_reads.fastq \ 88 | --path output/sim_reads_path.nodes \ 89 | --fasta output/sim_reads_path.fasta \ 90 | --alignments output/best_${alignment}_alignments.gaf \ 91 | --metrics output/metrics_${alignment}.mts 92 | done 93 | for o in 1 5 10 50 100 94 | do 95 | python3 ../scripts/compute_summary.py \ 96 | -t 3 \ 97 | --graph $inputgraph \ 98 | --fastq output/sim_reads.fastq \ 99 | --path output/sim_reads_path.nodes \ 100 | --fasta output/sim_reads_path.fasta \ 101 | --alignments output/best_srf_edge_longest_${o}_alignments.gaf \ 102 | --metrics output/metrics_srf_edge_longest_${o}.mts 103 | done 104 | wait $(jobs -p) 105 | 106 | python3 ../scripts/compute_metrics.py \ 107 | --output-name output/results \ 108 | --summaries output/*.mts \ 109 | --summaries-names $(basename -s ".mts" -a output/*.mts | sed 's/metrics_//') 110 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/vg-comparison/runexp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # remember to run `conda activate aligner-evaluation` 3 | set -e 4 | set -o pipefail 5 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 6 | cd $thisfolder 7 | 8 | # executable's absolute paths/commands (make sure they work!) 
9 | BADREAD=$thisfolder/../../../tools/Badread/badread-runner.py 10 | GRAPHCHAINER=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 11 | graphaligner=$thisfolder/../../../tools/GraphAligner/bin/GraphAligner 12 | usrbintime=/usr/bin/time 13 | 14 | # params 15 | inputgraphs=($thisfolder/../input/chr22_iEFG.gfa $thisfolder/../input/chr22_vg.gfa $thisfolder/../input/chr22_vg_msa.gfa) 16 | graphnames=(iefg vg vgmsa) 17 | coverage=30 18 | threads=64 19 | 20 | # 0. setup 21 | mkdir output 22 | echo -n > output/runexp_log.txt 23 | 24 | # 1. simulate path and reads 25 | for ((g = 0 ; g < ${#inputgraphs[@]} ; g++)) 26 | do 27 | # TODO maybe parallelize this? 28 | inputgraph="${inputgraphs[$g]}" 29 | graphname="${graphnames[$g]}" 30 | python3 ../scripts/generate_sim_reads.py \ 31 | --graph $inputgraph \ 32 | --fastq output/sim_reads_$graphname.fastq \ 33 | --seed $(openssl enc -aes-256-ctr -pass pass:"semirepeatfree" -nosalt /dev/null | shuf -i0-4294967295 -n 1 --random-source=/dev/stdin) \ 34 | --path output/sim_reads_path_$graphname.nodes \ 35 | --fasta output/sim_reads_path_$graphname.fasta \ 36 | --coverage $coverage \ 37 | 2>> output/runexp_log.txt >> output/runexp_log.txt 38 | done 39 | 40 | # 3. 
run the aligners on each dataset 41 | for ((g1 = 0 ; g1 < ${#inputgraphs[@]} ; g1++)) 42 | do 43 | for ((g2 = 0 ; g2 < ${#inputgraphs[@]} ; g2++)) 44 | do 45 | inputgraph="${inputgraphs[$g1]}" 46 | graphname="${graphnames[$g1]}" 47 | datasetname="${graphnames[$g2]}" 48 | reads=output/sim_reads_$datasetname.fastq 49 | $usrbintime $graphaligner \ 50 | -t $threads \ 51 | -x vg \ 52 | -g $inputgraph \ 53 | -f $reads \ 54 | -a output/${graphname}_graph_${datasetname}_reads_alignments.gaf \ 55 | 2>> output/runexp_log.txt >> output/runexp_log.txt 56 | 57 | # pick first alignment 58 | awk '{if (found[$1] == "1") \ 59 | {} \ 60 | else 61 | {found[$1]="1"; print}}' \ 62 | output/${graphname}_graph_${datasetname}_reads_alignments.gaf \ 63 | > output/best_${graphname}_graph_${datasetname}_reads_alignments.gaf 64 | done 65 | done 66 | 67 | # 4. validate and plot results 68 | for ((g1 = 0 ; g1 < ${#inputgraphs[@]} ; g1++)) 69 | do 70 | for ((g2 = 0 ; g2 < ${#inputgraphs[@]} ; g2++)) 71 | do 72 | inputgraph="${inputgraphs[$g1]}" 73 | graphname="${graphnames[$g1]}" 74 | if [ $g1 -eq $g2 ] 75 | then 76 | # we have ground truth 77 | reads=output/sim_reads_$graphname.fastq 78 | path=output/sim_reads_path_$graphname.nodes 79 | fasta=output/sim_reads_path_$graphname.fasta 80 | alignments=output/best_${graphname}_graph_${graphname}_reads_alignments.gaf 81 | python3 ../scripts/compute_summary.py \ 82 | -t 3 \ 83 | --graph $inputgraph \ 84 | --fastq $reads \ 85 | --path $path \ 86 | --fasta $fasta \ 87 | --alignments $alignments \ 88 | --metrics output/metrics_${graphname}_graph_${graphname}_reads.mts 89 | else 90 | # we do not have ground truth 91 | datasetname="${graphnames[$g2]}" 92 | reads=output/sim_reads_$datasetname.fastq 93 | alignments=output/best_${graphname}_graph_${datasetname}_reads_alignments.gaf 94 | python3 ../scripts/compute_summary.py \ 95 | -t 3 \ 96 | --graph $inputgraph \ 97 | --fastq $reads \ 98 | --alignments $alignments \ 99 | --metrics 
output/metrics_${graphname}_graph_${datasetname}_reads.mts 100 | fi 101 | done 102 | done 103 | wait $(jobs -p) 104 | 105 | for ((g = 0 ; g < ${#inputgraphs[@]} ; g++)) 106 | do 107 | graphname="${graphnames[$g]}" 108 | python3 ../scripts/compute_metrics.py \ 109 | --output-name output/results_${graphname}_graph \ 110 | --summaries output/metrics_${graphname}_graph_*.mts \ 111 | --summaries-names $(basename -s ".mts" -a output/metrics_${graphname}_graph_*.mts | sed 's/metrics_//') 112 | done 113 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/vg-unchop/runexp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # remember to run `conda activate aligner-evaluation` 3 | set -e 4 | set -o pipefail 5 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 6 | cd $thisfolder 7 | 8 | # executables' absolute paths/commands (make sure they work!) 9 | BADREAD=$thisfolder/../../../tools/Badread/badread-runner.py 10 | GRAPHCHAINER=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 11 | graphaligner=$thisfolder/../../../tools/GraphAligner/bin/GraphAligner 12 | graphchainer=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 13 | minigraph=$thisfolder/../../../tools/minigraph/minigraph 14 | minichain=$thisfolder/../../../tools/minichain/minichain 15 | usrbintime=/usr/bin/time 16 | 17 | # params 18 | inputgraph=$thisfolder/../input/chr22_vg_unchop.gfa 19 | coverage=30 20 | threads=64 21 | 22 | ## 0. setup 23 | mkdir output 24 | echo -n > output/runexp_log.txt 25 | 26 | # 1. 
simulate path and reads 27 | python3 ../scripts/generate_sim_reads.py \ 28 | --graph $inputgraph \ 29 | --fastq output/sim_reads.fastq \ 30 | --seed $(openssl enc -aes-256-ctr -pass pass:"semirepeatfree" -nosalt /dev/null | shuf -i0-4294967295 -n 1 --random-source=/dev/stdin) \ 31 | --path output/sim_reads_path.nodes \ 32 | --fasta output/sim_reads_path.fasta \ 33 | --coverage $coverage \ 34 | 2>> output/runexp_log.txt >> output/runexp_log.txt 35 | 36 | # 2. run the aligners 37 | cat output/sim_reads.fastq | cut -d' ' -f1 > output/sim_reads_fixed_header.fastq 38 | $usrbintime $graphaligner \ 39 | -t $threads \ 40 | -x "vg" \ 41 | -g $inputgraph \ 42 | -f output/sim_reads_fixed_header.fastq \ 43 | -a output/graphaligner_alignments.gaf \ 44 | 2>> output/runexp_log.txt >> output/runexp_log.txt 45 | 46 | $usrbintime $graphchainer \ 47 | -t $threads \ 48 | -g $inputgraph \ 49 | -f output/sim_reads_fixed_header.fastq \ 50 | -a output/graphchainer_alignments.gaf \ 51 | 2>> output/runexp_log.txt >> output/runexp_log.txt 52 | 53 | $usrbintime $minigraph \ 54 | -t $threads \ 55 | -c \ 56 | -x lr \ 57 | $inputgraph \ 58 | output/sim_reads.fastq \ 59 | -o output/minigraph_alignments.gaf \ 60 | 2>> output/runexp_log.txt >> output/runexp_log.txt 61 | 62 | $usrbintime $minichain \ 63 | -t $threads \ 64 | -c $inputgraph \ 65 | output/sim_reads.fastq \ 66 | -o output/minichain_alignments.gaf \ 67 | 2>> output/runexp_log.txt >> output/runexp_log.txt 68 | 69 | # 3. pick first alignment 70 | for alignment in "graphaligner_alignments.gaf" "graphchainer_alignments.gaf" 71 | do 72 | awk '{if (found[$1] == "1") \ 73 | {} \ 74 | else 75 | {found[$1]="1"; print}}' \ 76 | output/$alignment > output/best_$alignment 77 | done 78 | 79 | # 4. 
validate and plot results 80 | for alignment in best_graphaligner best_graphchainer minigraph minichain 81 | do 82 | python3 ../scripts/compute_summary.py \ 83 | -t 8 \ 84 | --graph $inputgraph \ 85 | --fastq output/sim_reads.fastq \ 86 | --path output/sim_reads_path.nodes \ 87 | --fasta output/sim_reads_path.fasta \ 88 | --alignments output/${alignment}_alignments.gaf \ 89 | --metrics output/metrics_${alignment}.mts & 90 | done 91 | wait $(jobs -p) 92 | 93 | python3 ../scripts/compute_metrics.py \ 94 | --output-name output/results \ 95 | --summaries output/*.mts \ 96 | --summaries-names $(basename -s ".mts" -a output/*.mts | sed 's/metrics_//') 97 | -------------------------------------------------------------------------------- /experiments/graph-statistics/README.md: -------------------------------------------------------------------------------- 1 | # Graph statistics 2 | Analisys of iEFGs or DAGs in GFA format containing only forward `L` links. Command `./run.sh graph.gfa` computes: 3 | 4 | - the number of nodes, edges, and base pairs of the graph 5 | - the ⌈N50⌉ metric, that is, the smallest k such that ≥50% of the bases are covered by segments of length ≤k 6 | - the length of the longest node 7 | - the maximum number H of nodes in a block, if `graph.gfa` is an iEFG 8 | - the width (size of smallest path set covering the nodes, using `GraphChainer`) 9 | - the number of branching nodes (outdegree ≥ 2), choices (sum of outdegrees ≥ 2), the branching factor (maximum number of branching nodes in any path), and the number of maximal paths 10 | 11 | ## Prerequisites 12 | The scripts used depend on [`octave-cli`](https://octave.org/), `gawk`, `awk`, and `GraphChainer`: the last one is expected to be found in folder `tools/GraphChainer/bin` from the root of this repository, and can be obtained with command 13 | ```console 14 | git submodule update --init --recursive ../../tools/GraphChainer 15 | ``` 16 | and by following the compilation instructions in its README. 
17 | 18 | ## Limitations 19 | The computation of the ⌈N50⌉ is quite slow and inefficient. The computation of the number of maximal paths is also not efficient and keeps in memory many large numbers. 20 | -------------------------------------------------------------------------------- /experiments/graph-statistics/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 3 | then 4 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 5 | fi 6 | 7 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 8 | 9 | echo -e "nodes:\t$($thisfolder/scripts/compute-nodes.sh $1)" 10 | echo -e "edges:\t$($thisfolder/scripts/compute-edges.sh $1)" 11 | echo -e "bases:\t$($thisfolder/scripts/compute-bps.sh $1)" 12 | echo -e "N50:\t$($thisfolder/scripts/compute-N.sh $1 50 2>> /dev/null)" 13 | echo -e "longest node:\t$($thisfolder/scripts/compute-longest-node.sh $1)" 14 | echo -e "H:\t$($thisfolder/scripts/compute-efg-H.sh $1)" 15 | echo -e "width:\t$($thisfolder/scripts/compute-width.sh $1)" 16 | echo -e "branching nodes:\t$($thisfolder/scripts/compute-branching-nodes.sh $1)" 17 | echo -e "choices:\t$($thisfolder/scripts/compute-choices.sh $1)" 18 | echo -e "branching factor:\t$($thisfolder/scripts/compute-branching-factor.sh $1 2>> /dev/null)" 19 | echo -e "paths:\t$($thisfolder/scripts/compute-paths.sh $1 2>> /dev/null)" 20 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-N.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 2 ]] || [[ $# -gt 2 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa [0-100]" ; exit 1 6 | fi 7 | 8 | # Configuration 9 | quantiles="$(echo $2 / 100 | 
bc -l)" 10 | 11 | lengths=$(mktemp) 12 | >&2 echo "generated tmp file $lengths (should be removed automatically on exit)..." 13 | trap '{ rm -f -- "$lengths"; }' EXIT 14 | 15 | grep "^S" $1 | awk '{for (i = 1 ; i <= length($3) ; i++) {print length($3)}}' | sort -n > $lengths 16 | 17 | octaveout=$(octave-cli --eval "format long; x = dlmread('$lengths'); q = quantile(x, [0.00 $quantiles]); N = ceil(q([2:length(q)])); disp(N)" | tr -s " " "\t" | cut -f2-) 18 | 19 | echo -e "$octaveout" 20 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-bps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 6 | fi 7 | 8 | grep "^S" $1 | cut -f3 | tr -d "\n" | wc -c 9 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-branching-factor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) dag.gfa" ; exit 1 6 | fi 7 | 8 | tempdir=$(mktemp -d) 9 | >&2 echo "generated temp dir $tempdir (should be removed automatically on exit)..." 
10 | trap '{ rm -rf -- "$tempdir"; }' EXIT 11 | 12 | # sort nodes and edges in topological order 13 | grep "^L" $1 | awk '{print $2,$4}' > "$tempdir/edges" 14 | tsort "$tempdir/edges" > "$tempdir/topological_nodes" 15 | 16 | awk '(NR == FNR) { inneighbors[$2]=inneighbors[$2] FS $1; next } { \ 17 | n=split(inneighbors[$1],array); \ 18 | for (i=1;i<=n;++i) \ 19 | print array[i],$1}' \ 20 | "$tempdir/edges" "$tempdir/topological_nodes" > "$tempdir/topological_edges" 21 | 22 | # find all branching nodes 23 | awk '{outdegree[$1]+=1;} \ 24 | END \ 25 | { \ 26 | for (key in outdegree) { \ 27 | if (outdegree[key] > 1) {print key} \ 28 | }; \ 29 | };' "$tempdir/edges" > "$tempdir/branching" 30 | 31 | # use branching nodes and topological edges to compute the branching factor 32 | gawk --bignum '(NR == FNR) { branching[$1]=1; next } { \ 33 | if (bfactor[$1] + branching[$1] > bfactor[$2]) \ 34 | {bfactor[$2]=bfactor[$1] + branching[$1]} \ 35 | } END { \ 36 | max=0 ; \ 37 | for (key in bfactor) { \ 38 | if (bfactor[key] > max) { \ 39 | max=bfactor[key] \ 40 | } \ 41 | } ; 42 | print max 43 | }' "$tempdir/branching" "$tempdir/topological_edges" 44 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-branching-nodes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 6 | fi 7 | 8 | grep "^L" $1 | \ 9 | awk '{outdegree[$2]+=1;} \ 10 | END \ 11 | { \ 12 | for (key in outdegree) { \ 13 | if (outdegree[key] > 1) {result+=1} \ 14 | }; \ 15 | print result \ 16 | };' 17 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-choices.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 
1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 6 | fi 7 | 8 | grep "^L" $1 | \ 9 | awk '{outdegree[$2]+=1;} \ 10 | END \ 11 | { \ 12 | for (key in outdegree) { \ 13 | if (outdegree[key] > 1) {result+=outdegree[key]} \ 14 | }; \ 15 | print result \ 16 | };' 17 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-edges.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 6 | fi 7 | 8 | grep "^L" $1 | wc -l 9 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-efg-H.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) efg.gfa" ; exit 1 6 | fi 7 | 8 | head -n 3 $1 | grep "^B" | tr "\t" "\n" | tail -n +2 | sort -nr | head -n 1 9 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-longest-node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 6 | fi 7 | 8 | grep "^S" $1 | awk '{if (length($3) > max) {max = length($3)}} END {print max}' 9 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-nodes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 6 | fi 7 | 8 | 
grep "^S" $1 | wc -l 9 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-paths.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) connected-dag.gfa" ; exit 1 6 | fi 7 | 8 | tempdir=$(mktemp -d) 9 | >&2 echo "generated temp dir $tempdir (should be removed automatically on exit)..." 10 | trap '{ rm -rf -- "$tempdir"; }' EXIT 11 | 12 | # compute topological nodes and edges 13 | grep "^L" $1 | awk '{print $2,$4}' > "$tempdir/edges" 14 | tsort "$tempdir/edges" > "$tempdir/topological_nodes" 15 | 16 | awk '(NR == FNR) { inneighbors[$2]=inneighbors[$2] FS $1; next } { \ 17 | n=split(inneighbors[$1],array); \ 18 | for (i=1;i<=n;++i) \ 19 | print array[i],$1}' \ 20 | "$tempdir/edges" "$tempdir/topological_nodes" > "$tempdir/topological_edges" 21 | 22 | # find sinks in the graph 23 | awk '{nodes[$2]=1; nonsinks[$1]=1} \ 24 | END \ 25 | { \ 26 | for (key in nodes) { \ 27 | if (nonsinks[key] != 1) {print key} \ 28 | }; \ 29 | };' "$tempdir/edges" > "$tempdir/sinks" 30 | 31 | # use edges and sinks to compute paths to each node 32 | gawk --bignum '(NR == FNR) { sinks[$1]=1; next } { \ 33 | if (paths[$1] > 0) \ 34 | {paths[$2]+=paths[$1]} \ 35 | else \ 36 | {paths[$2]+=1}} 37 | END { \ 38 | result=0 ; \ 39 | for (key in sinks) { \ 40 | result+=paths[key]} ; \ 41 | printf "%e", result ; \ 42 | print ""\ 43 | }' "$tempdir/sinks" "$tempdir/topological_edges" 44 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-width.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 6 | fi 7 | 8 | thisfolder=$( cd -- 
"$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 9 | graphchainer=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 10 | 11 | $graphchainer --graph-statistics -g $1 -f fakereads.fastq -a fakealns.gaf 2>&1 | tail -n 1 | cut -d' ' -f8 12 | -------------------------------------------------------------------------------- /experiments/msa-validation/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation of sequence-to-graph aligners on simulated reads 2 | Script to compare the output of `vcf2multialign -H` on a chromosome of the [T2T-CHM13v2.0](https://github.com/marbl/CHM13) + [Phased T2T 1KGP panel](https://zenodo.org/records/7612953#.Y-8VD3bMJPY) dataset with that of [`bcftools consensus`](https://samtools.github.io/bcftools/howtos/consensus-sequence.html). Specifically, each sequence of the MSA generated with `vcf2multialign` is stripped of gaps and compared to the corresponding output of `bcftools consensus` using [`edlib-aligner`](https://github.com/Martinsos/edlib). Each Levenshtein distance (NW) is collected and the min, max, and average distance are computed. 3 | 4 | ## Prerequisites 5 | The script depends on `awk` and `bcftools`. 
It expects to find executable `edlib-aligner` and `seqtk` in folders `tools/edlib/meson-build` and `tools/seqtk/seqtk` from the root of the repository, which can be compiled as follows: 6 | ```console 7 | git submodule update --init ../../tools/edlib 8 | make -C ../../tools/edlib 9 | git submodule update --init ../../tools/seqtk 10 | make -C ../../tools/seqtk 11 | ``` 12 | 13 | ## Usage 14 | Usage is as follows: 15 | ```console 16 | ./validate.sh MSA.fasta reference.fasta variations.vcf.gz threads output_stats.txt 17 | ``` 18 | where `MSA.fasta` is the MSA computed by `vcf2multialign -H`, `reference.fasta` contains the reference chromosome, `variations.vcf.gz` contains the variations to the chromosome, threads is a positive number of threads, and `output_stats.txt` is the desired output file. For the chromosome 22 built in experiment `vcf-to-hapl-to-efg`, the command to run is 19 | ```console 20 | ./validate.sh ../vcf-to-hapl-to-efg/output/sampled_haplotypes.a2m ../vcf-to-hapl-to-efg/chr22_uppercase.fasta ../vcf-to-hapl-to-efg/1KGP.CHM13v2.0.chr22.recalibrated.snp_indel.pass.phased.vcf.gz 64 output_stats.txt 21 | ``` 22 | -------------------------------------------------------------------------------- /experiments/msa-validation/validate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 5 ]] || [[ $# -gt 5 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) MSA.fasta reference.fasta variations.vcf.gz threads output_stats.txt" ; exit 1 6 | fi 7 | 8 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 9 | export bcftools=bcftools 10 | export edlib=../../tools/edlib/meson-build/edlib-aligner 11 | export seqtk=../../tools/seqtk/seqtk 12 | 13 | msa=$1 14 | export ref=$2 15 | export vcf=$3 16 | threads=$4 17 | outputfile=$5 18 | 19 | if 
[ -f "$outputfile" ] 20 | then 21 | >&2 echo "$outputfile already exists!"; exit 1 22 | fi 23 | 24 | function align_destroy () { # $1 is temp file containing gapless fasta sequence, to be destroyed 25 | header=$(head -n 1 $1) 26 | sample=$(echo ${header:1} | cut -d'-' -f1) 27 | haplot=$(echo ${header:1} | cut -d'-' -f2) 28 | echo -en "$sample-$haplot\t" 29 | $edlib \ 30 | $1 \ 31 | <($bcftools consensus -f $ref -s $sample -H $haplot $vcf 2>> /dev/null | $seqtk seq -U /dev/stdin) \ 32 | | grep "^#0" | cut -d' ' -f2 33 | rm $1 34 | } 35 | # make align_destroy visible to GNU parallel 36 | export SHELL=$(type -p bash) 37 | export -f align_destroy 38 | 39 | while read -u 3 header 40 | do 41 | while [[ $(find . -type f -name "tmp*" | wc -l) -gt $threads ]] 42 | do 43 | sleep 1s 44 | done 45 | read -u 3 entry 46 | sample=$(echo ${header:1} | cut -d'-' -f1) 47 | haplot=$(echo ${header:1} | cut -d'-' -f2) 48 | if [ "$sample" == "REF" ] ; then continue ; fi 49 | gaplessentry=$(echo "$entry" | tr -d "-") 50 | 51 | tempfile=$(mktemp -p $thisfolder) 52 | echo "$header" > $tempfile 53 | echo "$gaplessentry" >> $tempfile 54 | echo "$tempfile" 55 | done 3<$msa | parallel --keep-order --jobs $threads align_destroy > $outputfile 56 | 57 | awk 'BEGIN { min = 2^1024 ; max = 0 } \ 58 | { sum += $2 ; count++ ; \ 59 | if ($2 < min) min = $2 ; \ 60 | if ($2 > max) max = $2 } \ 61 | END { print "average:\t"sum / count ; print " min:\t"min ; print " max:\t"max }' \ 62 | $outputfile >> $outputfile 63 | 64 | tail -n 3 $outputfile 65 | 66 | -------------------------------------------------------------------------------- /experiments/short-read-exact-match/README.md: -------------------------------------------------------------------------------- 1 | # Exact match of short reads on the chr22 iEFG 2 | We compare the short-read exact matching solution of `efg-locate` on the chromosome 22 iEFG built with the pipeline at `experiments/vcf-to-hapl-to-efg` to that of `bwa` on the T2T-CHM13 linear 
reference for chromosome 22. After checking out the *Prerequisites* and *Datasets* sections, run the script `runexp.sh` (requires 42G of disk space) and check `output/runexp_log.txt` for the results. 3 | 4 | ## Prerequisites 5 | Script `runexp.sh` expects `efg-locate`, `bwa`, and `seqtk` to be located in folder `tools/efg-locate`, `tools/bwa`, and `tools/seqtk` from the root of this repository. You can download and compile them with the following commands (executed from this folder): 6 | ```console 7 | make -C ../../tools/efg-locate 8 | git submodule update --init ../../tools/{bwa,seqtk} 9 | make -C ../../tools/bwa 10 | make -C ../../tools/seqtk 11 | ``` 12 | 13 | ## Datasets 14 | Obtain the graph (262M) with command 15 | ```console 16 | wget https://zenodo.org/records/15112649/files/chr22_iEFG.gfa.gz?download=1 --output-document=input/chr22_iEFG.gfa.gz && gunzip input/chr22_iEFG.gfa.gz 17 | ``` 18 | the reads (24GB) with 19 | ```console 20 | wget 'https://cs.helsinki.fi/group/gsa/panvc-founders/scalability-experiment/reads/ERR1025645_sample05_1.fq.gz' --output-document=input/ERR1025645_sample05_1.fq.gz 21 | ``` 22 | and the chromosome 22 reference (1GB) with 23 | ```console 24 | wget "https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0.fa.gz" --output-document=input/chm13v2.0.fa.gz 25 | seqtk subseq input/chm13v2.0.fa.gz <(echo "chr22") | seqtk seq -U - > input/chr22_uppercase.fasta 26 | ``` 27 | -------------------------------------------------------------------------------- /experiments/short-read-exact-match/input/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algbio/SRFAligner/c698b10ce1b3f5ec20ef5f218f6d0259eebf7911/experiments/short-read-exact-match/input/.gitkeep -------------------------------------------------------------------------------- /experiments/short-read-exact-match/runexp.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 5 | cd $thisfolder 6 | 7 | # executable's absolute paths/commands (make sure they work!) 8 | bwa=$thisfolder/../../tools/bwa/bwa 9 | efglocate=$thisfolder/../../tools/efg-locate/efg-locate 10 | seqtk=$thisfolder/../../tools/seqtk/seqtk 11 | usrbintime=/usr/bin/time 12 | 13 | # params 14 | inputgraph=$thisfolder/input/chr22_iEFG.gfa 15 | inputreference=$thisfolder/input/chr22_uppercase.fasta 16 | inputreads=$thisfolder/input/ERR1025645_sample05_1.fq.gz 17 | threads=16 18 | 19 | # 0. setup 20 | mkdir output 21 | echo -n > output/runexp_log.txt 22 | 23 | echo "# 1. match in the chr22 iEFG (efg-locate)" >> output/runexp_log.txt 24 | /usr/bin/time $efglocate \ 25 | --reverse-complement \ 26 | --threads $threads \ 27 | $inputgraph \ 28 | <(seqtk seq -A $inputreads) \ 29 | output/efg_locate_matches.gaf \ 30 | >> output/runexp_log.txt 2>> output/runexp_log.txt 31 | 32 | echo "# 2. match in the chr22 reference (bwa)" >> output/runexp_log.txt 33 | ln -s $inputreference $thisfolder/output/ref.fasta 34 | /usr/bin/time $bwa index output/ref.fasta 2>> output/runexp_log.txt 35 | /usr/bin/time $bwa aln -n 0 -k 0 -l 100 -t $threads \ 36 | output/ref.fasta \ 37 | $inputreads \ 38 | > output/bwa_matches.sai 2>> output/runexp_log.txt 39 | /usr/bin/time $bwa samse \ 40 | output/ref.fasta \ 41 | output/bwa_matches.sai \ 42 | $inputreads \ 43 | > output/bwa_matches.sam 2>> output/runexp_log.txt 44 | 45 | echo "# 3. 
compute stats" >> output/runexp_log.txt 46 | echo -n "efg-locate took" $(grep system output/runexp_log.txt | head -n 1 | cut -d' ' -f3 | cut -d'e' -f1) >> output/runexp_log.txt 47 | echo " and matched" $(cut -f1 output/efg_locate_matches.gaf | uniq | sort | uniq | wc -l) "reads" >> output/runexp_log.txt 48 | 49 | echo -n "bwa took" $(grep system output/runexp_log.txt | tail -n 3 | cut -d' ' -f3 | cut -d'e' -f1 | tr "\n" " ") >> output/runexp_log.txt 50 | echo " and matched" $(cat output/bwa_matches.sam | awk '{if ($6 == "100M") {print}}' | cut -f1 | uniq | sort | uniq | wc -l) "reads" >> output/runexp_log.txt 51 | -------------------------------------------------------------------------------- /experiments/vcf-to-hapl-to-efg/README.md: -------------------------------------------------------------------------------- 1 | # vcf-to-hapl-to-efg experiment 2 | Pipeline to build an indexable Elastic Founder Graph from a VCF file plus reference. After checking out the 'Prerequisites' and 'Datasets and obtaining the input data' sections of this document, you can build the chromosome 22 iEFG with command 3 | ```console 4 | /usr/bin/time ./sample-and-build-efg-heuristic.sh -f chr22_uppercase.fasta -v 1KGP.CHM13v2.0.chr22.recalibrated.snp_indel.pass.phased.vcf.gz -c chr22 -s 2504 -M 8 -t 64 5 | ``` 6 | This requires ~600 GB of disk space and ~600 GB of RAM. 
If you want to generate an iEFG from fewer haplotypes, for example 20, run command 7 | ```console 8 | /usr/bin/time ./sample-and-build-efg-heuristic.sh -f chr22_uppercase.fasta -v 1KGP.CHM13v2.0.chr22.recalibrated.snp_indel.pass.phased.vcf.gz -c chr22 -s 20 -t 64 9 | ``` 10 | Finally, the iEFG can be stripped of its paths with command 11 | ```console 12 | grep -v "^P" output/efg-unsimplified.gfa > chr22_iEFG.gfa 13 | ``` 14 | 15 | ## Prerequisites 16 | The pipeline expects [`bcftools`](https://www.htslib.org/download/) and [`vcf2multialign`](https://github.com/tsnorri/vcf2multialign) to be found in the search path variable `PATH`, and expects `founderblockgraph` to be in folder `tools/founderblockgraph` from the root of this repository. You can get and compile `founderblockgraph` with 17 | ```console 18 | git submodule update --init --recursive ../../tools/founderblockgraphs 19 | make -C ../../tools/founderblockgraphs 20 | ``` 21 | 22 | In case you obtain `bcftools` and `vcf2multialign` in a different way, modify lines 11-13 of `sample-and-build-efg-heuristic.sh` accordingly. To manipulate FASTQ files in the next section, we also use [`seqtk`](https://github.com/lh3/seqtk). 23 | 24 | ## Datasets and obtaining the input data 25 | We use the [phased T2T 1KGP panel](https://zenodo.org/records/7612953) (Version 1.0) by Joseph Lalli, based on [T2T-CHM13v2.0](https://github.com/marbl/CHM13). 
We can easily obtain chromosome 22 with 26 | ```console 27 | wget "https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0.fa.gz" 28 | seqtk subseq chm13v2.0.fa.gz <(echo "chr22") | seqtk seq -U - > chr22_uppercase.fasta 29 | ``` 30 | and obtain the chr22 variations using commands 31 | ```console 32 | wget "https://zenodo.org/records/7612953/files/phased_T2T_panel.tar" 33 | tar -xvf phased_T2T_panel.tar phased_T2T_panel/1KGP.CHM13v2.0.chr22.recalibrated.snp_indel.pass.phased.vcf.gz --strip-components 1 34 | tar -xvf phased_T2T_panel.tar phased_T2T_panel/1KGP.CHM13v2.0.chr22.recalibrated.snp_indel.pass.phased.vcf.gz.tbi --strip-components 1 35 | ``` 36 | 37 | ## iEFG validation 38 | You can validate that the iEFG was correctly built from the MSA rows with `efg-locate`: 39 | ```console 40 | ../../tools/efg-locate/efg-locate -t 64 --overwrite \ 41 | output/efg-unsimplified.gfa \ 42 | <(awk '{if (substr($0, 0, 1) == ">") {print} else {gsub(/-/, "", $0); print $0}}' \ 43 | output/sampled_haplotypes.a2m) \ 44 | /dev/null 45 | ``` 46 | 47 | ## Versions of the software used 48 | | Tool | Version | 49 | | ----------------- | ---------------- | 50 | | founderblockgraph | 439ef67 (GitHub) | 51 | | vcf2multialign | 1.2.2 23f3f42 | 52 | | seqtk | 1.4-r130-dirty | 53 | | bcftools | 1.20 | 54 | 55 | ## todo 56 | - [] gzip some of the intermediate files 57 | -------------------------------------------------------------------------------- /experiments/vcf-to-hapl-to-efg/sample-and-build-efg-heuristic.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # sample some haplotypes from reference + VCF file, build MSA with vcf2multialign, and build iEFG with founderblockgraph 3 | set -e 4 | set -o pipefail 5 | 6 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # 
https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 7 | 8 | # 9 | # executables 10 | # 11 | founderblockgraph=$thisfolder/../../tools/founderblockgraphs/founderblockgraph 12 | vcf2multialign=vcf2multialign 13 | bcftools=bcftools 14 | 15 | # 16 | # setup 17 | # 18 | threads=8 19 | heuristicsubset="" 20 | 21 | # parsing command line options 22 | print_help() 23 | { 24 | echo "usage: $0 [-f reference.fa] [-v variation.vcf] [-c chromosome] [-s nhapl] [-M Mrows] [-t threads]" 25 | echo " -h --help: show this screen" 26 | echo " -f reference: reference (FASTA format) used to generate MSA" 27 | echo " -v variation: variation (VCF format, possibly gzipped) used to generate MSA" 28 | echo " -c chromosome: chromosome name used to generate MSA (see vcf2multialign)" 29 | echo " -s nhapl: sample the haplotype list and keep 'nhapl' random haplotypes" 30 | echo " -M Mrows: set parameter --heuristic-subset='Mrows' for iEFG construction (see founderblockgraph)" 31 | echo " -t threads: threads used in iEFG construction (see founderblockgraph)" 32 | } 33 | 34 | # https://stackoverflow.com/questions/12022592/how-can-i-use-long-options-with-the-bash-getopts-builtin 35 | for arg in "$@"; do 36 | shift 37 | case "$arg" in 38 | '--help') set -- "$@" '-h' ;; 39 | *) set -- "$@" "$arg" ;; 40 | esac 41 | done 42 | 43 | argf=false ; argc=false ; argv=false ; args=false ; argM=false ; 44 | OPTIND=1 45 | while getopts "hf:v:c:r:s:M:t:" option; do 46 | case $option in 47 | f) # fasta + vcf input : fasta 48 | argf=true 49 | reference="$(realpath $OPTARG)" ;; 50 | v) # fasta + vcf input : vcf 51 | argv=true 52 | vcf="$(realpath $OPTARG)" ;; 53 | c) # fasta + vcf input : chromosome for vcf2multialign 54 | argc=true 55 | chromosome="$OPTARG" ;; 56 | s) # number of samples 57 | args=true 58 | nhapl="$OPTARG" ;; 59 | M) # heuristic subset 60 | argM=true 61 | heuristicsubset="--heuristic-subset $OPTARG" ;; 62 | t) # threads 63 | 
argt=true 64 | threads="$OPTARG" ;; 65 | h) # display help 66 | print_help 67 | exit;; 68 | \?) # invalid option 69 | echo "Error: Invalid option" 70 | exit;; 71 | esac 72 | done 73 | shift $(expr $OPTIND - 1) # remove options from positional parameters 74 | 75 | if [ "$argf" = false ] || [ "$argc" = false ] || [ "$argv" = false ] || [ "$args" = false ] 76 | then 77 | print_help 78 | exit 79 | fi 80 | 81 | outputfolder=$thisfolder/output 82 | mkdir $outputfolder 83 | if [[ $? -gt 0 ]] ; then echo "Output directory $outputfolder already exists!" ; exit 1 ; fi 84 | log=$outputfolder/log.txt 85 | stats=$outputfolder/stats.txt 86 | cd $outputfolder 87 | 88 | # 89 | # randomness source 90 | # 91 | # https://www.gnu.org/software/coreutils/manual/html_node/Random-sources.html#Random-sources 92 | get_seeded_random() 93 | { 94 | seed="$1" 95 | openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \ 96 | /dev/null 97 | } 98 | 99 | # 100 | # 1. sampling the vcf 101 | # 102 | echo -n "Sampling the vcf..." 103 | # TODO I am expecting the haplotypes to be after the 9th field specified by the VCF header, is this always correct? 104 | $bcftools query -l $vcf > haplotypes 105 | cat haplotypes | shuf -n $nhapl --random-source=<(get_seeded_random "semi-repeat-free") > sampled_haplotypes 106 | $bcftools view --samples-file sampled_haplotypes $vcf > sampled_haplotypes.vcf 107 | # FIX for the T2T 1KGP data and vcf2multialign, see https://github.com/tsnorri/vcf2multialign/issues/5 108 | sed -i 's/e+06//g' sampled_haplotypes.vcf 109 | sed -i 's/INFO= MSA 114 | # 115 | echo -n "Computing the MSA..." 
116 | /usr/bin/time $vcf2multialign \ 117 | --founder-sequences=50 \ 118 | --input-reference=$reference \ 119 | --input-variants=sampled_haplotypes.vcf \ 120 | --chromosome $chromosome \ 121 | --output-graph variant.graph >> $log 2>> $log 122 | 123 | /usr/bin/time $vcf2multialign \ 124 | --input-reference=$reference \ 125 | --input-graph variant.graph \ 126 | -H \ 127 | -s sampled_haplotypes.a2m >> $log 2>> $log 128 | echo " done." 129 | 130 | # 131 | # 3. MSA -> iEFG 132 | # 133 | echo -n "Building the indexable Elastic Founder Graph..." 134 | /usr/bin/time $founderblockgraph --elastic --gfa --ignore-chars="N" --output-paths --threads=$threads --input=sampled_haplotypes.a2m --output=efg-unsimplified.gfa $heuristicsubset >> $log 2>> $log 135 | echo " done." 136 | -------------------------------------------------------------------------------- /test/graph1.gfa: -------------------------------------------------------------------------------- 1 | X 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 2 | B 1 3 4 2 2 1 2 2 5 2 2 1 1 3 4 1 1 2 1 1 1 3 1 2 2 5 1 1 1 1 1 1 6 1 2 1 3 1 2 1 1 3 | S 484932 AAGATTGTGCCACTGCACTCCAGGC 4 | S 484965 CCTGTCCTAGGCCAG 5 | S 484998 GTGGAAGACAGAAAT 6 | S 484989 AAACCCAAAGCAGAC 7 | S 484992 AAAGCTCCTGAAGGG 8 | S 484988 GGGGAAACAGCCAC 9 | S 484951 GAGGCTTCTACAC 10 | S 484963 CCGGTGAGAAGATTAG 11 | S 484975 AGAAGGCCGAGGCAGAGAAT 12 | S 484978 TGCTTGAACCTGGGAGGTGGAGGTTGCAGTGAGCCAAGATCGTGCCACTGC 13 | S 484947 AATCTGATTAATTGCGAGGAGTCTTTG 14 | S 484954 GAAAGCAATATTAT 15 | S 484984 AATTTATTAAATACTCACTGTG 16 | S 484964 GTCATCAAAACCTG 17 | S 484948 AATCTGATGAATTGCGAGGAGTCGTTG 18 | S 484952 GAGGCTTCCACAC 19 | S 484934 GAAATAATAATAAAAAAAAAAG 20 | S 484967 CTGCAGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCTGAGGCGGGTGGATCAC 21 | S 484941 GATAATACTGAGAAG 22 | S 484999 GTGGAAAACAGAAAAT 23 | S 484985 AATATATTAAATACTCACTGTG 24 | S 484955 CAGAATTGAATTTAA 25 | S 484968 
CTGCAGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGTGGATCAC 26 | S 484939 GAAATAATAATATAAAAAAAAG 27 | S 484931 AAGATCGTGCCACTGCACTCCAGGC 28 | S 484983 CCTCCAGCCTGGGCAGCAGATCGAGACTCCATCTCAAAAA 29 | S 484949 AATCTGTTGAATTGCGAGGAGTCTTTG 30 | S 484950 AATCTGATGAATTGCGAGGAGTCTTTA 31 | S 484943 TCATTTGGGGAGGCT 32 | S 484929 GAAAATTAGCTGGGCATGGTGGCGGGCGCCTATAGTCC 33 | S 484940 ATAAAGATCTTAA 34 | S 484991 AAAGCTCCTCAAGGG 35 | S 484969 CTGCAGTGGCTCACACCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGTGGATCAC 36 | S 484979 CCTCCAGCCTGGGCAACAGATCGAGACTCCGTCTCAAAAA 37 | S 484996 GTTTGATTAATAATG 38 | S 484994 GTTTACTTAATAATG 39 | S 484987 GGCAAATACCATTTT 40 | S 484922 TTACTTTTTTAAAGAT 41 | S 484923 AAAGATCTTGAC 42 | S 484982 CCTCCAGCCTGGGCAACAGATTGAGACTCCGTCTCAAAAA 43 | S 484995 GTTTGCTGAATAATG 44 | S 484990 AAACCCAAAACAGAC 45 | S 484958 GCAAATTTATTTGGGCA 46 | S 484936 GAAATAATAATAAAAAAAAAAAG 47 | S 484938 GAAATAATAATAATAAAAAAAG 48 | S 484980 CCTCCAGCCTGGGCAACAGATCGAGACTCCATCTCAAAAA 49 | S 484953 GAAAGCAATATTGT 50 | S 484942 GTGTAGGTTGAGG 51 | S 484927 GAAAATTAGCTGGGCATGGTGGCGGGCGCCTGTAGTCC 52 | S 484981 CCTCCAGCCTGGGCAACAGATCGAGACTCCGTCTCAAAAAA 53 | S 484962 CCGGTGAGAAGTTTAG 54 | S 484937 GAAATAATAATAAAAAAAAAG 55 | S 484924 CGGGCGCGGTGGCTCACACCTGTAATCCCAGCATTTTGGGAGGCCGAGGC 56 | S 484973 TCAAAAATCAGCTGGGTGTGGTGGCG 57 | S 484970 GAGGTCAGGAGTTCAAGACCAGCCTGGCCAAGATGGTGAAACCCCATCTCTACTAAAAA 58 | S 484925 CGGGCGCGGTGGCTCACACCTGTAATCCCAGCATTTAGGGAGGCCGAGGC 59 | S 484976 AGAAGGCTGAGGCAGAGAAT 60 | S 484926 AGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGGCTAATACGGTGAAACCCTGTCTCTACTAAAAATACA 61 | S 484945 CAGGATACAGACTT 62 | S 484957 GCAAATTCAATTGGGCA 63 | S 484961 CCAGTTTGGAAGTC 64 | S 484997 GTGGAAGACAGAAAAT 65 | S 484930 CAGGCAGAGCTTGCAGTGAGCA 66 | S 484933 TGGGCAACTGAGCGAGACTCCATCT 67 | S 484946 AATCTGATGAATTGCGAGGAGTCTTTG 68 | S 484974 AGCACCTGTAATCCCATCTACTC 69 | S 484956 GCAAATTTAATTGGGCA 70 | S 484971 GAGGTCAGGAGTCAAGACCAGCCTGGCCAAGATGGTGAAACCCCATCTCTACTAAAAA 71 | S 484972 
GAGGTCAGGAATTCAAGACCAGCCTGGCCAAGATGGTGAAACCCCATCTCTACTAAAAA 72 | S 484966 CTGCAGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGTGGATCAC 73 | S 484959 GAGGTTTAGTCTGT 74 | S 484993 GTTTGCTTAATAATG 75 | S 484935 GAAATAATAATAATAAAAAAAAG 76 | S 484944 TTGAATGTTAAGTT 77 | S 484986 GGCAAATACAATTTT 78 | S 484977 TGCTTGAACCTGGGAGGTGGAGGTTGCAGTGAGCCAAGATCGTGTCACTGC 79 | S 484960 ATTGGATATGGGGAA 80 | S 484928 GAAAATTAGCTGGGCATGGTGGCGGGCGCTTGTAGTCC 81 | S 485000 AAATGCATATAATCAATA 82 | L 484922 + 484923 + 0M 83 | L 484923 + 484924 + 0M 84 | L 484923 + 484925 + 0M 85 | L 484924 + 484926 + 0M 86 | L 484925 + 484926 + 0M 87 | L 484926 + 484927 + 0M 88 | L 484926 + 484928 + 0M 89 | L 484926 + 484929 + 0M 90 | L 484927 + 484930 + 0M 91 | L 484928 + 484930 + 0M 92 | L 484929 + 484930 + 0M 93 | L 484930 + 484931 + 0M 94 | L 484930 + 484932 + 0M 95 | L 484931 + 484933 + 0M 96 | L 484932 + 484933 + 0M 97 | L 484933 + 484934 + 0M 98 | L 484933 + 484935 + 0M 99 | L 484933 + 484936 + 0M 100 | L 484933 + 484937 + 0M 101 | L 484933 + 484938 + 0M 102 | L 484933 + 484939 + 0M 103 | L 484934 + 484940 + 0M 104 | L 484935 + 484940 + 0M 105 | L 484936 + 484940 + 0M 106 | L 484937 + 484940 + 0M 107 | L 484938 + 484940 + 0M 108 | L 484939 + 484940 + 0M 109 | L 484940 + 484941 + 0M 110 | L 484941 + 484942 + 0M 111 | L 484942 + 484943 + 0M 112 | L 484943 + 484944 + 0M 113 | L 484944 + 484945 + 0M 114 | L 484945 + 484946 + 0M 115 | L 484945 + 484947 + 0M 116 | L 484945 + 484948 + 0M 117 | L 484945 + 484949 + 0M 118 | L 484945 + 484950 + 0M 119 | L 484946 + 484951 + 0M 120 | L 484946 + 484952 + 0M 121 | L 484947 + 484951 + 0M 122 | L 484948 + 484951 + 0M 123 | L 484949 + 484951 + 0M 124 | L 484950 + 484951 + 0M 125 | L 484951 + 484953 + 0M 126 | L 484951 + 484954 + 0M 127 | L 484952 + 484953 + 0M 128 | L 484953 + 484955 + 0M 129 | L 484954 + 484955 + 0M 130 | L 484955 + 484956 + 0M 131 | L 484955 + 484957 + 0M 132 | L 484955 + 484958 + 0M 133 | L 484956 + 484959 + 0M 134 | L 484957 + 484959 + 0M 135 | L 
484958 + 484959 + 0M 136 | L 484959 + 484960 + 0M 137 | L 484960 + 484961 + 0M 138 | L 484961 + 484962 + 0M 139 | L 484961 + 484963 + 0M 140 | L 484962 + 484964 + 0M 141 | L 484963 + 484964 + 0M 142 | L 484964 + 484965 + 0M 143 | L 484965 + 484966 + 0M 144 | L 484965 + 484967 + 0M 145 | L 484965 + 484968 + 0M 146 | L 484965 + 484969 + 0M 147 | L 484966 + 484970 + 0M 148 | L 484966 + 484971 + 0M 149 | L 484966 + 484972 + 0M 150 | L 484967 + 484970 + 0M 151 | L 484968 + 484970 + 0M 152 | L 484969 + 484970 + 0M 153 | L 484970 + 484973 + 0M 154 | L 484971 + 484973 + 0M 155 | L 484972 + 484973 + 0M 156 | L 484973 + 484974 + 0M 157 | L 484974 + 484975 + 0M 158 | L 484974 + 484976 + 0M 159 | L 484975 + 484977 + 0M 160 | L 484975 + 484978 + 0M 161 | L 484976 + 484977 + 0M 162 | L 484977 + 484979 + 0M 163 | L 484977 + 484980 + 0M 164 | L 484977 + 484982 + 0M 165 | L 484977 + 484983 + 0M 166 | L 484978 + 484981 + 0M 167 | L 484979 + 484984 + 0M 168 | L 484980 + 484984 + 0M 169 | L 484981 + 484985 + 0M 170 | L 484982 + 484984 + 0M 171 | L 484983 + 484984 + 0M 172 | L 484984 + 484986 + 0M 173 | L 484984 + 484987 + 0M 174 | L 484985 + 484986 + 0M 175 | L 484986 + 484988 + 0M 176 | L 484987 + 484988 + 0M 177 | L 484988 + 484989 + 0M 178 | L 484988 + 484990 + 0M 179 | L 484989 + 484991 + 0M 180 | L 484989 + 484992 + 0M 181 | L 484990 + 484991 + 0M 182 | L 484991 + 484993 + 0M 183 | L 484991 + 484994 + 0M 184 | L 484991 + 484995 + 0M 185 | L 484991 + 484996 + 0M 186 | L 484992 + 484993 + 0M 187 | L 484993 + 484997 + 0M 188 | L 484993 + 484998 + 0M 189 | L 484993 + 484999 + 0M 190 | L 484994 + 484997 + 0M 191 | L 484995 + 484997 + 0M 192 | L 484996 + 484997 + 0M 193 | L 484997 + 485000 + 0M 194 | L 484998 + 485000 + 0M 195 | L 484999 + 485000 + 0M 196 | -------------------------------------------------------------------------------- /test/graph2.gfa: -------------------------------------------------------------------------------- 1 | X 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 
19 20 21 22 23 24 25 26 27 28 29 30 2 | B 1 1 1 1 1 6 1 1 1 1 1 1 1 1 1 1 3 2 1 2 1 1 1 2 1 1 1 1 1 1 1 3 | S 252283 GCATTTCGTCATTTCA 4 | S 252282 TTCTTCTCATTGTT 5 | S 252290 TTCATTTCATTTCATCATTTCATCTTTTCATCTC 6 | S 252285 ATCATCTCATTTCATCTT 7 | S 252299 TCATTTCACTTCATCATTTCATTTC 8 | S 252287 TTCATCTCATTTCATCATTTCATCTTTTCATCTC 9 | S 252284 TCATTTCATCATTTCACTTCATCTCATCATTTCATCTCATGATTTCATCTC 10 | S 252296 TTCATCATTTCATCATTTCATATCATTT 11 | S 252294 CATCATTTCATTTCATTATTTCATCATTTA 12 | S 252297 CTTCATTTCACCAT 13 | S 252289 TTCATCTCATTTCATCTTTTCATCTC 14 | S 252298 TTGATCTTCTCATT 15 | S 252293 CAATTCGTCATTT 16 | S 252295 ATCATTTAACTTCAT 17 | S 252292 GTCATTTCATTTGATCATTTCATTTCAT 18 | S 252281 TATTTCATTTCAGCAT 19 | S 252291 TTCATCTCATTTCATTTCATCATTCATCTTTTCATCTC 20 | S 252286 TTCATCTCATTTCATTTCATCATTTCATCTTTTCATCTC 21 | S 252288 TTCATCTCATCATTTCATCTTTTCATCTC 22 | S 252307 TTTTGTTTCATTATA 23 | S 252317 CCTTTTCATTTCATCATTTCATTTC 24 | S 252309 CCTTTTCATTTCATTTCATCAT 25 | S 252304 TATTTCATCATTCCATTTCATCATTTCA 26 | S 252310 TTCATTTCATATCATTTCCTCA 27 | S 252302 TATTTCATCATTCCATTTCATCATTTCATTACATTTCATCATTTCA 28 | S 252316 CGTTTCATTACA 29 | S 252318 ATCATTTCATATCATTTCCTCA 30 | S 252300 ATCATTTCATTTCCTCATTTCATTTC 31 | S 252315 TTTCATCTCATCATTTCATCTTTTAATCTCATTTCATTTAATCATTT 32 | S 252311 ATTCATTATTTCATC 33 | S 252319 ATTCATCATTTCATCTTTTCATTTC 34 | S 252303 TATTTCATCATTCCATTTCATTACATTTCATCATTTCA 35 | S 252306 CTTCATCTCATCATTTCATCTTTTAATCTCATTTCATTTAATCA 36 | S 252305 CTTCATCTCATCATTTCATCTCATGATTTCATTTCATCATTTCATCTTTTAATCTCATTTCATTTAATCA 37 | S 252313 CATTTCATCATTCCATTTCATCATTTCATTACATTTCATCATTTCACTTCATCTCATCATTTCATCTCATGATTTCA 38 | S 252308 CCTTTTCATTTCATTTCATCATTTCAT 39 | S 252321 CTTCATTATTTCATTTCATTTCA 40 | S 252312 TTTTCATTTCATTTCAC 41 | S 252320 ATCATTTCATCATTTCATTTCATCATTTCA 42 | S 252301 ACCATTTCATCATTT 43 | S 252314 CATTTCATCATTCCATTTCATCATTTCATTACATTTCATCATTTCACTTCATCTCATCATTTCATCTCATGA 44 | L 252281 + 252282 + 0M 45 | L 252282 + 252283 + 0M 46 | L 252283 + 252284 + 0M 47 
| L 252284 + 252285 + 0M 48 | L 252285 + 252286 + 0M 49 | L 252285 + 252287 + 0M 50 | L 252285 + 252288 + 0M 51 | L 252285 + 252289 + 0M 52 | L 252285 + 252290 + 0M 53 | L 252285 + 252291 + 0M 54 | L 252286 + 252292 + 0M 55 | L 252287 + 252292 + 0M 56 | L 252288 + 252292 + 0M 57 | L 252289 + 252292 + 0M 58 | L 252290 + 252292 + 0M 59 | L 252291 + 252292 + 0M 60 | L 252292 + 252293 + 0M 61 | L 252293 + 252294 + 0M 62 | L 252294 + 252295 + 0M 63 | L 252295 + 252296 + 0M 64 | L 252296 + 252297 + 0M 65 | L 252297 + 252298 + 0M 66 | L 252298 + 252299 + 0M 67 | L 252299 + 252300 + 0M 68 | L 252300 + 252301 + 0M 69 | L 252301 + 252302 + 0M 70 | L 252301 + 252303 + 0M 71 | L 252301 + 252304 + 0M 72 | L 252302 + 252305 + 0M 73 | L 252302 + 252306 + 0M 74 | L 252303 + 252305 + 0M 75 | L 252304 + 252305 + 0M 76 | L 252305 + 252307 + 0M 77 | L 252306 + 252307 + 0M 78 | L 252307 + 252308 + 0M 79 | L 252307 + 252309 + 0M 80 | L 252308 + 252310 + 0M 81 | L 252309 + 252310 + 0M 82 | L 252310 + 252311 + 0M 83 | L 252311 + 252312 + 0M 84 | L 252312 + 252313 + 0M 85 | L 252312 + 252314 + 0M 86 | L 252313 + 252315 + 0M 87 | L 252314 + 252315 + 0M 88 | L 252315 + 252316 + 0M 89 | L 252316 + 252317 + 0M 90 | L 252317 + 252318 + 0M 91 | L 252318 + 252319 + 0M 92 | L 252319 + 252320 + 0M 93 | L 252320 + 252321 + 0M 94 | -------------------------------------------------------------------------------- /test/graph3.gfa: -------------------------------------------------------------------------------- 1 | X 0 1 2 3 4 5 6 7 8 9 10 2 | B 1 1 1 1 1 1 1 1 1 1 1 3 | S 319767 ATGCTTCTGTCTAGATTTGATATGAAGATATTCCCGTTTCCAACGAAATCTTCAAATCTATCCAAATGTCCACTTGCAGATTCAACAAAAAGTGTTTTTCAGAACTGCTCTATCAAAAGAAAGATCCACCTCTGTTAGCTGAGTTCACTCATCACAAACAAGTTTATGAGAATGCTTCTGTCTAGTTTTTATTTGAAGATATTTCCTTTCTCACCATAGACCTGAAAGCTGTCCTAATGTTCACTTCCAGATACTACAGAAAGAGTGTTTCAAAACTGCTGTACGAAAGGGAATGTTCAACTCTGTGACTTGAATGCACACATCACAAAGAAGTTTCTGAGGATGCTGCTGTCTACTTTTTATACGTAATCCCGTTTCCAACGAAATCCTCCAA 4 | S 319769 
ACAAGGAAGATTCTGAGATTGCTTCTGTCTAGTTTTTATGGGAAGATATTT 5 | S 319773 AAGGCCTCAGAGCGCTCCAAATATCCACTTGCACATACTACAAAAAGAGTGCCTCAAAGCTGCTCTCTGAAACGGAATGTTCAACTCTATGAGTTGAATGCAAACATCGCAAAGACGTTTCTGAGAATGCTTCTGTCTAGATTTGATATGAAGATATTCCCGTTTCCAACGAAATCTTCAAATCTATCCAAATGTCCACTTGCAGATTCAACAAAAAGTGTTTTTCAGAACTGCTCTATCAAAAGAAAGATCCACCTCTGTTAGCTGAGTTCACACATCACAAACAAGTTTATGAGAATGCTTCTGTCTAGTTTTTATTTGAAGATATTTCCTTTCTCACCATAGACCTGAAAGCTGTCCTAATGTTCACTTCCAGATACTACAGAAAGAGTGTTTCAAAACTGCTGTACGAAAGGGAATGTTCAACTCTGTGACTTGAATGCACACATCACAAAGAAGTTTCTGAGGATGCTGCTGTCT 6 | S 319766 TTTCAACTCTGTGACTTGAATGCAGACATCACAGAGCAGTTTCTGAGAATGCTTCTGTCTAGATTTTATAGGAAGATATTCCCGTTTCCAACGAAATCTTCACAGCTATCCAAATATCCACTTGCAGATTCTACAAAAAGAGTGTATCAAAACTGCTCTGTCAAAAGGAAGGTTCTTCTCTGTTAGGTGAGTGCATACGTCATAAAGGAGTTTCTGAGAATGTTTCTGTCTAGTGGTTATGGGAAGATATTTGCTTTTTCACCGTAGGCCTCAGAGCGCTCCAAATATCCACTTGCACATACTACAAAAAGAGTGCTTCAAAGCTGCTCTCTGAAACGGAATGTTCAACTCTATGAGTTGAATGCAAACATCACAAAGACGTTTCTGAGA 7 | S 319771 CGCAGATTCTACAAAAAGAGTGTATCAAAACTGCTCTGTCAAAAGGAAGGTTCTTCTCTGTTAGGTGAGTGCATACGTCATAAAGGAGTTTCTGAGAATGTTTCTGTCTAGTGGTTATGGGAAGATATTTGCTTTTTCACCGTAGGCCTCAGAGCGCTCCAAATATCCACTTGCACATACTACAAAAAGAGTGCCTCAAAGCTGCTCTCTGAAACGGAATGTTCAACTCTATGAGTTGAATGCAAACATCACAAAGACGTTTCTGAGAATGCTTCTGTCTAGATTTGATATGAAGATATTCCCGTTTCCAACGAAATCTTCCAATCTATCCAAATGTCCACTTGCAGATTCAACAAAAAGTGTTTTTCAGAACTGCTCTATCAAAAGAAAGATCCACCTCTGTTAGCTGAGTTCACACATCACAAACAAGTTTATGAGAATGCTTCTGTCTAGTTTTTATTTGAAGATATTTCCTTTCTCACCATAGACCTGAAAGCTGTCCTAATGTTCACTTCCAGATACTACAGAAAGAGTGTTTTAAAACTGCTGTACGAAAGGGAATGTTCAACTCTGTGACTTGAATGCACACATCACAAAGAAGTTTCTGAGGATGCTGCTGTCTACTTTTTATACGTAATCCCGTTTCCAACGAAATCCTCCAAGCTATCCAAATATCCACTTGCAGATTCCACAGAAAGAGTGTTTCAAAACAGCTCTGTCAATAGAAAGGTTCAACTCTGTTAGCTGCGTGCATATATCCCAAAGAAGATTCTGAGATTGCTTCTGTCTAGTTTTTATGGAAGATATTTCCCTTTTCACCGTAGGCGTCAAGGCGCTCCAAATGTCCACTTCCAGATACTACAAAAAGAGTGTTTCAAACCTACTCTGTGAAAGGGAATATTCAACTCTGTGACTTGAATGCAGATATCACAAAGAAGTTTCTGAGAATGCTTCTGTCGAGATTTTATATGAAGATATTCCCGTTTCCAACGAAATCCTG 8 | S 319776 
ACTACAAAAAGAGTGTTTCAAACCTACTCTGTGAAAGGGAATATTCAACTCTGTGACTTGAATGCACATATCACAAAGAAGTTTCTGAGAATGCTTCTGTCGAGATTTTATATGAAGATATTCCCGTTTCCAACGAAATCCTGAAATCTATCCAAATATCCCCTCGCAGATTCTACAAAAAGAGTGTTTCAAAACTGCTCTGTAAAAAGAAAGGTTCAACTCTGTTACTTCAGTACACACATCACAAACAAGTTTCACAGAATGCTTCTTTCTAGCTTGTAGGGGAAGATATTCCCTTTATCACCATGGGCCTCAAACCGTCCGAA 9 | S 319770 GCCTTTTCACCGTAGGCGTCAAGTCGCTCCAAATGTCCACTTCCAGATACTACAAAAAGAGTGTTTCAAACCTACTCTGTGAAAGGGAATATTCAACTCTGTGACTTGAATGCAGATATCACAAAGAAGTTTCTGAGAATGCTTCTGTCGAGATTTTATATGAAGATATTCCCGTTTCCAACGAAATCCTGAAATCTCTCCAAATATCCCCT 10 | S 319774 TCTTTTTATACGTAATCCCGTTTCCAATGAAATCCTCCAAGCTATCCAAATATCCACTTGCAGATTCCACAGAAAGACTGTTTCAAAA 11 | S 319768 TCTATCCAAATATCCACTTGCAGATTCCACAGAAAGACTGTTTCAAAACTGCTCTGTCAATAGAAACGTTCAACTCTGTTAGCTGCGTGCATATATC 12 | S 319775 CTGCTCTGTCAATAAAAAGGTTCAACTCTGTTAGCTGCGTGCATATATCCCAAAGAAGATTCTGAGATTGCTTCTGTCTAGTTTTTATGGGAAGATATTTCCCTTTTCACCGTAGGCGTCAAGGCGCTCCAAATGTCCACTTCCAGAT 13 | S 319772 AAATCTATCCAAATATCCCCTCGCAGATTCTACAAAAAGAGTGTTTCAAAACTGCTCTGTAAAAAGAAAGGTTCAACTCTGTTAGTTGAGTACACACATCACAAACAAGTTTCACAGAATGCTTCTTTCTAGCTTGTAGGGGAAGATATTCCCTTTATCACCATGGGCCTCCAACCGTCCGAAACTTCCACTTCCATATACTACAAAAAGAGCGTTTCAAACCTGCTCTATGAAAGGCAATGTTCAACTCTGTGACTTGAATGCAGACATCACAGAGCAGTTTCTGAGAATGCTTCTGTCTAGATTTTATAGGAAGATATTCCCGTTTCCAACGAAATCTTCACAGCTATCCAAATATCCACTTGCAGATTCTACAAAAAGAGTGTATCAAAACTGCTCTGTCAAAAGGAAGGTTCTTCTCTGTTAGGTGAGTGCATACGTCATAAAGGAGTTTCTGAGAATGTTTCCGTCTAGTGGTTATGGGAAGATATTTGCTTTTTCACCG 14 | L 319766 + 319767 + 0M 15 | L 319767 + 319768 + 0M 16 | L 319768 + 319769 + 0M 17 | L 319769 + 319770 + 0M 18 | L 319770 + 319771 + 0M 19 | L 319771 + 319772 + 0M 20 | L 319772 + 319773 + 0M 21 | L 319773 + 319774 + 0M 22 | L 319774 + 319775 + 0M 23 | L 319775 + 319776 + 0M 24 | -------------------------------------------------------------------------------- /test/read1.fastq: -------------------------------------------------------------------------------- 1 | @reverse_strand_read 2 | 
AGTAGATGGGTCGCACCATCTTGGCAGGCTGGTCTTGAACTCCTGACCTCGTGATCCACCCGCCTCGGCCTCCCAGAGTGCTGGGATTACAGGCGTGAGCCACTGCAGCTGGCGTAGGACAGGCAGGTTTTGATGACCTGAATCTTCTCACCGGGACTTCCAAACTGGTTCCCCATATCCAATACAGACTAAACCTCTGCCCAATCGAATTTGCTTGAATTTCAATTCTGACAATATTGCTGGTGTAGAAGCCTCTAAAGACTCCTGGCAATTCATCAGATTAAGTCTGTATCCTGAACTTAACATTCAAAGCCTCCCCAAATGACCTCAACCTACACCTTCTCAGTATTATTTTAAGATCTTTATCTTTTTTTATTATTATTATTTCAGATGGAGTCGCTCATTGCCCAGCCTGGAGTGCAGTGGCACGATCTTTGCTCACTGCAAGCTCTGCCTGGGACTATCGGCGCCCGCCACCATGCCCAGCTAATTTTTCTGTATTTTTAGTAGAGACGGGTTTCACCGTATTAGCCAGGATGGTTACCGATCTCCTGACCTCGT 3 | + 4 | FUDLLF@ADCJBIRBD9F4NE&)',:,2'DJ;AB=<:+*{JNC'DW2GJ/<@19FLDFF:DHAG:/CI?EBDCF%*%('&PBHH7C4@OJ?9F>E?{2==E@DJDIC@A7590,1+)37FF4@LE@*I>C(K@KE{HH/,-*CDJ6@@L@G3?>@EJ1C;IGHEHCGJEK{90/5@EALC>9EBF=>3GLN?>?)8,)%')&*J{-JD]P78,C=E>IE-C;N7XJAJ?8E<=%56A>?a?BN=4MZ7;%`8A3EK?BALKE+B-3AH1ME0GIGF*@=1)&(,)(0'()'CKD47DNC(:JEIGLG:GDL,SGEO>HD:P(A5BLGF/C&EMM0S{2X>?D,I9@9;9%,%3{C6ELDJ9QJ:.E{HE-{D=B0:U,5*.)7.HQ'D;=GJ6G(3<>4*-*VO7;M@K-HI@6NBG?0;@/'1*$.&%&((*F;SK:IBD@=={ 5 | -------------------------------------------------------------------------------- /test/read2.fastq: -------------------------------------------------------------------------------- 1 | @forward_read 2 | CTTTTCATCTCGTCATTTCATTTGATCATTCAGTTTCATCAATTCGTCATTTCATCATTTAATTTCATTATTTCATCAATTAATCATCTAACTTCATTTCATCATTTCATCATTTTCATATCATTTCTTCATACCACGTTTGATCTATCTCTAGTTCATTTCACTTCGTCATTATTTCATCATTTCATTTACTCATTTCATTTGACCATTTAATCATTTTTATTTCAACACCCCATTTCATTACATTTCATCATTTCACTTTCATCTCATCATTTCATGTCATGATTTCATTTCATCAGTTCATCTTTTAATCTCATTTATTTTAATCATTTTGTTCATTATACCTTTCATTTCATTTCATGATTTCATTTGGTTTGATATCATTTCCTCAATACATTATTTCATCTTTTCATTTCATTTCACCATTTCATAATTCCATTTCATCATT 3 | + 4 | 
&'&%/C8-D?EEK;50J9MCA1CJKI*,B<&))2)*JXAGTFHLM)HA-6@>LLD{-0@52.+2)/L=AADJDN3?B1%()0/5==;,,<((.?B@LB3:A:NE(L3GCD4:13?0+%&DJ6A@L,).=%)'*(%(('//DB(-%+(&2&4''24FFKB966=31&0.,9).1LH9:JNC(/7-/-4G=+2-C-*+*27(D&(1(/):'.::'+'6;'&7+)%('.R/JIJ+3HG=UCEE>A9DFHL=4.++'*,+=E?CY>4:<4:7+2('3B,J@BH{>+B)2/%&C(3*:@=PK4795'0+&HB{-@)=;0)6)0,M,H1.7;'&-5,3+D,@,.*'1%)*)/O)UN?L<)11<.&&4**E7*GD@D;:M>1KAIG:L4:L?PN:FA8FNC?BDHBN<:3KU;7?BAM:83KDE=GLAEC(F?H.5:2-0,&)**){K)G?>H@668IC)CGDJ)DH:2?3E8?A9A>H=BDJL5D>F:A7@J49E;'7+3(ENBA@{IICI=J7P{FHAE:PIBD3@NBDE3E4EHB8FEKD8&S>F@*4:?2:@,,.1-(EbLKQM7HA:ECCJ:89{TEF{F@XI9>@P@@E=*P@J/?F(H@?2D7DEMEE5{JG?@GBETA/L{{H@LGE:4+0646(3AA;M9HGAEIISRCB,L7BCHKQHIAI+A(.&.')K79KIKIJ{FC=AEBF;LCU6MIGHA 5 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/Makefile: -------------------------------------------------------------------------------- 1 | CPPFLAGS=-Ofast -march=native --std=c++20 -pthread 2 | #CPPFLAGS=-g -O0 --std=c++20 -pthread 3 | HEADERS=-I ../sdsl-lite-v3/include -I ../concurrentqueue 4 | 5 | all : chainx-block-graph 6 | 7 | chainx-block-graph : chainx-block-graph.cpp chainx-block-graph.hpp chaining.hpp efg.hpp command-line-parsing/cmdline.h command-line-parsing/cmdline.c 8 | g++ $(CPPFLAGS) $(HEADERS) \ 9 | chainx-block-graph.cpp command-line-parsing/cmdline.c \ 10 | -o chainx-block-graph 11 | 12 | # uncomment for development 13 | #command-line-parsing/cmdline%c command-line-parsing/cmdline%h : command-line-parsing/config.ggo 14 | # gengetopt \ 15 | # --input=./command-line-parsing/config.ggo \ 16 | # --output-dir=./command-line-parsing/ \ 17 | # --unnamed-opts 18 | 19 | test : chainx-block-graph test/test.sh 20 | test/test.sh 21 | 22 | .PHONY : clean all test cleanall 23 | 24 | #clean : 25 | # rm -Rf command-line-parsing/cmdline.c command-line-parsing/cmdline.h test/output* 26 | #cleanall : 27 | # rm -Rf chainx-block-graph command-line-parsing/cmdline.c command-line-parsing/cmdline.h test/output* 28 | 
clean : 29 | rm -Rf test/output* 30 | cleanall : 31 | rm -Rf chainx-block-graph test/output* 32 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/README.md: -------------------------------------------------------------------------------- 1 | # chainx-block-graph 2 | Program to perform co-linear chaining on the Elastic Degenerate String relaxation of Elastic Founder Graphs. 3 | 4 | ## todo 5 | - always collect statistics 6 | - docs 7 | - more tests trying complex chains 8 | - get secondary chains with fancy backtracking 9 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/chaining.hpp: -------------------------------------------------------------------------------- 1 | //code adapted from https://github.com/at-cg/ChainX 2 | //Chirag Jain, Daniel Gibney and Sharma Thankachan. Algorithms for Colinear Chaining with Overlaps and Gap Costs. Journal of Computational Biology, 2022 3 | //license: https://www.apache.org/licenses/LICENSE-2.0.txt 4 | 5 | #ifndef CHAIN_HPP 6 | #define CHAIN_HPP 7 | 8 | #include "efg.hpp" 9 | 10 | //#define CHAIN_HPP_DEBUG 11 | 12 | using std::vector, std::swap; 13 | 14 | namespace chainx_block_graph { 15 | /** 16 | * Compute optimal chain based on anchor-restricted edit distance using 17 | * strong precedence criteria optimized to run faster using engineering 18 | * trick(s), comparison mode: global. 19 | * We assume the anchors are sorted by the starting positions in the 20 | * linear text and that there are two dummy anchors marking the 21 | * beginning and end of the references. 
22 | * graph.init_eds_support() must have been called before this function 23 | **/ 24 | vector chain_global_eds(vector &anchors, const Elasticfoundergraph &graph, const int initial_guess, const double ramp_up_factor, Stats &stats, const bool removesol = false) 25 | { 26 | //graph.init_eds_support(); 27 | 28 | int n = anchors.size(); 29 | vector costs(n, 0); 30 | vector backtrack(n, 0); 31 | 32 | int bound_redit = initial_guess; //distance assumed to be <= initial_guess 33 | int revisions = 0; 34 | //with this assumption on upper bound of distance, a gap of >bound_redit will not be allowed between adjacent anchors 35 | 36 | while (true) { 37 | int inner_loop_start = 0; 38 | 39 | for(int j=1; j::max(); 42 | int backtrack_min_cost = std::numeric_limits::max(); 43 | 44 | // anchor i < anchor j 45 | 46 | while (anchors[inner_loop_start].gap_query(anchors[j]) > bound_redit) 47 | inner_loop_start++; 48 | 49 | for(int i=j-1; i>=inner_loop_start; i--) { 50 | if (costs[i] < std::numeric_limits::max() and are_colinear_eds(anchors[i], anchors[j], graph)) { 51 | int g = max_gap_eds(anchors[i], anchors[j], graph); 52 | int o = overlap_eds(anchors[i], anchors[j], graph); 53 | #ifdef CHAIN_HPP_DEBUG 54 | std::cerr << "anchors[" << i << "] -> anchors[" << j << "]: g = " << g << ", o = " << o << std::endl; 55 | #endif 56 | if (costs[i] + g + o < find_min_cost) { 57 | find_min_cost = costs[i] + g + o; 58 | backtrack_min_cost = i; 59 | } 60 | } 61 | } 62 | //save optimal cost at offset j 63 | costs[j] = find_min_cost; 64 | backtrack[j] = backtrack_min_cost; 65 | } 66 | 67 | if (costs[n-1] > bound_redit) { 68 | bound_redit = bound_redit * ramp_up_factor; 69 | revisions++; 70 | } else { 71 | break; 72 | } 73 | } 74 | 75 | #ifdef CHAIN_HPP_DEBUG 76 | std::cerr << "Cost DP array = "; 77 | for (auto c : costs) 78 | std::cerr << c << " "; 79 | std::cerr << std::endl; 80 | std::cerr << "Backtrack array = "; 81 | for (auto b : backtrack) 82 | std::cerr << b << " "; 83 | std::cerr << 
std::endl; 84 | std::cerr << "Chaining cost computed " << revisions + 1 << " times" << "\n"; 85 | #endif 86 | 87 | //TODO: consider freeing here the space of costs array 88 | // backtrack optimal solution 89 | vector solution; 90 | for (int j = backtrack[n - 1]; j > 0; j = backtrack[j]) 91 | { 92 | solution.push_back(anchors[j]); 93 | } 94 | std::reverse(solution.begin(), solution.end()); 95 | if (removesol) 96 | { 97 | vector newanchors; 98 | newanchors.reserve(anchors.size() - solution.size()); 99 | for (int i = 0, j = 0; i < anchors.size(); i += 1) 100 | { 101 | if (j < solution.size() and anchors[i] == solution[j]) 102 | { 103 | j += 1; 104 | } else { 105 | newanchors.push_back(anchors[i]); 106 | } 107 | } 108 | swap(anchors, newanchors); 109 | } 110 | 111 | //std::cout << "distance = " << costs[n-1] << std::endl; 112 | if (solution.size() > 0) { 113 | stats.maxiterations = std::max(stats.maxiterations, revisions); 114 | stats.miniterations = std::min(stats.miniterations, revisions); 115 | stats.totaliterations += revisions; 116 | stats.maxcost = std::max(stats.maxcost, costs[n-1]); 117 | stats.mincost = std::min(stats.mincost, costs[n-1]); 118 | stats.totalcost += costs[n-1]; 119 | const double relativecost = (double)costs[n-1] / solution.at(0).get_query_length(); 120 | stats.maxrelativecost = std::max(stats.maxrelativecost, relativecost); 121 | stats.minrelativecost = std::min(stats.minrelativecost, relativecost); 122 | stats.totalrelativecost += relativecost; 123 | } 124 | 125 | return solution; 126 | } 127 | 128 | /** 129 | * See chain_global_eds, comparison mode: semiglobal. 
130 | **/ 131 | vector chain_semiglobal_eds(vector &anchors, const Elasticfoundergraph &graph, const int initial_guess, const double ramp_up_factor, Stats &stats, const bool removesol = false) 132 | { 133 | //graph.init_eds_support(); 134 | 135 | int n = anchors.size(); 136 | vector costs(n, 0); 137 | vector backtrack(n, 0); 138 | 139 | int bound_redit = initial_guess; //distance assumed to be <= initial_guess 140 | int revisions = 0; 141 | //with this assumption on upper bound of distance, a gap of >bound_redit will not be allowed between adjacent anchors 142 | 143 | while (true) { 144 | int inner_loop_start = 0; 145 | 146 | for(int j=1; j::max(); 149 | int backtrack_min_cost = std::numeric_limits::max(); 150 | 151 | // anchor i < anchor j 152 | 153 | while (anchors[inner_loop_start].gap_query(anchors[j]) > bound_redit) 154 | inner_loop_start++; 155 | 156 | { 157 | //always consider the first dummy anchor 158 | //connection to first dummy anchor is done with modified cost to allow free gaps 159 | //int i_d = std::get<1>(anchors[0]) + std::get<2>(anchors[0]) - 1; 160 | //int qry_gap = j_c - i_d - 1; 161 | int queryg = GAFHit::gap_query(anchors[0], anchors[j]); 162 | find_min_cost = std::min(find_min_cost, costs[0] + queryg); 163 | backtrack_min_cost = 0; 164 | #ifdef CHAIN_HPP_DEBUG 165 | std::cerr << "anchors[" << "0" << "] -> anchors[" << j << "]: queryg = " << queryg << std::endl; 166 | #endif 167 | } 168 | 169 | //process all anchors in array for the final last dummy anchor 170 | if (j == n-1) 171 | inner_loop_start=0; 172 | 173 | for(int i=j-1; i>=inner_loop_start; i--) { 174 | if (costs[i] < std::numeric_limits::max() and are_colinear_eds(anchors[i], anchors[j], graph)) { 175 | int g; 176 | if (j == n-1) //modified cost for the last dummy anchor to allow free gaps 177 | g = GAFHit::gap_query(anchors[i], anchors[j]); 178 | else 179 | g = max_gap_eds(anchors[i], anchors[j], graph); 180 | 181 | int o = overlap_eds(anchors[i], anchors[j], graph); 182 | #ifdef 
CHAIN_HPP_DEBUG 183 | std::cerr << "anchors[" << i << "] -> anchors[" << j << "]: g = " << g << ", o = " << o << std::endl; 184 | #endif 185 | if (costs[i] + g + o < find_min_cost) { 186 | find_min_cost = costs[i] + g + o; 187 | backtrack_min_cost = i; 188 | } 189 | } 190 | } 191 | //save optimal cost at offset j 192 | costs[j] = find_min_cost; 193 | backtrack[j] = backtrack_min_cost; 194 | } 195 | 196 | if (costs[n-1] > bound_redit) { 197 | bound_redit = bound_redit * ramp_up_factor; 198 | revisions++; 199 | } else { 200 | break; 201 | } 202 | } 203 | 204 | #ifdef CHAIN_HPP_DEBUG 205 | std::cerr << "Cost DP array = "; 206 | for (auto c : costs) 207 | std::cerr << c << " "; 208 | std::cerr << std::endl; 209 | std::cerr << "Backtrack array = "; 210 | for (auto b : backtrack) 211 | std::cerr << b << " "; 212 | std::cerr << std::endl; 213 | std::cerr << "Chaining cost computed " << revisions + 1 << " times" << "\n"; 214 | #endif 215 | 216 | //TODO: consider freeing here the space of costs array 217 | // backtrack optimal solution 218 | vector solution; 219 | for (int j = backtrack[n - 1]; j > 0; j = backtrack[j]) 220 | { 221 | solution.push_back(anchors[j]); 222 | } 223 | std::reverse(solution.begin(), solution.end()); 224 | if (removesol) 225 | { 226 | vector newanchors; 227 | newanchors.reserve(anchors.size() - solution.size()); 228 | for (int i = 0, j = 0; i < anchors.size(); i += 1) 229 | { 230 | if (j < solution.size() and anchors[i] == solution[j]) 231 | { 232 | j += 1; 233 | } else { 234 | newanchors.push_back(anchors[i]); 235 | } 236 | } 237 | swap(anchors, newanchors); 238 | } 239 | 240 | //std::cerr << "distance = " << costs[n-1] << std::endl; 241 | //std::cerr << "length - distance = " << anchors[0].get_query_length() - costs[n-1] << std::endl; 242 | if (solution.size() > 0) { 243 | stats.maxiterations = std::max(stats.maxiterations, revisions); 244 | stats.miniterations = std::min(stats.miniterations, revisions); 245 | stats.totaliterations += revisions; 
246 | stats.maxcost = std::max(stats.maxcost, costs[n-1]); 247 | stats.mincost = std::min(stats.mincost, costs[n-1]); 248 | stats.totalcost += costs[n-1]; 249 | const double relativecost = (double)costs[n-1] / solution.at(0).get_query_length(); 250 | stats.maxrelativecost = std::max(stats.maxrelativecost, relativecost); 251 | stats.minrelativecost = std::min(stats.minrelativecost, relativecost); 252 | stats.totalrelativecost += relativecost; 253 | } 254 | 255 | return solution; 256 | } 257 | 258 | } // Namespace chainx_block_graph 259 | 260 | #endif //CHAIN_HPP 261 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/chaining_hpp_license.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2015 Georgia Institute of Technology 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/chainx-block-graph.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CHAINX_BLOCK_GRAPH_HPP 2 | #define CHAINX_BLOCK_GRAPH_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | using std::string, std::ifstream, std::ofstream; 9 | 10 | namespace chainx_block_graph { 11 | struct Params { 12 | ifstream graphfs; 13 | ifstream anchorsfs; 14 | ofstream outputfs; 15 | string ignorechars; 16 | bool unsorted_anchors; 17 | bool global; 18 | bool semiglobal; 19 | bool nosplit; 20 | bool splitgraphaligner; 21 | int threads; 22 | int alternativealignments; 23 | int initialguess; 24 | double initialguesscov; 25 | double rampupfactor; 26 | }; 27 | 28 | struct Stats { 29 | unsigned long long seeds = 0; 30 | unsigned long long reads = 0; 31 | int maxiterations = 0; 32 | int miniterations = std::numeric_limits::max(); 33 | unsigned long long totaliterations = 0; 
34 | 35 | int maxcost = 0; 36 | int mincost = std::numeric_limits::max(); 37 | unsigned long long totalcost = 0; 38 | 39 | double maxrelativecost = 0; 40 | double minrelativecost = std::numeric_limits::max(); 41 | double totalrelativecost = 0; 42 | }; 43 | 44 | struct Stats mergestats(const struct Stats &s1, const struct Stats &s2) 45 | { 46 | struct Stats s; 47 | s.seeds = s1.seeds + s2.seeds; 48 | s.reads = s1.reads + s2.reads; 49 | s.maxiterations = std::max(s1.maxiterations, s2.maxiterations); 50 | s.miniterations = std::min(s1.miniterations, s2.miniterations); 51 | s.totaliterations = s1.totaliterations + s2.totaliterations; 52 | s.maxcost = std::max(s1.maxcost, s2.maxcost); 53 | s.mincost = std::min(s1.mincost, s2.mincost); 54 | s.totalcost = s1.totalcost + s2.totalcost; 55 | s.maxrelativecost = std::max(s1.maxrelativecost, s2.maxrelativecost); 56 | s.minrelativecost = std::min(s1.minrelativecost, s2.minrelativecost); 57 | s.totalrelativecost = s1.totalrelativecost + s2.totalrelativecost; 58 | return s; 59 | } 60 | 61 | } 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/command-line-parsing/cmdline.h: -------------------------------------------------------------------------------- 1 | /** @file cmdline.h 2 | * @brief The header file for the command line option parser 3 | * generated by GNU Gengetopt version 2.23 4 | * http://www.gnu.org/software/gengetopt. 5 | * DO NOT modify this file, since it can be overwritten 6 | * @author GNU Gengetopt */ 7 | 8 | #ifndef CMDLINE_H 9 | #define CMDLINE_H 10 | 11 | /* If we use autoconf. 
*/ 12 | #ifdef HAVE_CONFIG_H 13 | #include "config.h" 14 | #endif 15 | 16 | #include /* for FILE */ 17 | 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif /* __cplusplus */ 21 | 22 | #ifndef CMDLINE_PARSER_PACKAGE 23 | /** @brief the program name (used for printing errors) */ 24 | #define CMDLINE_PARSER_PACKAGE "chainx-block-graph" 25 | #endif 26 | 27 | #ifndef CMDLINE_PARSER_PACKAGE_NAME 28 | /** @brief the complete program name (used for help and version) */ 29 | #define CMDLINE_PARSER_PACKAGE_NAME "chainx-block-graph" 30 | #endif 31 | 32 | #ifndef CMDLINE_PARSER_VERSION 33 | /** @brief the program version */ 34 | #define CMDLINE_PARSER_VERSION "0.0" 35 | #endif 36 | 37 | /** @brief Where the command line options are stored */ 38 | struct gengetopt_args_info 39 | { 40 | const char *help_help; /**< @brief Print help and exit help description. */ 41 | const char *full_help_help; /**< @brief Print help, including hidden options, and exit help description. */ 42 | const char *version_help; /**< @brief Print version and exit help description. */ 43 | int chain_to_eds_flag; /**< @brief Perform chaining on the Elastic Degenerate String relaxation of the graph (currently the only implemented chaining solution) (default=on). */ 44 | const char *chain_to_eds_help; /**< @brief Perform chaining on the Elastic Degenerate String relaxation of the graph (currently the only implemented chaining solution) help description. */ 45 | int global_flag; /**< @brief Chain between the whole query and any maximal graph path (default=off). */ 46 | const char *global_help; /**< @brief Chain between the whole query and any maximal graph path help description. */ 47 | int semi_global_flag; /**< @brief Chain between the whole query and any graph subpath (default=off). */ 48 | const char *semi_global_help; /**< @brief Chain between the whole query and any graph subpath help description. 
*/ 49 | int unsorted_input_flag; /**< @brief Do not assume the input GAF anchors to be sorted (at the cost of loading all anchors) (default=off). */ 50 | const char *unsorted_input_help; /**< @brief Do not assume the input GAF anchors to be sorted (at the cost of loading all anchors) help description. */ 51 | int no_split_output_matches_flag; /**< @brief Do not split edge matches into node matches in the output chains (default=off). */ 52 | const char *no_split_output_matches_help; /**< @brief Do not split edge matches into node matches in the output chains help description. */ 53 | long initial_guess_arg; /**< @brief Fix a constant starting guess for the cost of the optimal chain (default='100'). */ 54 | char * initial_guess_orig; /**< @brief Fix a constant starting guess for the cost of the optimal chain original value given at command line. */ 55 | const char *initial_guess_help; /**< @brief Fix a constant starting guess for the cost of the optimal chain help description. */ 56 | double initial_guess_coverage_arg; /**< @brief Have the starting guess for the optimal cost be a fraction of the inverse coverage of the read (GUESS * (read length - read coverage)) instead of a constant (by default this is disabled) (default='0'). */ 57 | char * initial_guess_coverage_orig; /**< @brief Have the starting guess for the optimal cost be a fraction of the inverse coverage of the read (GUESS * (read length - read coverage)) instead of a constant (by default this is disabled) original value given at command line. */ 58 | const char *initial_guess_coverage_help; /**< @brief Have the starting guess for the optimal cost be a fraction of the inverse coverage of the read (GUESS * (read length - read coverage)) instead of a constant (by default this is disabled) help description. */ 59 | double ramp_up_factor_arg; /**< @brief At each chaining iteration, multiply by RAMPUP the guess for the cost of the optimal chain (default='4.0'). 
*/ 60 | char * ramp_up_factor_orig; /**< @brief At each chaining iteration, multiply by RAMPUP the guess for the cost of the optimal chain original value given at command line. */ 61 | const char *ramp_up_factor_help; /**< @brief At each chaining iteration, multiply by RAMPUP the guess for the cost of the optimal chain help description. */ 62 | long alternative_chains_arg; /**< @brief Chain N+1 times, removing the used anchors after each execution, and output all chains (default='0'). */ 63 | char * alternative_chains_orig; /**< @brief Chain N+1 times, removing the used anchors after each execution, and output all chains original value given at command line. */ 64 | const char *alternative_chains_help; /**< @brief Chain N+1 times, removing the used anchors after each execution, and output all chains help description. */ 65 | long threads_arg; /**< @brief Max # threads (default='-1'). */ 66 | char * threads_orig; /**< @brief Max # threads original value given at command line. */ 67 | const char *threads_help; /**< @brief Max # threads help description. */ 68 | int overwrite_flag; /**< @brief Overwrite the output file, if it exists (default=off). */ 69 | const char *overwrite_help; /**< @brief Overwrite the output file, if it exists help description. */ 70 | int split_output_matches_graphaligner_flag; /**< @brief Filter out node matches of length 1 for use in GraphAligner (default=off). */ 71 | const char *split_output_matches_graphaligner_help; /**< @brief Filter out node matches of length 1 for use in GraphAligner help description. */ 72 | 73 | unsigned int help_given ; /**< @brief Whether help was given. */ 74 | unsigned int full_help_given ; /**< @brief Whether full-help was given. */ 75 | unsigned int version_given ; /**< @brief Whether version was given. */ 76 | unsigned int chain_to_eds_given ; /**< @brief Whether chain-to-eds was given. */ 77 | unsigned int global_given ; /**< @brief Whether global was given. 
*/ 78 | unsigned int semi_global_given ; /**< @brief Whether semi-global was given. */ 79 | unsigned int unsorted_input_given ; /**< @brief Whether unsorted-input was given. */ 80 | unsigned int no_split_output_matches_given ; /**< @brief Whether no-split-output-matches was given. */ 81 | unsigned int initial_guess_given ; /**< @brief Whether initial-guess was given. */ 82 | unsigned int initial_guess_coverage_given ; /**< @brief Whether initial-guess-coverage was given. */ 83 | unsigned int ramp_up_factor_given ; /**< @brief Whether ramp-up-factor was given. */ 84 | unsigned int alternative_chains_given ; /**< @brief Whether alternative-chains was given. */ 85 | unsigned int threads_given ; /**< @brief Whether threads was given. */ 86 | unsigned int overwrite_given ; /**< @brief Whether overwrite was given. */ 87 | unsigned int split_output_matches_graphaligner_given ; /**< @brief Whether split-output-matches-graphaligner was given. */ 88 | 89 | char **inputs ; /**< @brief unnamed options (options without names) */ 90 | unsigned inputs_num ; /**< @brief unnamed options number */ 91 | } ; 92 | 93 | /** @brief The additional parameters to pass to parser functions */ 94 | struct cmdline_parser_params 95 | { 96 | int override; /**< @brief whether to override possibly already present options (default 0) */ 97 | int initialize; /**< @brief whether to initialize the option structure gengetopt_args_info (default 1) */ 98 | int check_required; /**< @brief whether to check that all required options were provided (default 1) */ 99 | int check_ambiguity; /**< @brief whether to check for options already specified in the option structure gengetopt_args_info (default 0) */ 100 | int print_errors; /**< @brief whether getopt_long should print an error message for a bad option (default 1) */ 101 | } ; 102 | 103 | /** @brief the purpose string of the program */ 104 | extern const char *gengetopt_args_info_purpose; 105 | /** @brief the usage string of the program */ 106 | extern 
const char *gengetopt_args_info_usage; 107 | /** @brief the description string of the program */ 108 | extern const char *gengetopt_args_info_description; 109 | /** @brief all the lines making the help output */ 110 | extern const char *gengetopt_args_info_help[]; 111 | /** @brief all the lines making the full help output (including hidden options) */ 112 | extern const char *gengetopt_args_info_full_help[]; 113 | 114 | /** 115 | * The command line parser 116 | * @param argc the number of command line options 117 | * @param argv the command line options 118 | * @param args_info the structure where option information will be stored 119 | * @return 0 if everything went fine, NON 0 if an error took place 120 | */ 121 | int cmdline_parser (int argc, char **argv, 122 | struct gengetopt_args_info *args_info); 123 | 124 | /** 125 | * The command line parser (version with additional parameters - deprecated) 126 | * @param argc the number of command line options 127 | * @param argv the command line options 128 | * @param args_info the structure where option information will be stored 129 | * @param override whether to override possibly already present options 130 | * @param initialize whether to initialize the option structure my_args_info 131 | * @param check_required whether to check that all required options were provided 132 | * @return 0 if everything went fine, NON 0 if an error took place 133 | * @deprecated use cmdline_parser_ext() instead 134 | */ 135 | int cmdline_parser2 (int argc, char **argv, 136 | struct gengetopt_args_info *args_info, 137 | int override, int initialize, int check_required); 138 | 139 | /** 140 | * The command line parser (version with additional parameters) 141 | * @param argc the number of command line options 142 | * @param argv the command line options 143 | * @param args_info the structure where option information will be stored 144 | * @param params additional parameters for the parser 145 | * @return 0 if everything went fine, NON 0 if 
an error took place 146 | */ 147 | int cmdline_parser_ext (int argc, char **argv, 148 | struct gengetopt_args_info *args_info, 149 | struct cmdline_parser_params *params); 150 | 151 | /** 152 | * Save the contents of the option struct into an already open FILE stream. 153 | * @param outfile the stream where to dump options 154 | * @param args_info the option struct to dump 155 | * @return 0 if everything went fine, NON 0 if an error took place 156 | */ 157 | int cmdline_parser_dump(FILE *outfile, 158 | struct gengetopt_args_info *args_info); 159 | 160 | /** 161 | * Save the contents of the option struct into a (text) file. 162 | * This file can be read by the config file parser (if generated by gengetopt) 163 | * @param filename the file where to save 164 | * @param args_info the option struct to save 165 | * @return 0 if everything went fine, NON 0 if an error took place 166 | */ 167 | int cmdline_parser_file_save(const char *filename, 168 | struct gengetopt_args_info *args_info); 169 | 170 | /** 171 | * Print the help 172 | */ 173 | void cmdline_parser_print_help(void); 174 | /** 175 | * Print the full help (including hidden options) 176 | */ 177 | void cmdline_parser_print_full_help(void); 178 | /** 179 | * Print the version 180 | */ 181 | void cmdline_parser_print_version(void); 182 | 183 | /** 184 | * Initializes all the fields a cmdline_parser_params structure 185 | * to their default values 186 | * @param params the structure to initialize 187 | */ 188 | void cmdline_parser_params_init(struct cmdline_parser_params *params); 189 | 190 | /** 191 | * Allocates dynamically a cmdline_parser_params structure and initializes 192 | * all its fields to their default values 193 | * @return the created and initialized cmdline_parser_params structure 194 | */ 195 | struct cmdline_parser_params *cmdline_parser_params_create(void); 196 | 197 | /** 198 | * Initializes the passed gengetopt_args_info structure's fields 199 | * (also set default values for options that have a 
default) 200 | * @param args_info the structure to initialize 201 | */ 202 | void cmdline_parser_init (struct gengetopt_args_info *args_info); 203 | /** 204 | * Deallocates the string fields of the gengetopt_args_info structure 205 | * (but does not deallocate the structure itself) 206 | * @param args_info the structure to deallocate 207 | */ 208 | void cmdline_parser_free (struct gengetopt_args_info *args_info); 209 | 210 | /** 211 | * Checks that all the required options were specified 212 | * @param args_info the structure to check 213 | * @param prog_name the name of the program that will be used to print 214 | * possible errors 215 | * @return 216 | */ 217 | int cmdline_parser_required (struct gengetopt_args_info *args_info, 218 | const char *prog_name); 219 | 220 | 221 | #ifdef __cplusplus 222 | } 223 | #endif /* __cplusplus */ 224 | #endif /* CMDLINE_H */ 225 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/command-line-parsing/config.ggo: -------------------------------------------------------------------------------- 1 | version "0.0" 2 | package "chainx-block-graph" 3 | purpose "Program to perform colinear chaining on Elastic Founder Graphs" 4 | usage "chainx-block-graph (--global|--semi-global) graph.gfa anchors.gaf output.gaf" 5 | 6 | description "The program takes in input an Elastic Founder Graph (xGFA) and exact matches between text queries and the graph (GAF), it computes the anchor-restricted edit distance between the queries and the (relaxation of the) graph, and outputs the corresponding chain in GAF format." 
7 | 8 | option "chain-to-eds" - "Perform chaining on the Elastic Degenerate String relaxation of the graph (currently the only implemented chaining solution)" flag on 9 | option "global" g "Chain between the whole query and any maximal graph path" flag off 10 | option "semi-global" s "Chain between the whole query and any graph subpath" flag off 11 | option "unsorted-input" - "Do not assume the input GAF anchors to be sorted (at the cost of loading all anchors)" flag off 12 | option "no-split-output-matches" - "Do not split edge matches into node matches in the output chains" flag off 13 | option "initial-guess" - "Fix a constant starting guess for the cost of the optimal chain" long typestr = "GUESS" default="100" optional 14 | option "initial-guess-coverage" - "Have the starting guess for the optimal cost be a fraction of the inverse coverage of the read (GUESS * (read length - read coverage)) instead of a constant (by default this is disabled)" double typestr = "GUESS" default="0" optional 15 | option "ramp-up-factor" - "At each chaining iteration, multiply by RAMPUP the guess for the cost of the optimal chain" double typestr = "RAMPUP" default="4.0" optional 16 | option "alternative-chains" a "Chain N+1 times, removing the used anchors after each execution, and output all chains" long typestr = "N" default="0" optional 17 | 18 | option "threads" t "Max # threads" long typestr = "THREADNUM" default = "-1" optional 19 | option "overwrite" - "Overwrite the output file, if it exists" flag off 20 | 21 | option "split-output-matches-graphaligner" - "Filter out node matches of length 1 for use in GraphAligner" flag off hidden 22 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/test/correctoutput/anchors-1-global.gaf: -------------------------------------------------------------------------------- 1 | q1 8 1 3 + >n2 4 1 3 0 0 255 2 | q1 8 3 5 + >n3 3 0 2 0 0 255 3 | q1 8 5 7 + >n5 4 1 3 0 0 255 4 | q2 8 0 1 + 
>n2 4 3 4 0 0 255 5 | q2 8 1 4 + >n4 3 0 3 0 0 255 6 | q2 8 1 4 + >n4 3 0 3 0 0 255 7 | q2 8 4 8 + >n5 4 0 4 0 0 255 8 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/test/correctoutput/anchors-1-semi-global.gaf: -------------------------------------------------------------------------------- 1 | q2 8 0 1 + >n2 4 3 4 0 0 255 2 | q2 8 1 4 + >n4 3 0 3 0 0 255 3 | q2 8 1 4 + >n4 3 0 3 0 0 255 4 | q2 8 4 8 + >n5 4 0 4 0 0 255 5 | q1 8 1 3 + >n3 3 0 2 0 0 255 6 | q1 8 5 7 + >n5 4 1 3 0 0 255 7 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/test/input/anchors-1.gaf: -------------------------------------------------------------------------------- 1 | q1 8 0 1 + >n4 3 0 1 0 0 255 2 | q1 8 7 8 + >n4 3 0 1 0 0 255 3 | q1 8 1 3 + >n2 4 1 3 0 0 255 4 | q1 8 1 3 + >n3 3 0 2 0 0 255 5 | q1 8 1 3 + >n5 4 1 3 0 0 255 6 | q1 8 3 5 + >n2 4 1 3 0 0 255 7 | q1 8 3 5 + >n3 3 0 2 0 0 255 8 | q1 8 3 5 + >n5 4 1 3 0 0 255 9 | q1 8 5 7 + >n2 4 1 3 0 0 255 10 | q1 8 5 7 + >n3 3 0 2 0 0 255 11 | q1 8 5 7 + >n5 4 1 3 0 0 255 12 | q2 8 0 4 + >n2>n4 7 3 7 0 0 255 13 | q2 8 1 8 + >n4>n5 7 0 7 0 0 255 14 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/test/input/graph.gfa: -------------------------------------------------------------------------------- 1 | M 2 12 2 | X 1 2 6 9 3 | B 1 1 2 1 4 | S n1 A 5 | S n2 TACT 6 | S n3 ACT 7 | S n4 TTT 8 | S n5 AACT 9 | L n1 + n2 + 0M 10 | L n2 + n3 + 0M 11 | L n2 + n4 + 0M 12 | L n3 + n5 + 0M 13 | L n4 + n5 + 0M 14 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/test/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # 
https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 4 | outputfolder=$thisfolder/output-$(date -Iminutes) 5 | logfile=$outputfolder/log 6 | chainxblockgraph=$thisfolder/../chainx-block-graph 7 | 8 | mkdir $outputfolder 9 | echo -n > $logfile 10 | 11 | for testfile in $thisfolder/input/anchors-1.gaf 12 | do 13 | echo "$testfile : " >> $logfile 14 | basename=$(basename $testfile) 15 | outfileg=$outputfolder/${basename%.*}-global.gaf 16 | correctg=$thisfolder/correctoutput/${basename%.*}-global.gaf 17 | $chainxblockgraph --unsorted-input --global $thisfolder/input/graph.gfa $testfile $outfileg \ 18 | >> $logfile 2>> $logfile 19 | diff <(sort $outfileg) <(sort $correctg) > /dev/null 2>/dev/null 20 | exitcode=$? ; if [ $exitcode -ne 0 ] ; then 21 | echo "Test failed on file $testfile!" | tee -a $logfile 22 | exit 1 23 | fi 24 | 25 | outfilesg=$outputfolder/${basename%.*}-semi-global.gaf 26 | correctsg=$thisfolder/correctoutput/${basename%.*}-semi-global.gaf 27 | 28 | $chainxblockgraph --unsorted-input --semi-global $thisfolder/input/graph.gfa $testfile $outfilesg \ 29 | >> $logfile 2>> $logfile 30 | 31 | diff <(sort $outfilesg) <(sort $correctsg) > /dev/null 2>/dev/null 32 | exitcode=$? ; if [ $exitcode -ne 0 ] ; then 33 | echo "Test failed on file $testfile!" 
/// Read the whole file at `filename` and return its lines as owned
/// `String`s (line terminators stripped by `BufRead::lines`).
/// Panics if the file cannot be opened ("no such file") or if a line
/// is not valid UTF-8 ("Could not parse line").
fn lines_from_file(filename: impl AsRef<Path>) -> Vec<String> {
    let reader = BufReader::new(File::open(filename).expect("no such file"));
    reader
        .lines()
        .map(|line| line.expect("Could not parse line"))
        .collect()
}
31 | let mut printed = false; 32 | for (read,readid) in reads.into_iter().zip(rids.into_iter()) { 33 | let readlen = read.len(); 34 | let it = automaton.find_overlapping_iter(read); 35 | 36 | for arg in it { 37 | if !printed { 38 | printed = true; 39 | } else { 40 | print!("\n"); 41 | } 42 | print!("{}\t{}\t{}\t{}\t+\t>{}\t{}\t0\t{}\t0\t0\t255", readid, readlen, arg.start(), arg.end(), ids[arg.value()], nodeslen[arg.value()], nodeslen[arg.value()]); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /tools/efg-ahocorasick/src/efg-locate.hpp: -------------------------------------------------------------------------------- 1 | #ifndef EFG_LOCATE_HPP 2 | #define EFG_LOCATE_HPP 3 | 4 | #include 5 | #include 6 | 7 | using std::string, std::ifstream, std::ofstream; 8 | 9 | struct Params { 10 | ifstream graphfs; 11 | ifstream patternsfs; 12 | ofstream outputfs; 13 | string ignorechars; 14 | bool reversecompl; 15 | int threads; 16 | int mincoverage; 17 | bool reportstats; 18 | bool renamereversecomplement; 19 | bool splitoutputmatches; 20 | bool splitoutputmatchesgraphaligner; 21 | int edgemincount; 22 | bool edgemincountheuristic; 23 | }; 24 | #endif 25 | -------------------------------------------------------------------------------- /tools/efg-ahocorasick/src/extractor.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "efg.hpp" 8 | 9 | using namespace std; 10 | 11 | int main(int argc, char * argv[]) 12 | { 13 | if (argc < 2) 14 | { 15 | cout << "usage: " << argv[0] << " graph.gfa" << std::endl; 16 | return 1; 17 | } 18 | 19 | // open graph file 20 | std::filesystem::path graphpath {argv[1]}; 21 | std::ifstream graphfs = std::ifstream {graphpath}; 22 | if (!graphfs) {std::cerr << "Error opening graph file " << graphpath << "." 
<< std::endl; exit(1);}; 23 | 24 | Elasticfoundergraph graph(graphfs); 25 | vector is_source(graph.ordered_node_ids.size() + 1, true); 26 | vector is_sink(graph.ordered_node_ids.size() + 1, true); 27 | for (int i = 0; i < graph.ordered_node_ids.size(); i++) { 28 | for (int j : graph.edges[i]) { 29 | is_sink[i] = false; 30 | is_source[j] = false; 31 | } 32 | } 33 | 34 | for (int i = 0; i < graph.ordered_node_ids.size(); i++) { 35 | if (!is_source[i] and !is_sink[i]) { 36 | std::cout << graph.ordered_node_labels[i] << "\n"; 37 | std::cerr << graph.ordered_node_ids[i] << "\n"; 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /tools/efg-gaf-splitter/Makefile: -------------------------------------------------------------------------------- 1 | CPPFLAGS=-Ofast -march=native --std=c++20 2 | #CPPFLAGS=-g -O0 --std=c++20 3 | 4 | all : efg-gaf-splitter 5 | 6 | efg-gaf-splitter : efg-gaf-splitter.cpp efg.hpp command-line-parsing/cmdline.h command-line-parsing/cmdline.c 7 | g++ $(CPPFLAGS) \ 8 | efg-gaf-splitter.cpp command-line-parsing/cmdline.c \ 9 | -o efg-gaf-splitter 10 | 11 | # uncomment for development 12 | #command-line-parsing/cmdline%c command-line-parsing/cmdline%h : command-line-parsing/config.ggo 13 | # gengetopt \ 14 | # --input=./command-line-parsing/config.ggo \ 15 | # --output-dir=./command-line-parsing/ \ 16 | # --unnamed-opts 17 | 18 | .PHONY : clean all cleanall 19 | 20 | # uncomment for development 21 | #clean : 22 | # rm -Rf command-line-parsing/cmdline.c command-line-parsing/cmdline.h 23 | #cleanall : 24 | # rm -Rf efg-gaf-splitter command-line-parsing/cmdline.c command-line-parsing/cmdline.h 25 | cleanall : 26 | rm -Rf efg-gaf-splitter 27 | -------------------------------------------------------------------------------- /tools/efg-gaf-splitter/README.md: -------------------------------------------------------------------------------- 1 | # efg-gaf-splitter 2 | Program to split GAF exact matches 
into node matches that are valid GraphAligner seeds and, if requested, flip them to the reverse complement representation.
*/ 46 | unsigned int version_given ; /**< @brief Whether version was given. */ 47 | unsigned int sort_given ; /**< @brief Whether sort was given. */ 48 | 49 | char **inputs ; /**< @brief unnamed options (options without names) */ 50 | unsigned inputs_num ; /**< @brief unnamed options number */ 51 | } ; 52 | 53 | /** @brief The additional parameters to pass to parser functions */ 54 | struct cmdline_parser_params 55 | { 56 | int override; /**< @brief whether to override possibly already present options (default 0) */ 57 | int initialize; /**< @brief whether to initialize the option structure gengetopt_args_info (default 1) */ 58 | int check_required; /**< @brief whether to check that all required options were provided (default 1) */ 59 | int check_ambiguity; /**< @brief whether to check for options already specified in the option structure gengetopt_args_info (default 0) */ 60 | int print_errors; /**< @brief whether getopt_long should print an error message for a bad option (default 1) */ 61 | } ; 62 | 63 | /** @brief the purpose string of the program */ 64 | extern const char *gengetopt_args_info_purpose; 65 | /** @brief the usage string of the program */ 66 | extern const char *gengetopt_args_info_usage; 67 | /** @brief the description string of the program */ 68 | extern const char *gengetopt_args_info_description; 69 | /** @brief all the lines making the help output */ 70 | extern const char *gengetopt_args_info_help[]; 71 | 72 | /** 73 | * The command line parser 74 | * @param argc the number of command line options 75 | * @param argv the command line options 76 | * @param args_info the structure where option information will be stored 77 | * @return 0 if everything went fine, NON 0 if an error took place 78 | */ 79 | int cmdline_parser (int argc, char **argv, 80 | struct gengetopt_args_info *args_info); 81 | 82 | /** 83 | * The command line parser (version with additional parameters - deprecated) 84 | * @param argc the number of command line options 85 | * 
@param argv the command line options 86 | * @param args_info the structure where option information will be stored 87 | * @param override whether to override possibly already present options 88 | * @param initialize whether to initialize the option structure my_args_info 89 | * @param check_required whether to check that all required options were provided 90 | * @return 0 if everything went fine, NON 0 if an error took place 91 | * @deprecated use cmdline_parser_ext() instead 92 | */ 93 | int cmdline_parser2 (int argc, char **argv, 94 | struct gengetopt_args_info *args_info, 95 | int override, int initialize, int check_required); 96 | 97 | /** 98 | * The command line parser (version with additional parameters) 99 | * @param argc the number of command line options 100 | * @param argv the command line options 101 | * @param args_info the structure where option information will be stored 102 | * @param params additional parameters for the parser 103 | * @return 0 if everything went fine, NON 0 if an error took place 104 | */ 105 | int cmdline_parser_ext (int argc, char **argv, 106 | struct gengetopt_args_info *args_info, 107 | struct cmdline_parser_params *params); 108 | 109 | /** 110 | * Save the contents of the option struct into an already open FILE stream. 111 | * @param outfile the stream where to dump options 112 | * @param args_info the option struct to dump 113 | * @return 0 if everything went fine, NON 0 if an error took place 114 | */ 115 | int cmdline_parser_dump(FILE *outfile, 116 | struct gengetopt_args_info *args_info); 117 | 118 | /** 119 | * Save the contents of the option struct into a (text) file. 
120 | * This file can be read by the config file parser (if generated by gengetopt) 121 | * @param filename the file where to save 122 | * @param args_info the option struct to save 123 | * @return 0 if everything went fine, NON 0 if an error took place 124 | */ 125 | int cmdline_parser_file_save(const char *filename, 126 | struct gengetopt_args_info *args_info); 127 | 128 | /** 129 | * Print the help 130 | */ 131 | void cmdline_parser_print_help(void); 132 | /** 133 | * Print the version 134 | */ 135 | void cmdline_parser_print_version(void); 136 | 137 | /** 138 | * Initializes all the fields a cmdline_parser_params structure 139 | * to their default values 140 | * @param params the structure to initialize 141 | */ 142 | void cmdline_parser_params_init(struct cmdline_parser_params *params); 143 | 144 | /** 145 | * Allocates dynamically a cmdline_parser_params structure and initializes 146 | * all its fields to their default values 147 | * @return the created and initialized cmdline_parser_params structure 148 | */ 149 | struct cmdline_parser_params *cmdline_parser_params_create(void); 150 | 151 | /** 152 | * Initializes the passed gengetopt_args_info structure's fields 153 | * (also set default values for options that have a default) 154 | * @param args_info the structure to initialize 155 | */ 156 | void cmdline_parser_init (struct gengetopt_args_info *args_info); 157 | /** 158 | * Deallocates the string fields of the gengetopt_args_info structure 159 | * (but does not deallocate the structure itself) 160 | * @param args_info the structure to deallocate 161 | */ 162 | void cmdline_parser_free (struct gengetopt_args_info *args_info); 163 | 164 | /** 165 | * Checks that all the required options were specified 166 | * @param args_info the structure to check 167 | * @param prog_name the name of the program that will be used to print 168 | * possible errors 169 | * @return 170 | */ 171 | int cmdline_parser_required (struct gengetopt_args_info *args_info, 172 | const 
char *prog_name); 173 | 174 | 175 | #ifdef __cplusplus 176 | } 177 | #endif /* __cplusplus */ 178 | #endif /* CMDLINE_H */ 179 | -------------------------------------------------------------------------------- /tools/efg-gaf-splitter/command-line-parsing/config.ggo: -------------------------------------------------------------------------------- 1 | version "devel" 2 | package "efg-gaf-splitter" 3 | purpose "Program to split GAF exact matches into node matches that are valid GraphAligner seeds and eventually flip to the reverse complement representation" 4 | usage "efg-gaf-splitter graph.gfa seeds.gaf" 5 | 6 | description "The program takes in input a GFA graph and a set of GAF exact matches, and outputs in stdout the matches split into node matches, filtering node matches of length 1. If the query id in the GAF entries starts with prefix 'rev_', the match is considered to be between the reverse complement of the read and the graph: such prefix is removed and the GAF entries in output are flipped to be between the forward strand of the read and the reverse complement nodes of the graph." 
7 | 8 | option "sort" - "Gather and sort the anchors by read" flag off 9 | -------------------------------------------------------------------------------- /tools/efg-gaf-splitter/efg-gaf-splitter.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "command-line-parsing/cmdline.h" // gengetopt-generated parser 11 | #include "efg.hpp" 12 | 13 | using std::string; 14 | 15 | int main(int argc, char* argv[]) 16 | { 17 | gengetopt_args_info argsinfo; 18 | if (cmdline_parser(argc, argv, &argsinfo) != 0) exit(1); 19 | 20 | if (argsinfo.inputs_num == 0) 21 | {cmdline_parser_print_help(); exit(1);}; 22 | if (argsinfo.inputs_num == 1) 23 | {std::cerr << argv[0] << ": missing GAF file" << std::endl; exit(1);}; 24 | if (argsinfo.inputs_num > 2) 25 | {std::cerr << argv[0] << ": too many arguments" << std::endl; exit(1);}; 26 | 27 | // open files 28 | std::filesystem::path graphpath {argsinfo.inputs[0]}; 29 | std::ifstream graphfs {graphpath}; 30 | if (!graphfs) {std::cerr << "Error opening graph file " << graphpath << "." << std::endl; exit(1);}; 31 | 32 | std::filesystem::path gafpath {argsinfo.inputs[1]}; 33 | std::ifstream gaffs {gafpath}; 34 | if (!gaffs) {std::cerr << "Error opening GAF file " << gafpath << "." << std::endl; exit(1);}; 35 | 36 | std::cerr << "Reading the graph..." << std::flush; 37 | Elasticfoundergraph graph(graphfs); 38 | std::cerr << " done." << std::endl; 39 | 40 | if (argsinfo.sort_flag) { 41 | std::cerr << "Reading the seeds..." << std::flush; 42 | vector> seeds = read_gaf(gaffs, graph); 43 | std::cerr << " done." << std::endl; 44 | 45 | std::cerr << "Splitting the seeds..." 
<< std::flush; 46 | for (auto &patternseeds : seeds) { 47 | for (auto &a : patternseeds) { 48 | for (auto &b : a.split_single_graphaligner(graph)) { 49 | if (b.get_query_id().find("rev_") != std::string::npos) { 50 | b.reverse(); 51 | } 52 | std::cout << b.to_string(graph) << std::endl; 53 | } 54 | } 55 | } 56 | std::cerr << " done." << std::endl; 57 | } else { 58 | std::cerr << "Reading and splitting the seeds..." << std::flush; 59 | GAFAnchor seed; 60 | while (read_gaf_single(gaffs, graph, seed)) { 61 | for (auto &b : seed.split_single_graphaligner(graph)) { 62 | if (b.get_query_id().find("rev_") != std::string::npos) { 63 | b.reverse(); 64 | } 65 | std::cout << b.to_string(graph) << std::endl; 66 | } 67 | } 68 | std::cerr << " done." << std::endl; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /tools/efg-locate/Makefile: -------------------------------------------------------------------------------- 1 | CPPFLAGS=-Ofast -march=native --std=c++20 -pthread 2 | #CPPFLAGS=-g -O0 --std=c++20 -pthread 3 | HEADERS=-I ../sdsl-lite-v3/include -I ../concurrentqueue 4 | 5 | all : efg-locate 6 | 7 | efg-locate : efg-locate.cpp efg.hpp algo.cpp command-line-parsing/cmdline.h command-line-parsing/cmdline.c 8 | g++ $(CPPFLAGS) $(HEADERS) \ 9 | efg-locate.cpp command-line-parsing/cmdline.c \ 10 | -o efg-locate 11 | 12 | # uncomment for development 13 | #command-line-parsing/cmdline%c command-line-parsing/cmdline%h : command-line-parsing/config.ggo 14 | # gengetopt \ 15 | # --input=./command-line-parsing/config.ggo \ 16 | # --output-dir=./command-line-parsing/ \ 17 | # --unnamed-opts 18 | 19 | .PHONY : clean all cleanall 20 | 21 | #clean : 22 | # rm -Rf command-line-parsing/cmdline.c command-line-parsing/cmdline.h 23 | #cleanall : 24 | # rm -Rf efg-locate command-line-parsing/cmdline.c command-line-parsing/cmdline.h 25 | cleanall : 26 | rm -Rf efg-locate 27 | 
-------------------------------------------------------------------------------- /tools/efg-locate/README.md: -------------------------------------------------------------------------------- 1 | # efg-locate 2 | Perform exact or approximate pattern matching on Elastic Founder Graphs. 3 | 4 | ## TODO 5 | - documentation 6 | - investigate I/O bottleneck with high thread number (short-read-exact-match experiment) 7 | - more tests 8 | -------------------------------------------------------------------------------- /tools/efg-locate/command-line-parsing/config.ggo: -------------------------------------------------------------------------------- 1 | version "0.1" 2 | package "efg-locate" 3 | purpose "Program to perform exact and approximate pattern matching on indexable Elastic Founder Graphs." 4 | usage "efg-locate graph.gfa patterns.fasta {paths.gaf,seeds.gaf}" 5 | 6 | description "The program takes in input an indexable Elastic Founder Graph (xGFA) and a set of patterns in FASTA format. In normal mode, the program searches for an exact occurrence of the patterns in the graph, the output is in GFA path format, and the exit value is 0 if all patterns occur and 1 otherwise. In approximate mode (--approximate), the program greedily searches for semi-repeat-free seeds between the patterns and the graph, and the output is in GAF format." 
7 | 8 | option "ignore-chars" - "Ignore these characters for the indexability property/pattern matching, breaking up each pattern into maximal strings of non-ignore characters" string optional 9 | option "approximate" - "Approximate pattern matching by greedily matching the pattern in the graph and starting over when the matching fails; output only the recognized matches spanning at least a full node" flag off 10 | option "approximate-edge-match-min-count" - "Consider any approximate occurrence valid if the pattern substring occurs at most COUNT times in the edges" int typestr = "COUNT" default = "0" optional 11 | option "approximate-edge-match-longest" - "Consider the COUNT longest substrings of the pattern appearing in the edges valid" int typestr = "COUNT" default = "0" optional 12 | option "approximate-edge-match-longest-max-count" - "Consider the COUNT longest substrings valid only if they appear less than N times in the edges" int typestr = "COUNT" default = "1000" optional 13 | option "approximate-min-coverage" - "Consider approximate occurrences as valid if they cover at least PERC % of the pattern" int typestr = "PERC" default = "0" optional hidden 14 | option "approximate-stats" - "Output statistics for each read in stdout" flag off 15 | option "reverse-complement" - "Match also the reverse complement of the patterns and output the results as a reverse graph path" flag off 16 | option "rename-reverse-complement" - "When matching the reverse complement of patterns, consider them as distinct patterns by prepending 'rev_' to their names" flag off 17 | option "split-output-matches" - "In approximate mode (--approximate), split long matches into node matches" flag off 18 | option "split-output-matches-graphaligner" - "Same as --split-output-matches, but filter out node matches of length 1 (for use with GraphAligner --extend)" flag off 19 | option "split-keep-edge-matches" - "In approximate mode and using option --split-output-matches or
--split-output-matches-graphaligner, do not split edge matches" flag off 20 | option "threads" t "Number of compute threads" long typestr = "THREADNUM" default = "-1" optional 21 | option "overwrite" - "Overwrite the output file, if it exists" flag off 22 | -------------------------------------------------------------------------------- /tools/efg-locate/efg-locate.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "efg-locate.hpp" 11 | #include "command-line-parsing/cmdline.h" // gengetopt-generated parser 12 | #include "efg.hpp" 13 | #include "algo.cpp" 14 | 15 | //#define LOCATE_DEBUG 16 | 17 | using namespace efg_locate; 18 | using std::string, std::max; 19 | 20 | int main(int argc, char* argv[]) 21 | { 22 | gengetopt_args_info argsinfo; 23 | if (cmdline_parser(argc, argv, &argsinfo) != 0) exit(1); 24 | 25 | if (argsinfo.inputs_num == 0) 26 | {cmdline_parser_print_help(); exit(1);}; 27 | if (argsinfo.inputs_num == 1) 28 | {std::cerr << argv[0] << ": missing patterns file" << std::endl; exit(1);}; 29 | if (argsinfo.inputs_num == 2) 30 | {std::cerr << argv[0] << ": missing output file" << std::endl; exit(1);}; 31 | if (argsinfo.inputs_num > 3) 32 | {std::cerr << argv[0] << ": too many arguments" << std::endl; exit(1);}; 33 | 34 | Params params; 35 | params.ignorechars = ((argsinfo.ignore_chars_arg != NULL) ? 
string(argsinfo.ignore_chars_arg): ""); 36 | params.reversecompl = argsinfo.reverse_complement_flag; 37 | params.threads = argsinfo.threads_arg; 38 | params.mincoverage = argsinfo.approximate_min_coverage_arg; 39 | params.reportstats = argsinfo.approximate_stats_flag; 40 | params.renamereversecomplement = argsinfo.rename_reverse_complement_flag; 41 | params.splitoutputmatches = argsinfo.split_output_matches_flag; 42 | params.splitoutputmatchesgraphaligner = argsinfo.split_output_matches_graphaligner_flag; 43 | params.splitkeepedgematches = argsinfo.split_keep_edge_matches_flag; 44 | params.edgemincount = argsinfo.approximate_edge_match_min_count_arg; 45 | params.edgelongestcount = argsinfo.approximate_edge_match_longest_arg; 46 | params.edgelongestcountmax = argsinfo.approximate_edge_match_longest_max_count_arg; 47 | 48 | // open graph file 49 | std::filesystem::path graphpath {argsinfo.inputs[0]}; 50 | params.graphfs = std::ifstream {graphpath}; 51 | if (!params.graphfs) {std::cerr << "Error opening graph file " << graphpath << "." << std::endl; exit(1);}; 52 | 53 | // check and open output file 54 | std::filesystem::path outputpath {argsinfo.inputs[2]}; 55 | if (std::filesystem::exists(outputpath)) { 56 | if (argsinfo.overwrite_flag) { 57 | params.outputfs = std::ofstream(outputpath, std::ios::out | std::ios::trunc); 58 | } else { 59 | std::cerr << "Error: output file already exists." << std::endl; 60 | exit(1); 61 | } 62 | } else { 63 | params.outputfs = std::ofstream(outputpath); 64 | } 65 | if (!params.outputfs) {std::cerr << "Error opening output file " << outputpath << "." << std::endl; exit(1);}; 66 | 67 | std::cerr << "Reading the graph..." << std::flush; 68 | Elasticfoundergraph graph(params.graphfs); 69 | std::cerr << " done." << std::endl; 70 | 71 | std::cerr << "Indexing the graph..." << std::flush; 72 | graph.init_pattern_matching_support(); 73 | std::cerr << " done." 
<< std::endl; 74 | 75 | #ifdef LOCATE_DEBUG 76 | std::cerr << "DEBUG graph is " << std::endl; 77 | graph.to_stream(&std::cerr); 78 | #endif 79 | 80 | // check and open patterns file 81 | std::filesystem::path patternspath {argsinfo.inputs[1]}; 82 | params.patternsfs = std::ifstream {patternspath}; 83 | if (!params.patternsfs) {std::cerr << "Error opening patterns file " << patternspath << "." << std::endl; exit(1);}; 84 | 85 | std::atomic input_done = false; 86 | std::thread inputworker; 87 | vector pattern_ids, patterns; 88 | if ((argsinfo.approximate_flag and params.threads > 0) or (!argsinfo.approximate_flag)) { 89 | std::cerr << "Locate" << std::endl; 90 | inputworker = std::thread(reader_worker, std::ref(params.patternsfs), std::ref(input_done)); 91 | } else { 92 | std::cerr << "Reading the patterns..." << std::flush; 93 | std::tie(pattern_ids, patterns) = read_patterns(params.patternsfs); 94 | std::cerr << " done." << std::endl; 95 | } 96 | 97 | #ifdef LOCATE_DEBUG 98 | std::cerr << std::endl; 99 | for (int i = 0; i < pattern_ids.size(); i++) { 100 | cerr << "DEBUG pattern:" << pattern_ids[i] << std::endl << patterns[i] << std::endl; 101 | } 102 | #endif 103 | 104 | int returnvalue = 0; 105 | // exact pattern matching 106 | if (!argsinfo.approximate_flag) { 107 | std::atomic workers_done = false; 108 | std::thread outputworker(writer_worker, std::ref(workers_done), std::ref(params)); 109 | vector workers; 110 | for (int i = 0; i < max(1,params.threads); i++) 111 | workers.push_back(std::thread(exact_worker, std::ref(graph), std::ref(pattern_ids), std::ref(patterns), std::ref(params), std::ref(input_done))); 112 | for (int i = 0; i < workers.size(); i++) 113 | workers[i].join(); 114 | workers_done = true; 115 | inputworker.join(); 116 | outputworker.join(); 117 | // sanity check? 
118 | outputworker = std::thread(writer_worker, std::ref(workers_done), std::ref(params)); 119 | outputworker.join(); 120 | return 0; 121 | } 122 | 123 | if (argsinfo.approximate_flag) { 124 | if (params.threads > 0) { 125 | std::atomic workers_done = false; 126 | std::thread outputworker(writer_worker, std::ref(workers_done), std::ref(params)); 127 | vector workers; 128 | for (int i = 0; i < params.threads; i++) 129 | workers.push_back(std::thread(approx_worker, std::ref(graph), std::ref(pattern_ids), std::ref(patterns), std::ref(params), std::ref(input_done))); 130 | for (int i = 0; i < workers.size(); i++) 131 | workers[i].join(); 132 | workers_done = true; 133 | inputworker.join(); 134 | outputworker.join(); 135 | // sanity check? 136 | outputworker = std::thread(writer_worker, std::ref(workers_done), std::ref(params)); 137 | outputworker.join(); 138 | } else { 139 | for (int p = 0; p < patterns.size(); p++) { 140 | vector matches; 141 | 142 | if (approx_efg_backward_search(graph, pattern_ids[p], patterns[p], params, matches) != 0) { 143 | if (params.splitoutputmatches) 144 | anchors_to_stream_split_single(¶ms.outputfs, graph, matches, params.splitkeepedgematches); 145 | else if (params.splitoutputmatchesgraphaligner) 146 | anchors_to_stream_split_single_graphaligner(¶ms.outputfs, graph, matches, params.splitkeepedgematches); 147 | else 148 | anchors_to_stream(¶ms.outputfs, graph, matches); 149 | } else { 150 | cerr << "Cannot find any semi-repeat-free match of " << pattern_ids[p] << std::endl; 151 | } 152 | } 153 | } 154 | 155 | return 0; 156 | } 157 | 158 | cerr << "Mode not implemented!" 
<< std::endl; 159 | return 1; 160 | } 161 | -------------------------------------------------------------------------------- /tools/efg-locate/efg-locate.hpp: -------------------------------------------------------------------------------- 1 | #ifndef EFG_LOCATE_HPP 2 | #define EFG_LOCATE_HPP 3 | 4 | #include 5 | #include 6 | 7 | using std::string, std::ifstream, std::ofstream; 8 | 9 | namespace efg_locate { 10 | struct Params { 11 | ifstream graphfs; 12 | ifstream patternsfs; 13 | ofstream outputfs; 14 | string ignorechars; 15 | bool reversecompl; 16 | int threads; 17 | int mincoverage; 18 | bool reportstats; 19 | bool renamereversecomplement; 20 | bool splitoutputmatches; 21 | bool splitoutputmatchesgraphaligner; 22 | bool splitkeepedgematches; 23 | int edgemincount; 24 | int edgelongestcount; 25 | int edgelongestcountmax; 26 | }; 27 | } 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /tools/efg-locate/test/inputs/indels.gfa: -------------------------------------------------------------------------------- 1 | M 2 36 2 | X 1 4 7 11 15 19 22 26 30 34 3 | B 1 1 2 1 1 1 2 1 1 2 4 | S 0 TAC 5 | S 1 AGT 6 | L 0 + 1 + 0M 7 | S 2 GAA 8 | S 3 GAAA 9 | L 1 + 2 + 0M 10 | L 1 + 3 + 0M 11 | S 4 CAAT 12 | L 2 + 4 + 0M 13 | L 3 + 4 + 0M 14 | S 5 GCTA 15 | L 4 + 5 + 0M 16 | S 6 GGG 17 | L 5 + 6 + 0M 18 | S 7 AGA 19 | S 8 AGAG 20 | L 6 + 7 + 0M 21 | L 6 + 8 + 0M 22 | S 9 GCTG 23 | L 7 + 9 + 0M 24 | L 8 + 9 + 0M 25 | S 10 CCTA 26 | L 9 + 10 + 0M 27 | S 11 TAT 28 | S 12 TT 29 | L 10 + 11 + 0M 30 | L 10 + 12 + 0M 31 | -------------------------------------------------------------------------------- /tools/efg-locate/test/inputs/indels_five_nodes.fasta: -------------------------------------------------------------------------------- 1 | >one 2 | ACAATGCTAGGGAGA 3 | >two 4 | TACAGTGAACAATGCTA 5 | >three 6 | GAGAGGCTGCCTATT 7 | -------------------------------------------------------------------------------- 
/tools/efg-locate/test/inputs/tcs_fig_5.gfa: -------------------------------------------------------------------------------- 1 | M 4 9 2 | X 1 4 7 9 3 | B 2 2 3 1 4 | S 0 AA 5 | S 1 ACC 6 | S 2 TTC 7 | S 3 TA 8 | L 0 + 2 + 0M 9 | L 0 + 3 + 0M 10 | L 1 + 3 + 0M 11 | S 4 CA 12 | S 5 G 13 | S 6 GC 14 | L 2 + 4 + 0M 15 | L 3 + 5 + 0M 16 | L 3 + 6 + 0M 17 | S 7 C 18 | L 4 + 7 + 0M 19 | L 5 + 7 + 0M 20 | L 6 + 7 + 0M 21 | P seq1 0+,2+,4+,7+ * 22 | P seq2 0+,3+,5+,7+ * 23 | P seq3 1+,3+,5+,7+ * 24 | P seq4 0+,3+,6+,7+ * 25 | -------------------------------------------------------------------------------- /tools/efg-locate/test/inputs/tcs_fig_5_approximate.fasta: -------------------------------------------------------------------------------- 1 | >seq 2 | AATTCCACGGGGGGGGGGGGGGGGGGGGGGGGGGGGAATAGC 3 | -------------------------------------------------------------------------------- /tools/efg-locate/test/inputs/tcs_fig_5_edge.fasta: -------------------------------------------------------------------------------- 1 | >edgeoccurrence1 2 | TTC 3 | >edgeoccurrence2 4 | ATT 5 | >edgeoccurrence3 6 | TAGC 7 | >edgeoccurrence4 8 | ACCTA 9 | -------------------------------------------------------------------------------- /tools/efg-locate/test/inputs/tcs_fig_5_four_nodes.fasta: -------------------------------------------------------------------------------- 1 | >fournode1 2 | AATAGCC 3 | >fournode2 4 | CTAGCC 5 | >fournode3 6 | ATTCCAC 7 | -------------------------------------------------------------------------------- /tools/efg-locate/test/inputs/tcs_fig_5_three_nodes.fasta: -------------------------------------------------------------------------------- 1 | >threenode1 2 | ATAGC 3 | >threenode2 4 | CCAC 5 | -------------------------------------------------------------------------------- /tools/efg-locate/test/outputs/indels_five_nodes.gfa: -------------------------------------------------------------------------------- 1 | one 15 0 15 + >3>4>5>6>7 18 3 18 0 0 255 2 | two 17 0 17 
+ >0>1>2>4>5 17 0 17 0 0 255 3 | three 15 0 15 + >6>8>9>10>12 17 2 17 0 0 255 4 | -------------------------------------------------------------------------------- /tools/efg-locate/test/outputs/tcs_fig_5_approximate.fasta: -------------------------------------------------------------------------------- 1 | seq 42 0 8 + >0>2>4>7 8 0 8 0 0 255 2 | seq 42 36 42 + >0>3>6 6 0 6 0 0 255 3 | -------------------------------------------------------------------------------- /tools/efg-locate/test/outputs/tcs_fig_5_edge.gfa: -------------------------------------------------------------------------------- 1 | edgeoccurrence1 3 0 3 + >2 3 0 3 0 0 255 2 | edgeoccurrence1 3 0 3 + >2 3 0 3 0 0 255 3 | edgeoccurrence2 3 0 3 + >0>2 5 1 4 0 0 255 4 | edgeoccurrence3 4 0 4 + >3>6 4 0 4 0 0 255 5 | edgeoccurrence4 5 0 5 + >1>3 5 0 5 0 0 255 6 | -------------------------------------------------------------------------------- /tools/efg-locate/test/outputs/tcs_fig_5_four_nodes.gfa: -------------------------------------------------------------------------------- 1 | fournode1 7 0 7 + >0>3>6>7 7 0 7 0 0 255 2 | fournode2 6 0 6 + >1>3>6>7 8 2 8 0 0 255 3 | fournode3 7 0 7 + >0>2>4>7 8 1 8 0 0 255 4 | -------------------------------------------------------------------------------- /tools/efg-locate/test/outputs/tcs_fig_5_three_nodes.gfa: -------------------------------------------------------------------------------- 1 | threenode1 5 0 5 + >0>3>6 6 1 6 0 0 255 2 | threenode2 4 0 4 + >2>4>7 6 2 6 0 0 255 3 | -------------------------------------------------------------------------------- /tools/efg-locate/test/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # barebones testing pipeline that matches the test files with the correct 3 | # outputs as specified by the following arrays 4 | locate=("tcs_fig_5.gfa tcs_fig_5_edge.fasta tcs_fig_5_edge.gfa" 5 | "tcs_fig_5.gfa tcs_fig_5_three_nodes.fasta tcs_fig_5_three_nodes.gfa" 6 | 
"tcs_fig_5.gfa tcs_fig_5_four_nodes.fasta tcs_fig_5_four_nodes.gfa" 7 | "indels.gfa indels_five_nodes.fasta indels_five_nodes.gfa") 8 | 9 | approximate=("tcs_fig_5.gfa tcs_fig_5_approximate.fasta tcs_fig_5_approximate.fasta") 10 | 11 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 12 | outputfolder=$thisfolder/output-$(date -Iminutes) 13 | logfile=$outputfolder/log 14 | efglocate=$thisfolder/../efg-locate 15 | 16 | mkdir $outputfolder 17 | echo -n > $logfile 18 | 19 | for testfile in "${locate[@]}" 20 | do 21 | graph=$thisfolder/inputs/$(echo "$testfile" | tr -s " " | cut -d' ' -f1) 22 | patterns=$thisfolder/inputs/$(echo "$testfile" | tr -s " " | cut -d' ' -f2) 23 | correct=$thisfolder/outputs/$(echo "$testfile" | tr -s " " | cut -d' ' -f3) 24 | 25 | patternsbasename=$(basename $patterns) 26 | output=$outputfolder/${patternsbasename%.*}.gfa 27 | 28 | echo "$efglocate $graph $patterns $output" >> $logfile 29 | $efglocate $graph $patterns $output >> $logfile 2>> $logfile 30 | diff $output $correct > /dev/null 2>/dev/null 31 | 32 | exitcode=$? ; if [ $exitcode -ne 0 ] ; then 33 | echo "Test failed for files $graph $patterns $correct!" | tee -a $logfile 34 | exit 1 35 | fi 36 | done 37 | 38 | for testfile in "${approximate[@]}" 39 | do 40 | graph=$thisfolder/inputs/$(echo "$testfile" | tr -s " " | cut -d' ' -f1) 41 | patterns=$thisfolder/inputs/$(echo "$testfile" | tr -s " " | cut -d' ' -f2) 42 | correct=$thisfolder/outputs/$(echo "$testfile" | tr -s " " | cut -d' ' -f3) 43 | 44 | patternsbasename=$(basename $patterns) 45 | output=$outputfolder/${patternsbasename%.*}.gaf 46 | 47 | echo "$efglocate $graph $patterns $output" >> $logfile 48 | $efglocate --approximate $graph $patterns $output >> $logfile 2>> $logfile 49 | diff $output $correct > /dev/null 2>/dev/null 50 | 51 | exitcode=$? 
; if [ $exitcode -ne 0 ] ; then 52 | echo "Test failed for files $graph $patterns $correct!" | tee -a $logfile 53 | exit 1 54 | fi 55 | done 56 | -------------------------------------------------------------------------------- /tools/efg-simplify/Makefile: -------------------------------------------------------------------------------- 1 | CPPFLAGS=-Ofast -march=native --std=c++20 2 | #CPPFLAGS=-O0 -g --std=c++20 3 | 4 | all : efg-simplify 5 | 6 | efg-simplify : efg-simplify.cpp command-line-parsing/cmdline.h command-line-parsing/cmdline.c 7 | g++ $(CPPFLAGS) \ 8 | efg-simplify.cpp command-line-parsing/cmdline.c \ 9 | -o efg-simplify 10 | 11 | #command-line-parsing/cmdline%c command-line-parsing/cmdline%h : command-line-parsing/config.ggo 12 | # gengetopt \ 13 | # --input=./command-line-parsing/config.ggo \ 14 | # --output-dir=./command-line-parsing/ \ 15 | # --unnamed-opts 16 | 17 | .PHONY : clean all 18 | 19 | clean : 20 | rm -Rf efg-simplify 21 | # rm -Rf efg-simplify command-line-parsing/cmdline.{c,h} 22 | -------------------------------------------------------------------------------- /tools/efg-simplify/README.md: -------------------------------------------------------------------------------- 1 | # efg-simplify 2 | Program to transform and simplify an Elastic Founder Graph given in xGFA format by merging adjacent blocks that only contain parallel paths. 3 | 4 | ``` 5 | Usage: efg-simplify inputgraph.xgfa simplifiedgraph.xgfa 6 | Program to transform and simplify an Elastic Founder Graph given in xGFA 7 | format. 8 | 9 | The program takes an Elastic Founder Graph in xGFA format and merges adjacent 10 | blocks that only contain parallel paths. 11 | 12 | -h, --help Print help and exit 13 | -V, --version Print version and exit 14 | ``` 15 | 16 | ## GFA format (xGFA) 17 | See [here](https://github.com/algbio/founderblockgraphs/blob/master/xGFAspec.md).
18 | 19 | ## known issues 20 | 21 | - as we merge blocks, paths warp a little bit: the program extends the original paths up to the first non-simplified node at the beginning and end 22 | 23 | ## todo 24 | 25 | - tests 26 | - solve the paths extension 27 | -------------------------------------------------------------------------------- /tools/efg-simplify/command-line-parsing/cmdline.h: -------------------------------------------------------------------------------- 1 | /** @file cmdline.h 2 | * @brief The header file for the command line option parser 3 | * generated by GNU Gengetopt version 2.23 4 | * http://www.gnu.org/software/gengetopt. 5 | * DO NOT modify this file, since it can be overwritten 6 | * @author GNU Gengetopt */ 7 | 8 | #ifndef CMDLINE_H 9 | #define CMDLINE_H 10 | 11 | /* If we use autoconf. */ 12 | #ifdef HAVE_CONFIG_H 13 | #include "config.h" 14 | #endif 15 | 16 | #include /* for FILE */ 17 | 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif /* __cplusplus */ 21 | 22 | #ifndef CMDLINE_PARSER_PACKAGE 23 | /** @brief the program name (used for printing errors) */ 24 | #define CMDLINE_PARSER_PACKAGE "efg-simplify" 25 | #endif 26 | 27 | #ifndef CMDLINE_PARSER_PACKAGE_NAME 28 | /** @brief the complete program name (used for help and version) */ 29 | #define CMDLINE_PARSER_PACKAGE_NAME "efg-simplify" 30 | #endif 31 | 32 | #ifndef CMDLINE_PARSER_VERSION 33 | /** @brief the program version */ 34 | #define CMDLINE_PARSER_VERSION "devel" 35 | #endif 36 | 37 | /** @brief Where the command line options are stored */ 38 | struct gengetopt_args_info 39 | { 40 | const char *help_help; /**< @brief Print help and exit help description. */ 41 | const char *version_help; /**< @brief Print version and exit help description. */ 42 | int simplify_tunnels_flag; /**< @brief for each maximal range of blocks to simplify, do not simplify first and last (default=off). 
*/ 43 | const char *simplify_tunnels_help; /**< @brief for each maximal range of blocks to simplify, do not simplify first and last help description. */ 44 | int rename_nodes_flag; /**< @brief rename all node IDs to 0-indexed integers (default=off). */ 45 | const char *rename_nodes_help; /**< @brief rename all node IDs to 0-indexed integers help description. */ 46 | int ignore_only_flag; /**< @brief consider in the simplification only blocks containing ignore characters (default=off). */ 47 | const char *ignore_only_help; /**< @brief consider in the simplification only blocks containing ignore characters help description. */ 48 | char * ignore_chars_arg; /**< @brief Ignore characters. */ 49 | char * ignore_chars_orig; /**< @brief Ignore characters original value given at command line. */ 50 | const char *ignore_chars_help; /**< @brief Ignore characters help description. */ 51 | int overwrite_flag; /**< @brief overwrite the output file, if it exists (default=off). */ 52 | const char *overwrite_help; /**< @brief overwrite the output file, if it exists help description. */ 53 | 54 | unsigned int help_given ; /**< @brief Whether help was given. */ 55 | unsigned int version_given ; /**< @brief Whether version was given. */ 56 | unsigned int simplify_tunnels_given ; /**< @brief Whether simplify-tunnels was given. */ 57 | unsigned int rename_nodes_given ; /**< @brief Whether rename-nodes was given. */ 58 | unsigned int ignore_only_given ; /**< @brief Whether ignore-only was given. */ 59 | unsigned int ignore_chars_given ; /**< @brief Whether ignore-chars was given. */ 60 | unsigned int overwrite_given ; /**< @brief Whether overwrite was given. 
*/ 61 | 62 | char **inputs ; /**< @brief unnamed options (options without names) */ 63 | unsigned inputs_num ; /**< @brief unnamed options number */ 64 | } ; 65 | 66 | /** @brief The additional parameters to pass to parser functions */ 67 | struct cmdline_parser_params 68 | { 69 | int override; /**< @brief whether to override possibly already present options (default 0) */ 70 | int initialize; /**< @brief whether to initialize the option structure gengetopt_args_info (default 1) */ 71 | int check_required; /**< @brief whether to check that all required options were provided (default 1) */ 72 | int check_ambiguity; /**< @brief whether to check for options already specified in the option structure gengetopt_args_info (default 0) */ 73 | int print_errors; /**< @brief whether getopt_long should print an error message for a bad option (default 1) */ 74 | } ; 75 | 76 | /** @brief the purpose string of the program */ 77 | extern const char *gengetopt_args_info_purpose; 78 | /** @brief the usage string of the program */ 79 | extern const char *gengetopt_args_info_usage; 80 | /** @brief the description string of the program */ 81 | extern const char *gengetopt_args_info_description; 82 | /** @brief all the lines making the help output */ 83 | extern const char *gengetopt_args_info_help[]; 84 | 85 | /** 86 | * The command line parser 87 | * @param argc the number of command line options 88 | * @param argv the command line options 89 | * @param args_info the structure where option information will be stored 90 | * @return 0 if everything went fine, NON 0 if an error took place 91 | */ 92 | int cmdline_parser (int argc, char **argv, 93 | struct gengetopt_args_info *args_info); 94 | 95 | /** 96 | * The command line parser (version with additional parameters - deprecated) 97 | * @param argc the number of command line options 98 | * @param argv the command line options 99 | * @param args_info the structure where option information will be stored 100 | * @param override whether to 
override possibly already present options 101 | * @param initialize whether to initialize the option structure my_args_info 102 | * @param check_required whether to check that all required options were provided 103 | * @return 0 if everything went fine, NON 0 if an error took place 104 | * @deprecated use cmdline_parser_ext() instead 105 | */ 106 | int cmdline_parser2 (int argc, char **argv, 107 | struct gengetopt_args_info *args_info, 108 | int override, int initialize, int check_required); 109 | 110 | /** 111 | * The command line parser (version with additional parameters) 112 | * @param argc the number of command line options 113 | * @param argv the command line options 114 | * @param args_info the structure where option information will be stored 115 | * @param params additional parameters for the parser 116 | * @return 0 if everything went fine, NON 0 if an error took place 117 | */ 118 | int cmdline_parser_ext (int argc, char **argv, 119 | struct gengetopt_args_info *args_info, 120 | struct cmdline_parser_params *params); 121 | 122 | /** 123 | * Save the contents of the option struct into an already open FILE stream. 124 | * @param outfile the stream where to dump options 125 | * @param args_info the option struct to dump 126 | * @return 0 if everything went fine, NON 0 if an error took place 127 | */ 128 | int cmdline_parser_dump(FILE *outfile, 129 | struct gengetopt_args_info *args_info); 130 | 131 | /** 132 | * Save the contents of the option struct into a (text) file. 
133 | * This file can be read by the config file parser (if generated by gengetopt) 134 | * @param filename the file where to save 135 | * @param args_info the option struct to save 136 | * @return 0 if everything went fine, NON 0 if an error took place 137 | */ 138 | int cmdline_parser_file_save(const char *filename, 139 | struct gengetopt_args_info *args_info); 140 | 141 | /** 142 | * Print the help 143 | */ 144 | void cmdline_parser_print_help(void); 145 | /** 146 | * Print the version 147 | */ 148 | void cmdline_parser_print_version(void); 149 | 150 | /** 151 | * Initializes all the fields a cmdline_parser_params structure 152 | * to their default values 153 | * @param params the structure to initialize 154 | */ 155 | void cmdline_parser_params_init(struct cmdline_parser_params *params); 156 | 157 | /** 158 | * Allocates dynamically a cmdline_parser_params structure and initializes 159 | * all its fields to their default values 160 | * @return the created and initialized cmdline_parser_params structure 161 | */ 162 | struct cmdline_parser_params *cmdline_parser_params_create(void); 163 | 164 | /** 165 | * Initializes the passed gengetopt_args_info structure's fields 166 | * (also set default values for options that have a default) 167 | * @param args_info the structure to initialize 168 | */ 169 | void cmdline_parser_init (struct gengetopt_args_info *args_info); 170 | /** 171 | * Deallocates the string fields of the gengetopt_args_info structure 172 | * (but does not deallocate the structure itself) 173 | * @param args_info the structure to deallocate 174 | */ 175 | void cmdline_parser_free (struct gengetopt_args_info *args_info); 176 | 177 | /** 178 | * Checks that all the required options were specified 179 | * @param args_info the structure to check 180 | * @param prog_name the name of the program that will be used to print 181 | * possible errors 182 | * @return 183 | */ 184 | int cmdline_parser_required (struct gengetopt_args_info *args_info, 185 | const 
char *prog_name); 186 | 187 | 188 | #ifdef __cplusplus 189 | } 190 | #endif /* __cplusplus */ 191 | #endif /* CMDLINE_H */ 192 | -------------------------------------------------------------------------------- /tools/efg-simplify/command-line-parsing/config.ggo: -------------------------------------------------------------------------------- 1 | version "devel" 2 | package "efg-simplify" 3 | purpose "Program to transform and simplify an Elastic Founder Graph given in xGFA format." 4 | usage "efg-simplify inputgraph.xgfa simplifiedgraph.xgfa" 5 | 6 | description "The program takes an Elastic Founder Graph in xGFA format and merges adjacent blocks that only contain parallel paths." 7 | 8 | option "simplify-tunnels" t "for each maximal range of blocks to simplify, do not simplify first and last" flag off 9 | option "rename-nodes" r "rename all node IDs to 0-indexed integers" flag off 10 | option "ignore-only" n "consider in the simplification only blocks containing ignore characters" flag off 11 | option "ignore-chars" - "Ignore characters" string optional 12 | option "overwrite" - "overwrite the output file, if it exists" flag off 13 | --------------------------------------------------------------------------------