├── .gitignore ├── .gitmodules ├── Makefile ├── README.md ├── SRFAligner ├── SRFChainer ├── docs └── workflow.png ├── efg-ahocorasickAligner ├── efg-memsAligner ├── experiments ├── aligner-evaluation │ ├── README.md │ ├── ahocorasick │ │ └── runexp.sh │ ├── chaining │ │ └── runexp.sh │ ├── environment.yml │ ├── final │ │ └── runexp.sh │ ├── input │ │ ├── .gitkeep │ │ └── covid19_100_acc.txt │ ├── mems │ │ └── runexp.sh │ ├── scripts │ │ ├── .env │ │ ├── compute_metrics.py │ │ ├── compute_summary.py │ │ ├── generate_sim_reads.py │ │ ├── run_experiment.py │ │ └── vg_pb2.py │ ├── semi-repeat-free │ │ └── runexp.sh │ ├── vg-comparison │ │ └── runexp.sh │ └── vg-unchop │ │ └── runexp.sh ├── graph-statistics │ ├── README.md │ ├── run.sh │ └── scripts │ │ ├── compute-N.sh │ │ ├── compute-bps.sh │ │ ├── compute-branching-factor.sh │ │ ├── compute-branching-nodes.sh │ │ ├── compute-choices.sh │ │ ├── compute-edges.sh │ │ ├── compute-efg-H.sh │ │ ├── compute-longest-node.sh │ │ ├── compute-nodes.sh │ │ ├── compute-paths.sh │ │ └── compute-width.sh ├── msa-validation │ ├── README.md │ └── validate.sh ├── short-read-exact-match │ ├── README.md │ ├── input │ │ └── .gitkeep │ └── runexp.sh └── vcf-to-hapl-to-efg │ ├── README.md │ └── sample-and-build-efg-heuristic.sh ├── test ├── graph1.gfa ├── graph2.gfa ├── graph3.gfa ├── read1.fastq ├── read2.fastq └── read3.fastq └── tools ├── ChainX-block-graph ├── Makefile ├── README.md ├── chaining.hpp ├── chaining_hpp_license.txt ├── chainx-block-graph.cpp ├── chainx-block-graph.hpp ├── command-line-parsing │ ├── cmdline.c │ ├── cmdline.h │ └── config.ggo ├── efg.hpp └── test │ ├── correctoutput │ ├── anchors-1-global.gaf │ └── anchors-1-semi-global.gaf │ ├── input │ ├── anchors-1.gaf │ └── graph.gfa │ └── test.sh ├── efg-ahocorasick ├── Makefile └── src │ ├── efg-ahocorasick.rs │ ├── efg-locate.hpp │ ├── efg.hpp │ └── extractor.cpp ├── efg-gaf-splitter ├── Makefile ├── README.md ├── command-line-parsing │ ├── cmdline.c │ ├── cmdline.h │ 
└── config.ggo ├── efg-gaf-splitter.cpp └── efg.hpp ├── efg-locate ├── Makefile ├── README.md ├── algo.cpp ├── command-line-parsing │ ├── cmdline.c │ ├── cmdline.h │ └── config.ggo ├── efg-locate.cpp ├── efg-locate.hpp ├── efg.hpp └── test │ ├── inputs │ ├── indels.gfa │ ├── indels_five_nodes.fasta │ ├── tcs_fig_5.gfa │ ├── tcs_fig_5_approximate.fasta │ ├── tcs_fig_5_edge.fasta │ ├── tcs_fig_5_four_nodes.fasta │ └── tcs_fig_5_three_nodes.fasta │ ├── outputs │ ├── indels_five_nodes.gfa │ ├── tcs_fig_5_approximate.fasta │ ├── tcs_fig_5_edge.gfa │ ├── tcs_fig_5_four_nodes.gfa │ └── tcs_fig_5_three_nodes.gfa │ └── test.sh └── efg-simplify ├── Makefile ├── README.md ├── command-line-parsing ├── cmdline.c ├── cmdline.h └── config.ggo └── efg-simplify.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | experiments/vcf-to-hapl-to-efg/output 2 | experiments/vcf-to-hapl-to-efg/chr22_uppercase.fasta 3 | experiments/vcf-to-hapl-to-efg/1KGP.CHM13v2.0.chr22.recalibrated.snp_indel.pass.phased.vcf.gz 4 | experiments/vcf-to-hapl-to-efg/1KGP.CHM13v2.0.chr22.recalibrated.snp_indel.pass.phased.vcf.gz.tbi 5 | experiments/vcf-to-hapl-to-efg/chm13v2.0.fa.gz 6 | experiments/vcf-to-hapl-to-efg/phased_T2T_panel.tar 7 | experiments/vcf-to-hapl-to-efg/chr22_iEFG.gfa 8 | 9 | tools/efg-locate/efg-locate 10 | tools/efg-locate/test/output-* 11 | 12 | tools/ChainX-block-graph/chainx-block-graph 13 | tools/ChainX-block-graph/test/output-* 14 | 15 | tools/efg-gaf-splitter/efg-gaf-splitter 16 | 17 | tools/efg-ahocorasick/efg-ahocorasick 18 | tools/efg-ahocorasick/extractor 19 | 20 | tools/efg-simplify/efg-simplify 21 | 22 | test/*.gaf 23 | 24 | experiments/aligner-evaluation/scripts/__pycache__ 25 | experiments/aligner-evaluation/input/* 26 | experiments/aligner-evaluation/semi-repeat-free/output 27 | experiments/aligner-evaluation/chaining/output 28 | experiments/aligner-evaluation/final/output 29 | experiments/aligner-evaluation/mems/output 30 | 
experiments/aligner-evaluation/vg-comparison/output 31 | experiments/aligner-evaluation/ahocorasick/output 32 | 33 | experiments/msa-validation/tmp* 34 | experiments/msa-validation/output_stats.txt 35 | 36 | experiments/short-read-exact-match/input/chm13v2.0.fa.gz 37 | experiments/short-read-exact-match/input/chr22_iEFG.gfa 38 | experiments/short-read-exact-match/input/chr22_uppercase.fasta 39 | experiments/short-read-exact-match/input/ERR1025645_sample05_1.fq.gz 40 | experiments/short-read-exact-match/output 41 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tools/founderblockgraphs"] 2 | path = tools/founderblockgraphs 3 | url = https://github.com/algbio/founderblockgraphs.git 4 | [submodule "tools/vcf2multialign"] 5 | path = tools/vcf2multialign 6 | url = https://github.com/tsnorri/vcf2multialign 7 | [submodule "tools/concurrentqueue"] 8 | path = tools/concurrentqueue 9 | url = https://github.com/cameron314/concurrentqueue 10 | [submodule "tools/sdsl-lite-v3"] 11 | path = tools/sdsl-lite-v3 12 | url = https://github.com/xxsds/sdsl-lite 13 | [submodule "tools/Badread"] 14 | path = tools/Badread 15 | url = https://github.com/rrwick/Badread 16 | [submodule "tools/GraphChainer"] 17 | path = tools/GraphChainer 18 | url = https://github.com/algbio/GraphChainer 19 | [submodule "tools/minichain"] 20 | path = tools/minichain 21 | url = https://github.com/at-cg/minichain 22 | ignore = untracked 23 | [submodule "tools/minigraph"] 24 | path = tools/minigraph 25 | url = https://github.com/lh3/minigraph 26 | [submodule "tools/efg-mems"] 27 | path = tools/efg-mems 28 | url = https://github.com/algbio/efg-mems 29 | branch = mapped-output 30 | [submodule "tools/GraphAligner"] 31 | path = tools/GraphAligner 32 | url = https://github.com/maickrau/GraphAligner 33 | [submodule "tools/edlib"] 34 | path = tools/edlib 35 | url = 
https://github.com/Martinsos/edlib 36 | [submodule "tools/seqtk"] 37 | path = tools/seqtk 38 | url = https://github.com/lh3/seqtk 39 | ignore = untracked 40 | [submodule "tools/daachorse"] 41 | path = tools/daachorse 42 | url = https://github.com/daac-tools/daachorse.git 43 | [submodule "tools/bwa"] 44 | path = tools/bwa 45 | url = https://github.com/lh3/bwa 46 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: prerequisites 2 | prerequisites: 3 | make --directory=tools/efg-locate 4 | make --directory=tools/ChainX-block-graph 5 | make --directory=tools/efg-gaf-splitter 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # seed-chain-extend on indexable Elastic Founder Graphs 2 | `SRFAligner` and `SRFChainer` are long-read alignment tools based on indexable Elastic Founder Graphs (iEFGs). iEFGs can be obtained from FASTA multiple sequence alignments using [`founderblockgraph`](https://github.com/algbio/founderblockgraphs), or from a VCF file with the pipeline implemented in `experiments/vcf-to-hapl-to-efg` using `founderblockgraph` and [`vcf2multialign`](https://github.com/tsnorri/vcf2multialign/). The graphs used in the experiments can be found at [doi.org/10.5281/zenodo.14012881](https://doi.org/10.5281/zenodo.14012881). 3 | 4 | ![Workflow to build iEFGs from a VCF file and to perform seed-chain-extend alignment](docs/workflow.png) 5 | 6 | ## getting started 7 | `SRFAligner`, `SRFChainer` and the other prototype aligners are Bash scripts and they depend on `efg-locate`, `chainx-block-graph` (tested on GCC >= 13), and `GraphAligner` (tested on GitHub version >= 1.0.19). 
To clone this repository and compile the first two: 8 | ```console 9 | git clone https://github.com/algbio/SRFAligner && cd SRFAligner 10 | git submodule update --init tools/{sdsl-lite-v3,concurrentqueue} 11 | make 12 | ``` 13 | and `GraphAligner`'s executable is expected to be found in folder `tools/GraphAligner/bin`, so you can run command `git submodule update --init --recursive tools/GraphAligner` and follow its [compilation instructions](https://github.com/maickrau/GraphAligner?tab=readme-ov-file#compilation). If `GraphAligner` is already installed in your system, you can just modify the relative line in `SRFAligner` and `SRFChainer`: 14 | ```console 15 | sed --in-place '7s/.*/graphaligner=GraphAligner/' SRFAligner SRFChainer efg-memsAligner efg-ahocorasickAligner 16 | ``` 17 | Test the aligners with commands 18 | ```console 19 | ./SRFAligner -g test/graph1.gfa -f test/read1.fastq -a test/aln1.gaf 20 | ./SRFChainer -g test/graph2.gfa -f test/read2.fastq -a test/aln2.gaf 21 | ``` 22 | 23 | ## prototype aligners 24 | To use MEM seeds computed by `efg-mems`, `efg-memsAligner` expects `efg-mems`'s executable to be in `tools/efg-mems/efg-mems` and [`seqtk`](https://github.com/lh3/seqtk) to be in `tools/seqtk`: 25 | ```console 26 | git submodule update --init --recursive {tools/efg-mems,tools/seqtk} 27 | make -C tools/seqtk 28 | cd tools/efg-mems/sdsl-lite 29 | ./install.sh . 30 | cd .. 31 | cmake . 
32 | make 33 | ``` 34 | 35 | Analogously, to use full node seeds computed by [`daachorse`](https://github.com/daac-tools/daachorse) (Aho-Corasick automaton of the node labels, requires Rust >= 1.61), `efg-ahocorasickAligner` expects `efg-ahocorasick` and `extractor` to be in `tools/efg-ahocorasick`, and [`seqtk`](https://github.com/lh3/seqtk) to be in `tools/seqtk`: 36 | 37 | ```console 38 | git submodule update --init --recursive {tools/daachorse,tools/seqtk} 39 | make -C tools/seqtk 40 | make -C tools/efg-ahocorasick 41 | ``` 42 | -------------------------------------------------------------------------------- /SRFAligner: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 5 | 6 | # executables' absolute paths/commands (make sure they work!) 7 | graphaligner=$thisfolder/tools/GraphAligner/bin/GraphAligner 8 | efglocate=$thisfolder/tools/efg-locate/efg-locate 9 | 10 | # default params 11 | workingfolder="." 
12 | threads=8 # threads 13 | edgemincount=0 # semi-repeat-free seeds only 14 | edgelongestcount=0 15 | extendoptions="--max-cluster-extend 5 -b 10" # GraphAligner default extend options 16 | discardoption="(substr(\$16,6) > 0.90) && ((\$4-\$3)*100/\$2 >= 50)" # using -c option, discard alignments with identity <= 90% or read coverage < 50% 17 | 18 | print_help() 19 | { 20 | echo "Pipeline to align long reads to indexable Elastic Founder Graphs based on semi-repeat-free seeds" 21 | echo "usage: SRFAligner -g graph.gfa -f reads.fastq -a alignments.gaf" 22 | echo " -h --help: show this screen" 23 | echo " -g graph.gfa: semi-repeat-free EFG in xGFA format" 24 | echo " -f reads.fastq: reads in FASTQ format" 25 | echo " -a alignmentsout.gaf: output alignments in GAF format" 26 | echo " -t threads: # of threads" 27 | echo " -i IGNORECHARS : ignore the following characters for indexability/seed finding" 28 | echo " -c : discard bad alignments (identity <= 0.9 or read coverage < 50%) and run GraphAligner on unaligned reads" 29 | echo " -p : disable pipeline mode and save the seeds in working folder" 30 | echo " -e : make GraphAligner consider each single seed (cluster) for extension" 31 | echo " -m edgemincount: heuristic parameter for seed computation (see efg-locate)" 32 | echo " -o edgelongestcount: heuristic parameter for seed computation (see efg-locate)" 33 | echo " -w path: working folder for output and temporary files" 34 | } 35 | 36 | # https://stackoverflow.com/questions/12022592/how-can-i-use-long-options-with-the-bash-getopts-builtin 37 | for arg in "$@"; do 38 | shift 39 | case "$arg" in 40 | '--help') set -- "$@" '-h' ;; 41 | *) set -- "$@" "$arg" ;; 42 | esac 43 | done 44 | 45 | while getopts "hepcg:f:a:t:w:m:i:o:" option; do 46 | case $option in 47 | h) # display help 48 | print_help 49 | exit;; 50 | g) # graph 51 | argg=true 52 | graph="$OPTARG" ;; 53 | f) # fastq reads 54 | argf=true 55 | reads="$OPTARG" ;; 56 | a) # output 57 | arga=true 58 | 
alignmentsout="$OPTARG" ;; 59 | t) # threads 60 | argt=true 61 | threads="$OPTARG" ;; 62 | i) # ignorechars 63 | argi=true 64 | ignorechars="$OPTARG" ;; 65 | w) # working folder 66 | argw=true 67 | workingfolder="$OPTARG" ;; 68 | c) # discard short alignment heuristic flag 69 | argc=true ;; 70 | p) # disable pipeline mode flag 71 | argp=true ;; 72 | e) # extend all clusters 73 | arge=true ;; 74 | m) # edgemincount parameter 75 | argm=true 76 | edgemincount="$OPTARG" ;; 77 | o) # edgelongestcount parameter 78 | argo=true 79 | edgelongestcount="$OPTARG" ;; 80 | \?) # invalid option 81 | echo "Error: Invalid option" 82 | print_help 83 | exit;; 84 | esac 85 | done 86 | if [[ "$argg" != true ]] || [[ "$argf" != true ]] ; then 87 | print_help 88 | exit 89 | fi 90 | 91 | ignorecharsarg="" 92 | if [[ "$argi" = true ]] ; then 93 | ignorecharsarg="--ignore-chars=$ignorechars" 94 | fi 95 | 96 | if [[ "$arge" = true ]] ; then 97 | extendoptions="--max-cluster-extend -1 --multimap-score-fraction 0.00 -b 10" 98 | fi 99 | 100 | # move to working folder 101 | if [[ "$argw" = true ]] ; then 102 | workingfolder="${workingfolder%/}" 103 | else 104 | workingfolder="." 
105 | fi 106 | 107 | if [[ "$argp" = true ]] ; then 108 | # find semi-repeat-free seeds 109 | $efglocate --approximate --split-output-matches-graphaligner --reverse-complement --overwrite \ 110 | $ignorecharsarg \ 111 | --threads $threads \ 112 | --approximate-edge-match-min-count $edgemincount \ 113 | --approximate-edge-match-longest $edgelongestcount \ 114 | $graph \ 115 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 116 | "$workingfolder/$(basename $reads)_srf_seeds.gaf" 117 | 118 | # GraphAligner extend 119 | $graphaligner $extendoptions \ 120 | -t $threads \ 121 | -g $graph \ 122 | -f $reads \ 123 | --realign "$workingfolder/$(basename $reads)_srf_seeds.gaf" \ 124 | -a $alignmentsout 125 | 126 | if [[ "$argc" = true ]] ; then 127 | echo -n "Filtering alignments with identity <= 90% or read coverage < 50%..." 128 | awk "{if ($discardoption) {print}}" "$alignmentsout" > "$workingfolder/filtered_alignments_$$.gaf" 129 | mv "$workingfolder/filtered_alignments_$$.gaf" "$alignmentsout" 130 | echo " done." 
131 | 132 | unalignedreads=$({ grep -v \ 133 | -f <(cut -f1 $alignmentsout | uniq) \ 134 | <(awk 'NR % 4 == 1' $reads | cut -d' ' -f1 | tr -d "@") || true; }) 135 | unalignedreadscount=$(echo -n "$unalignedreads" | wc -l) 136 | if [[ "$unalignedreadscount" -gt "0" ]] ; then 137 | grep -A 1 --no-group-separator \ 138 | -f <(echo "$unalignedreads") \ 139 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 140 | > "$workingfolder/$(basename $reads)_unaligned_reads.fasta" 141 | 142 | $graphaligner -x "vg" \ 143 | -t $threads \ 144 | -g $graph \ 145 | -f "$workingfolder/$(basename $reads)_unaligned_reads.fasta" \ 146 | -a "$workingfolder/$(basename $reads)_unaligned_reads.gaf" 147 | cat "$workingfolder/$(basename $reads)_unaligned_reads.gaf" >> $alignmentsout 148 | else 149 | echo "There are no unaligned reads to realign" 150 | fi 151 | fi 152 | else 153 | # pipeline of above commands 154 | 155 | if [[ "$argc" = true ]] ; then 156 | # set up final GraphAligner call 157 | fastapipe=$(mktemp -u --suffix ".fasta") 158 | mkfifo -m 600 $fastapipe 159 | trap '{ rm -f -- "$fastapipe"; }' EXIT 160 | 161 | # TODO find a way to not store as a separate file the unaligned alignments and directly append them 162 | $graphaligner -x "vg" \ 163 | -t $threads \ 164 | -g $graph \ 165 | -f $fastapipe \ 166 | -a "$workingfolder/unaligned_reads_$$.gaf" & 167 | # give GraphAligner a dummy "file", see https://github.com/maickrau/GraphAligner/issues/105 168 | set +eo pipefail ; { echo > $fastapipe & } ; set -eo pipefail 169 | fi 170 | 171 | $efglocate --approximate --split-output-matches-graphaligner --reverse-complement --overwrite \ 172 | $ignorecharsarg \ 173 | --threads $threads \ 174 | --approximate-edge-match-min-count $edgemincount \ 175 | --approximate-edge-match-longest $edgelongestcount \ 176 | $graph \ 177 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 178 | /dev/stdout | \ 179 | $graphaligner $extendoptions \ 180 | -t 
$threads \ 181 | -g $graph \ 182 | -f $reads \ 183 | --realign /dev/stdin \ 184 | -a $alignmentsout 185 | 186 | if [[ "$argc" = true ]] ; then 187 | echo -n "Filtering alignments with identity <= 90% or read coverage < 50%..." 188 | awk "{if ($discardoption) {print}}" "$alignmentsout" > "$workingfolder/filtered_alignments_$$.gaf" 189 | mv "$workingfolder/filtered_alignments_$$.gaf" "$alignmentsout" 190 | echo " done." 191 | 192 | unalignedreads=$({ grep -v \ 193 | -f <(cut -f1 $alignmentsout | uniq) \ 194 | <(awk 'NR % 4 == 1' $reads | cut -d' ' -f1 | tr -d "@") || true; }) 195 | unalignedreadscount=$(echo -n "$unalignedreads" | wc -l) 196 | if [[ "$unalignedreadscount" -gt "0" ]] ; then 197 | echo "There are the following unaligned reads:" 198 | echo "$unalignedreads" 199 | grep -A 1 --no-group-separator \ 200 | -f <(echo "$unalignedreads") \ 201 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 202 | > $fastapipe 203 | wait $(jobs -p) 204 | else 205 | echo "There are no unaligned reads to realign" 206 | set +eo pipefail ; { kill $(jobs -p) & } ; set -eo pipefail 207 | fi 208 | 209 | cat "$workingfolder/unaligned_reads_$$.gaf" >> $alignmentsout 210 | rm "$workingfolder/unaligned_reads_$$.gaf" 211 | fi 212 | fi 213 | -------------------------------------------------------------------------------- /SRFChainer: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 5 | 6 | # executables' absolute paths/commands (make sure they work!) 
7 | graphaligner=$thisfolder/tools/GraphAligner/bin/GraphAligner 8 | efglocate=$thisfolder/tools/efg-locate/efg-locate 9 | chainxblockgraph=$thisfolder/tools/ChainX-block-graph/chainx-block-graph 10 | 11 | # default params 12 | workingfolder="." 13 | threads=8 # threads 14 | edgemincount=0 # semi-repeat-free seeds only 15 | edgelongestcount=0 16 | alternativechains=0 17 | chainingguess="--initial-guess-coverage=0.5 --ramp-up-factor=1.5" 18 | extendoptions="--max-cluster-extend 5 -b 10" # GraphAligner default extend options 19 | discardoption="(substr(\$16,6) > 0.90) && ((\$4-\$3)*100/\$2 >= 50)" # using -c option, discard alignments with identity <= 90% or read coverage < 50% 20 | 21 | # load balancing 22 | locatework=(32 32 22 14 14 11 10 9 8 7 6) 23 | chainwork=(8 25 32 32 32 32 32 32 32 32 32) 24 | 25 | print_help() 26 | { 27 | echo "Pipeline to align long reads to indexable Elastic Founder Graphs based on chained semi-repeat-free seeds" 28 | echo "usage: SRFChainer -g graph.gfa -f reads.fastq -a alignments.gaf" 29 | echo " -h --help: show this screen" 30 | echo " -g graph.gfa: semi-repeat-free EFG in xGFA format" 31 | echo " -f reads.fastq: reads in FASTQ format" 32 | echo " -a alignmentsout.gaf: output alignments in GAF format" 33 | echo " -t threads: # of threads" 34 | echo " -i IGNORECHARS : ignore the following characters for indexability/seed finding" 35 | echo " -c : discard bad alignments (identity <= 0.9 or read coverage < 50%) and run GraphAligner on unaligned reads" 36 | echo " -p : disable pipeline mode and save the seeds in working folder" 37 | echo " -e : make GraphAligner consider each single seed (cluster) for extension" 38 | echo " -m edgemincount: heuristic parameter for seed computation (see efg-locate)" 39 | echo " -o edgelongestcount: heuristic parameter for seed computation (see efg-locate)" 40 | echo " -n altchains : heuristic parameter to chain an additional n times for each strand" 41 | echo " -w path: working folder for output and 
temporary files" 42 | } 43 | 44 | load_balance() 45 | { 46 | m=$2 47 | if (( $m >= ${#locatework[@]} )) 48 | then 49 | m=$(( ${#locatework[@]} - 1 )) 50 | fi 51 | 52 | 53 | threads1=$(( $1 * ${chainwork[$m]} / (${chainwork[$m]} + ${locatework[$m]}))) 54 | threads1=$(( threads1 >= $1 ? $1 - 1 : threads1 )) 55 | threads1=$(( threads1 < 1 ? 1 : threads1 )) 56 | 57 | echo "$(($1 - $threads1))" 58 | } 59 | 60 | # https://stackoverflow.com/questions/12022592/how-can-i-use-long-options-with-the-bash-getopts-builtin 61 | for arg in "$@"; do 62 | shift 63 | case "$arg" in 64 | '--help') set -- "$@" '-h' ;; 65 | *) set -- "$@" "$arg" ;; 66 | esac 67 | done 68 | 69 | while getopts "hepcg:f:a:t:w:m:i:n:o:" option; do 70 | case $option in 71 | h) # display help 72 | print_help 73 | exit;; 74 | g) # graph 75 | argg=true 76 | graph="$OPTARG" ;; 77 | f) # fastq reads 78 | argf=true 79 | reads="$OPTARG" ;; 80 | a) # output 81 | arga=true 82 | alignmentsout="$OPTARG" ;; 83 | t) # threads 84 | argt=true 85 | threads="$OPTARG" ;; 86 | i) # ignorechars 87 | argi=true 88 | ignorechars="$OPTARG" ;; 89 | w) # working folder 90 | argw=true 91 | workingfolder="$OPTARG" ;; 92 | c) # discard short alignment heuristic flag 93 | argc=true ;; 94 | p) # disable pipeline mode flag 95 | argp=true ;; 96 | e) # extend all clusters 97 | arge=true ;; 98 | m) # edgemincount parameter 99 | argm=true 100 | edgemincount="$OPTARG" ;; 101 | o) # edgelongestcount parameter 102 | argo=true 103 | edgelongestcount="$OPTARG" ;; 104 | n) # alternative chains parameter 105 | argn=true 106 | alternativechains="$OPTARG" ;; 107 | \?) # invalid option 108 | echo "Error: Invalid option" 109 | print_help 110 | exit;; 111 | esac 112 | done 113 | if [[ "$argg" != true ]] || [[ "$argf" != true ]] ; then 114 | print_help 115 | exit 116 | fi 117 | 118 | if [[ "$argp" != true ]] && [[ "$threads" -lt 2 ]] ; then 119 | echo "Please pick at least 2 threads in pipeline mode!" 
120 | exit 1 121 | fi 122 | 123 | ignorecharsarg="" 124 | if [[ "$argi" = true ]] ; then 125 | ignorecharsarg="--ignore-chars=$ignorechars" 126 | fi 127 | 128 | if [[ "$arge" = true ]] ; then 129 | extendoptions="--max-cluster-extend -1 --multimap-score-fraction 0.00 -b 10" 130 | fi 131 | 132 | # move to working folder 133 | if [[ "$argw" = true ]] ; then 134 | workingfolder="${workingfolder%/}" 135 | else 136 | workingfolder="." 137 | fi 138 | 139 | if [[ "$argp" = true ]] ; then 140 | # find semi-repeat-free seeds 141 | $efglocate --approximate --split-output-matches --reverse-complement --rename-reverse-complement --overwrite \ 142 | $ignorecharsarg \ 143 | --threads $threads \ 144 | --approximate-edge-match-min-count $edgemincount \ 145 | --approximate-edge-match-longest $edgelongestcount \ 146 | $graph \ 147 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 148 | "$workingfolder/$(basename $reads)_srfchain_seeds.gaf" 149 | 150 | # chainx-block-graph chain 151 | $chainxblockgraph --semi-global --split-output-matches-graphaligner --overwrite $chainingguess \ 152 | --alternative-chains $alternativechains \ 153 | --threads $threads \ 154 | $graph \ 155 | "$workingfolder/$(basename $reads)_srfchain_seeds.gaf" \ 156 | "$workingfolder/$(basename $reads)_srfchain_chain.gaf" 157 | 158 | # GraphAligner extend 159 | $graphaligner $extendoptions \ 160 | -t $threads \ 161 | -g $graph \ 162 | -f $reads \ 163 | --realign "$workingfolder/$(basename $reads)_srfchain_chain.gaf" \ 164 | -a $alignmentsout 165 | 166 | if [[ "$argc" = true ]] ; then 167 | echo -n "Filtering alignments with identity <= 90% or read coverage < 50%..." 168 | awk "{if ($discardoption) {print}}" "$alignmentsout" > "$workingfolder/filtered_alignments_$$.gaf" 169 | mv "$workingfolder/filtered_alignments_$$.gaf" "$alignmentsout" 170 | echo " done." 
171 | 172 | unalignedreads=$({ grep -v \ 173 | -f <(cut -f1 $alignmentsout | uniq) \ 174 | <(awk 'NR % 4 == 1' $reads | cut -d' ' -f1 | tr -d "@") || true; }) 175 | unalignedreadscount=$(echo -n "$unalignedreads" | wc -l) 176 | if [[ "$unalignedreadscount" -gt "0" ]] ; then 177 | grep -A 1 --no-group-separator \ 178 | -f <(echo "$unalignedreads") \ 179 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 180 | > "$workingfolder/$(basename $reads)_unaligned_reads.fasta" 181 | 182 | $graphaligner -x "vg" \ 183 | -t $threads \ 184 | -g $graph \ 185 | -f "$workingfolder/$(basename $reads)_unaligned_reads.fasta" \ 186 | -a "$workingfolder/$(basename $reads)_unaligned_reads.gaf" 187 | cat "$workingfolder/$(basename $reads)_unaligned_reads.gaf" >> $alignmentsout 188 | else 189 | echo "There are no unaligned reads to realign" 190 | fi 191 | fi 192 | else 193 | # pipeline of above commands 194 | 195 | if [[ "$argc" = true ]] ; then 196 | # set up final GraphAligner call 197 | fastapipe=$(mktemp -u --suffix ".fasta") 198 | mkfifo -m 600 $fastapipe 199 | trap '{ rm -f -- "$fastapipe"; }' EXIT 200 | 201 | # TODO find a way to not store on disk the unaligned alignments and directly append them 202 | $graphaligner -x "vg" \ 203 | -t $threads \ 204 | -g $graph \ 205 | -f $fastapipe \ 206 | -a "$workingfolder/unaligned_reads_$$.gaf" & 207 | # give GraphAligner a dummy "file", see https://github.com/maickrau/GraphAligner/issues/105 208 | set +eo pipefail ; { echo > $fastapipe & } ; set -eo pipefail 209 | fi 210 | 211 | efglocatethreads=$(( $threads / 2 )) 212 | chainxthreads=$(( $threads - $efglocatethreads )) 213 | echo "load balance: $efglocatethreads for locate, $chainxthreads for chaining" 214 | $efglocate --approximate --split-output-matches --reverse-complement --rename-reverse-complement --overwrite \ 215 | --threads $efglocatethreads \ 216 | --approximate-edge-match-min-count $edgemincount \ 217 | --approximate-edge-match-longest $edgelongestcount 
\ 218 | $graph \ 219 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 220 | /dev/stdout | \ 221 | $chainxblockgraph --semi-global --split-output-matches-graphaligner --overwrite $chainingguess \ 222 | --threads $chainxthreads \ 223 | --alternative-chains $alternativechains \ 224 | $graph \ 225 | /dev/stdin \ 226 | /dev/stdout | \ 227 | $graphaligner $extendoptions \ 228 | -t $threads \ 229 | -g $graph \ 230 | -f $reads \ 231 | --realign /dev/stdin \ 232 | -a $alignmentsout 233 | 234 | if [[ "$argc" = true ]] ; then 235 | echo -n "Filtering alignments with identity <= 90% or read coverage < 50%..." 236 | awk "{if ($discardoption) {print}}" "$alignmentsout" > "$workingfolder/filtered_alignments_$$.gaf" 237 | mv "$workingfolder/filtered_alignments_$$.gaf" "$alignmentsout" 238 | echo " done." 239 | 240 | unalignedreads=$({ grep -v \ 241 | -f <(cut -f1 $alignmentsout | uniq) \ 242 | <(awk 'NR % 4 == 1' $reads | cut -d' ' -f1 | tr -d "@") || true; }) 243 | unalignedreadscount=$(echo -n "$unalignedreads" | wc -l) 244 | if [[ "$unalignedreadscount" -gt "0" ]] ; then 245 | echo "There are the following unaligned reads:" 246 | echo "$unalignedreads" 247 | grep -A 1 --no-group-separator \ 248 | -f <(echo "$unalignedreads") \ 249 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 250 | > $fastapipe 251 | wait $(jobs -p) 252 | else 253 | echo "There are no unaligned reads to realign" 254 | set +eo pipefail ; { kill $(jobs -p) & } ; set -eo pipefail 255 | fi 256 | 257 | cat "$workingfolder/unaligned_reads_$$.gaf" >> $alignmentsout 258 | rm "$workingfolder/unaligned_reads_$$.gaf" 259 | fi 260 | fi 261 | -------------------------------------------------------------------------------- /docs/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algbio/SRFAligner/c698b10ce1b3f5ec20ef5f218f6d0259eebf7911/docs/workflow.png 
-------------------------------------------------------------------------------- /efg-ahocorasickAligner: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 5 | 6 | # executables' absolute paths/commands (make sure they work!) 7 | graphaligner=$thisfolder/tools/GraphAligner/bin/GraphAligner 8 | ahocorasick=$thisfolder/tools/efg-ahocorasick/efg-ahocorasick 9 | extractor=$thisfolder/tools/efg-ahocorasick/extractor 10 | efggafsplitter=$thisfolder/tools/efg-gaf-splitter/efg-gaf-splitter 11 | seqtk=$thisfolder/tools/seqtk/seqtk 12 | 13 | # default params 14 | workingfolder="." 15 | extendoptions="--max-cluster-extend 5 -b 10" # GraphAligner default extend options 16 | 17 | print_help() 18 | { 19 | echo "Pipeline to align long reads to indexable Elastic Founder Graphs, based on srf seeds" 20 | echo "Currently, only 1 compute thread is supported, and all intermediate seeds" 21 | echo " are saved in the working folder" 22 | echo " -h --help: show this screen" 23 | echo " -g graph.gfa: semi-repeat-free EFG in xGFA format" 24 | echo " -f reads.fastq: reads in FASTQ format" 25 | echo " -a alignmentsout.gaf: output alignments in GAF format" 26 | echo " -w path: working folder for output and temporary files" 27 | } 28 | 29 | # https://stackoverflow.com/questions/12022592/how-can-i-use-long-options-with-the-bash-getopts-builtin 30 | for arg in "$@"; do 31 | shift 32 | case "$arg" in 33 | '--help') set -- "$@" '-h' ;; 34 | *) set -- "$@" "$arg" ;; 35 | esac 36 | done 37 | 38 | while getopts "hg:f:a:w:" option; do 39 | case $option in 40 | h) # display help 41 | print_help 42 | exit;; 43 | g) # graph 44 | argg=true 45 | graph="$OPTARG" ;; 46 | f) # fastq reads 47 | argf=true 48 | reads="$OPTARG" ;; 
49 | a) # output 50 | arga=true 51 | alignmentsout="$OPTARG" ;; 52 | w) # working folder 53 | argw=true 54 | workingfolder="$OPTARG" ;; 55 | \?) # invalid option 56 | echo "Error: Invalid option" 57 | print_help 58 | exit;; 59 | esac 60 | done 61 | 62 | if [[ "$argg" != true ]] || [[ "$argf" != true ]] ; then 63 | print_help 64 | exit 65 | fi 66 | 67 | # move to working folder 68 | if [[ "$argw" = true ]] ; then 69 | workingfolder="${workingfolder%/}" 70 | else 71 | workingfolder="." 72 | fi 73 | 74 | # extract nodes and ids 75 | $extractor $graph > "$workingfolder/nodes.txt" 2> "$workingfolder/nodeids.txt" 76 | 77 | # find efg-mems seeds 78 | # ahocorasick node_labels node_ids fasta_reads fasta_ids 79 | $ahocorasick \ 80 | "$workingfolder/nodes.txt" \ 81 | "$workingfolder/nodeids.txt" \ 82 | <(cat \ 83 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 84 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1 | sed 's/^>/>rev_/g' | $seqtk seq -r) \ 85 | | grep -v "^>") \ 86 | <(cat \ 87 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \ 88 | <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1 | sed 's/^>/>rev_/g' | $seqtk seq -r) \ 89 | | grep "^>" | tr -d ">") \ 90 | > "$workingfolder/ahocorasick_seeds_pre_split.gaf" 91 | 92 | # split seeds and reverse if needed 93 | $efggafsplitter \ 94 | $graph \ 95 | "$workingfolder/ahocorasick_seeds_pre_split.gaf" \ 96 | > "$workingfolder/ahocorasick_seeds.gaf" 97 | 98 | # GraphAligner extend 99 | $graphaligner $extendoptions \ 100 | -t 1 \ 101 | -g $graph \ 102 | -f $reads \ 103 | --realign "$workingfolder/ahocorasick_seeds.gaf" \ 104 | -a $alignmentsout 105 | -------------------------------------------------------------------------------- /efg-memsAligner: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | thisfolder=$( cd -- "$( dirname -- 
"${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 5 | 6 | # executables' absolute paths/commands (make sure they work!) 7 | graphaligner=$thisfolder/tools/GraphAligner/bin/GraphAligner 8 | efgmems=$thisfolder/tools/efg-mems/efg-mems 9 | efggafsplitter=$thisfolder/tools/efg-gaf-splitter/efg-gaf-splitter 10 | seqtk=$thisfolder/tools/seqtk/seqtk 11 | 12 | # default params 13 | workingfolder="." 14 | extendoptions="--max-cluster-extend 5 -b 10" # GraphAligner default extend options 15 | 16 | print_help() 17 | { 18 | echo "Pipeline to align long reads to indexable Elastic Founder Graphs, based on MEMs" 19 | echo "Currently, only 1 compute thread is supported, and all intermediate seeds" 20 | echo " are saved in the working folder" 21 | echo " -h --help: show this screen" 22 | echo " -g graph.gfa: semi-repeat-free EFG in xGFA format" 23 | echo " -f reads.fastq: reads in FASTQ format" 24 | echo " -a alignmentsout.gaf: output alignments in GAF format" 25 | echo " -w path: working folder for output and temporary files" 26 | } 27 | 28 | # https://stackoverflow.com/questions/12022592/how-can-i-use-long-options-with-the-bash-getopts-builtin 29 | for arg in "$@"; do 30 | shift 31 | case "$arg" in 32 | '--help') set -- "$@" '-h' ;; 33 | *) set -- "$@" "$arg" ;; 34 | esac 35 | done 36 | 37 | while getopts "hg:f:a:w:" option; do 38 | case $option in 39 | h) # display help 40 | print_help 41 | exit;; 42 | g) # graph 43 | argg=true 44 | graph="$OPTARG" ;; 45 | f) # fastq reads 46 | argf=true 47 | reads="$OPTARG" ;; 48 | a) # output 49 | arga=true 50 | alignmentsout="$OPTARG" ;; 51 | w) # working folder 52 | argw=true 53 | workingfolder="$OPTARG" ;; 54 | \?) 
# invalid option 55 | echo "Error: Invalid option" 56 | print_help 57 | exit;; 58 | esac 59 | done 60 | 61 | if [[ "$argg" != true ]] || [[ "$argf" != true ]] ; then 62 | print_help 63 | exit 64 | fi 65 | 66 | # move to working folder 67 | if [[ "$argw" = true ]] ; then 68 | workingfolder="${workingfolder%/}" 69 | else 70 | workingfolder="." 71 | fi 72 | 73 | # find efg-mems seeds 74 | $efgmems -a "ACGTXN#0" -k 20 --indexing --bdbwt \ 75 | -o "$workingfolder/efgmems_seeds_pre_split.gaf" \ 76 | <(cat <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) <(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1 | sed 's/^>/>rev_/g' | $seqtk seq -r) | tr "N" "X") \ 77 | $graph 78 | 79 | # split seeds and reverse if needed 80 | $efggafsplitter \ 81 | $graph \ 82 | "$workingfolder/efgmems_seeds_pre_split.gaf" \ 83 | > "$workingfolder/efgmems_seeds.gaf" 84 | 85 | # GraphAligner extend 86 | $graphaligner $extendoptions \ 87 | -t 1 \ 88 | -g $graph \ 89 | -f $reads \ 90 | --realign "$workingfolder/efgmems_seeds.gaf" \ 91 | -a $alignmentsout 92 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation of sequence-to-graph aligners on simulated reads 2 | Based on the [GraphChainer evaluation pipeline](https://github.com/algbio/GraphChainer-scripts). 3 | 4 | ## Prerequisites 5 | Install [miniconda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html). Then, set up the environment `aligner-evaluation` defined in `environment.yml`: 6 | ``` 7 | conda env create -f environment.yml 8 | ``` 9 | The pipeline expects to find `GraphChainer` and `badread-runner.py` in folders `tools/GraphChainer/bin` and `tools/Badread` from the root of the repository, so download them and compile `GraphChainer` if you have not. 
10 | ``` 11 | git submodule update --init --recursive ../../tools/Badread 12 | git submodule update --init --recursive ../../tools/GraphChainer 13 | # following GraphChainer compile instructions 14 | cd ../../tools/GraphChainer 15 | conda env create -f CondaEnvironment.yml 16 | conda activate GraphChainer 17 | make bin/GraphChainer 18 | 19 | cd ../../experiments/aligner-evaluation 20 | conda activate aligner-evaluation 21 | ``` 22 | The scripts `runexp.sh` expect `/usr/bin/time`, `openssl`, and `awk` to be installed (and the `aligner-evaluation` conda environment to be active). 23 | 24 | ## Datasets 25 | | Graph | Construction | Input | 26 | |-----------------------------|----------------------------------------|-------| 27 | | chr22\_iEFG | `vcf2multialign` + `founderblockgraph` | [T2T-CHM13v2.0](https://github.com/marbl/CHM13) + [Phased T2T 1KGP panel](https://zenodo.org/records/7612953#.Y-8VD3bMJPY) 28 | | chr22\_vg | `vg` | same as above | 29 | | chr22_vg_msa | `vcf2multialign` + `vg -M` | same as above | 30 | | covid19_100_iEFG_simplified | `founderblockgraph` + `efg-simplify` | [MSA from efg-mems experiments](https://github.com/algbio/efg-mems) 31 | 32 | All experiments except for `vg-comparison` and `mems` use the chromosome 22 iEFG built with the `vcf-to-hapl-to-efg` pipeline as file `input/chr22_iEFG.gfa`. 
You can get the graph from [zenodo](https://doi.org/10.5281/zenodo.14012882) as follows: 33 | ``` 34 | wget https://zenodo.org/records/14012882/files/chr22_iEFG.gfa.gz?download=1 --output-document=input/chr22_iEFG.gfa.gz 35 | gunzip input/chr22_iEFG.gfa.gz 36 | ``` 37 | The `mems` experiment uses a SARS-CoV-2 MSA of 100 strains (NCBI accession numbers in `input/covid19_100_acc.txt`) aligned with [ViralMSA](https://github.com/niemasd/ViralMSA): 38 | ``` 39 | wget https://zenodo.org/records/14012882/files/covid19_100_iEFG_simplified.gfa.gz?download=1 --output-document=input/covid19_100_iEFG_simplified.gfa.gz 40 | gunzip input/covid19_100_iEFG_simplified.gfa.gz 41 | ``` 42 | Finally, the `vg-comparison` experiment additionally uses the `vg` graphs built from the 1KGP phased VCF file or the MSA obtained with `vcf2multialign` from the same data: 43 | ``` 44 | wget https://zenodo.org/records/14012882/files/chr22_vg.gfa.gz?download=1 --output-document=input/chr22_vg.gfa.gz 45 | gunzip input/chr22_vg.gfa.gz 46 | wget https://zenodo.org/records/14012882/files/chr22_vg_msa.gfa.gz?download=1 --output-document=input/chr22_vg_msa.gfa.gz 47 | gunzip input/chr22_vg_msa.gfa.gz 48 | ``` 49 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/ahocorasick/runexp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # comparison of SRFAligner and efg-ahocorasickAligner 3 | # remember to run `conda activate aligner-evaluation` before execution 4 | 5 | set -e 6 | set -o pipefail 7 | 8 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 9 | cd $thisfolder 10 | 11 | # executable's absolute paths/commands (make sure they work!) 
12 | graphaligner=$thisfolder/../../../tools/GraphAligner/bin/GraphAligner 13 | srfaligner=$thisfolder/../../../SRFAligner 14 | ahocorasickaligner=$thisfolder/../../../efg-ahocorasickAligner 15 | usrbintime=/usr/bin/time 16 | 17 | # params 18 | inputgraph=$thisfolder/../input/chr22_iEFG.gfa 19 | coverage=30 20 | threads=1 21 | 22 | # 0. setup 23 | mkdir output 24 | echo -n > output/runexp_log.txt 25 | set -a 26 | set +a 27 | 28 | # 1. simulate path and reads 29 | # uncomment the following 3 lines and comment the following ones to use the reads in semi-repeat-free 30 | #ln -s $thisfolder/../semi-repeat-free/output/sim_reads.fastq output 31 | #ln -s $thisfolder/../semi-repeat-free/output/sim_reads_path.fasta output 32 | #ln -s $thisfolder/../semi-repeat-free/output/sim_reads_path.nodes output 33 | python3 ../scripts/generate_sim_reads.py \ 34 | --graph $inputgraph \ 35 | --fastq output/sim_reads.fastq \ 36 | --seed $(openssl enc -aes-256-ctr -pass pass:"semirepeatfree" -nosalt /dev/null | shuf -i0-4294967295 -n 1 --random-source=/dev/stdin) \ 37 | --path output/sim_reads_path.nodes \ 38 | --fasta output/sim_reads_path.fasta \ 39 | --coverage $coverage \ 40 | 2>> output/runexp_log.txt >> output/runexp_log.txt 41 | 42 | # 2. run the aligners 43 | $usrbintime $srfaligner \ 44 | -t $threads \ 45 | -g $inputgraph \ 46 | -m 0 \ 47 | -p \ 48 | -f output/sim_reads.fastq \ 49 | -a output/semi_repeat_free_alignments.gaf \ 50 | -w output \ 51 | 2>> output/runexp_log.txt >> output/runexp_log.txt 52 | 53 | $usrbintime $ahocorasickaligner \ 54 | -g $inputgraph \ 55 | -f output/sim_reads.fastq \ 56 | -a output/ahocorasick_alignments.gaf \ 57 | -w output \ 58 | 2>> output/runexp_log.txt >> output/runexp_log.txt 59 | 60 | # 3. 
pick first alignment 61 | for alignment in "ahocorasick_alignments.gaf" "semi_repeat_free_alignments.gaf" 62 | do 63 | awk '{if (found[$1] == "1") \ 64 | {} \ 65 | else 66 | {found[$1]="1"; print}}' \ 67 | output/$alignment > output/best_$alignment 68 | done 69 | 70 | # 4. validate and plot results 71 | for alignment in semi_repeat_free ahocorasick 72 | do 73 | python3 ../scripts/compute_summary.py \ 74 | -t 3 \ 75 | --graph $inputgraph \ 76 | --fastq output/sim_reads.fastq \ 77 | --path output/sim_reads_path.nodes \ 78 | --fasta output/sim_reads_path.fasta \ 79 | --alignments output/best_${alignment}_alignments.gaf \ 80 | --metrics output/metrics_${alignment}.mts & 81 | done 82 | wait $(jobs -p) 83 | 84 | python3 ../scripts/compute_metrics.py \ 85 | --output-name output/results \ 86 | --summaries output/*.mts \ 87 | --summaries-names $(basename -s ".mts" -a output/*.mts | sed 's/metrics_//') 88 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/chaining/runexp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # remember to run `conda activate aligner-evaluation` 3 | set -e 4 | set -o pipefail 5 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 6 | cd $thisfolder 7 | 8 | # executable's absolute paths/commands (make sure they work!) 9 | BADREAD=$thisfolder/../../../tools/Badread/badread-runner.py 10 | GRAPHCHAINER=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 11 | srfchainer=$thisfolder/../../../SRFChainer 12 | usrbintime=/usr/bin/time 13 | 14 | # params 15 | inputgraph=$thisfolder/../input/chr22_iEFG.gfa 16 | coverage=30 17 | threads=64 18 | 19 | # 0. setup 20 | mkdir output 21 | echo -n > output/runexp_log.txt 22 | set -a 23 | set +a 24 | 25 | # 1. 
simulate path and reads 26 | # comment the following 3 lines and uncomment the following ones to generate the reads (again) 27 | ln -s $thisfolder/../semi-repeat-free/output/sim_reads.fastq output 28 | ln -s $thisfolder/../semi-repeat-free/output/sim_reads_path.fasta output 29 | ln -s $thisfolder/../semi-repeat-free/output/sim_reads_path.nodes output 30 | #python3 ../scripts/generate_sim_reads.py \ 31 | # --graph $inputgraph \ 32 | # --fastq output/sim_reads.fastq \ 33 | # --seed $(openssl enc -aes-256-ctr -pass pass:"semirepeatfree" -nosalt /dev/null | shuf -i0-4294967295 -n 1 --random-source=/dev/stdin) \ 34 | # --path output/sim_reads_path.nodes \ 35 | # --fasta output/sim_reads_path.fasta \ 36 | # --coverage $coverage \ 37 | # 2>> output/runexp_log.txt >> output/runexp_log.txt 38 | 39 | # 2. run the aligners 40 | $usrbintime $srfchainer \ 41 | -t $threads \ 42 | -g $inputgraph \ 43 | -f output/sim_reads.fastq \ 44 | -a output/semi_repeat_free_chain_alignments.gaf \ 45 | 2>> output/runexp_log.txt >> output/runexp_log.txt 46 | 47 | for o in 1 5 10 50 100 48 | do 49 | $usrbintime $srfchainer \ 50 | -t $threads \ 51 | -g $inputgraph \ 52 | -o $o \ 53 | -f output/sim_reads.fastq \ 54 | -a output/srf_edge_longest_${o}_chain_alignments.gaf \ 55 | 2>> output/runexp_log.txt >> output/runexp_log.txt 56 | done 57 | 58 | # 3. pick first alignment 59 | for alignment in "semi_repeat_free_chain_alignments.gaf" 60 | do 61 | awk '{if (found[$1] == "1") \ 62 | {} \ 63 | else 64 | {found[$1]="1"; print}}' \ 65 | output/$alignment > output/best_$alignment 66 | done 67 | for o in 1 5 10 50 100 68 | do 69 | awk '{if (found[$1] == "1") \ 70 | {} \ 71 | else 72 | {found[$1]="1"; print}}' \ 73 | output/srf_edge_longest_${o}_chain_alignments.gaf > output/best_srf_edge_longest_${o}_chain_alignments.gaf 74 | done 75 | 76 | # 4. 
validate and plot results 77 | for alignment in semi_repeat_free_chain 78 | do 79 | python3 ../scripts/compute_summary.py \ 80 | -t 3 \ 81 | --graph $inputgraph \ 82 | --fastq output/sim_reads.fastq \ 83 | --path output/sim_reads_path.nodes \ 84 | --fasta output/sim_reads_path.fasta \ 85 | --alignments output/best_${alignment}_alignments.gaf \ 86 | --metrics output/metrics_${alignment}.mts 87 | done 88 | for o in 1 5 10 50 100 89 | do 90 | python3 ../scripts/compute_summary.py \ 91 | -t 3 \ 92 | --graph $inputgraph \ 93 | --fastq output/sim_reads.fastq \ 94 | --path output/sim_reads_path.nodes \ 95 | --fasta output/sim_reads_path.fasta \ 96 | --alignments output/best_srf_edge_longest_${o}_chain_alignments.gaf \ 97 | --metrics output/metrics_srf_edge_longest_${o}_chain.mts 98 | done 99 | wait $(jobs -p) 100 | 101 | python3 ../scripts/compute_metrics.py \ 102 | --output-name output/results \ 103 | --summaries output/*.mts \ 104 | --summaries-names $(basename -s ".mts" -a output/*.mts | sed 's/metrics_//') 105 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/environment.yml: -------------------------------------------------------------------------------- 1 | name: aligner-evaluation 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - https://conda.anaconda.org/gurobi 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=main 9 | - _openmp_mutex=5.1=1_gnu 10 | - brotli=1.0.9=h166bdaf_7 11 | - brotli-bin=1.0.9=h166bdaf_7 12 | - bzip2=1.0.8=h7b6447c_0 13 | - ca-certificates=2022.12.7=ha878542_0 14 | - certifi=2022.12.7=pyhd8ed1ab_0 15 | - click=8.1.3=unix_pyhd8ed1ab_2 16 | - contourpy=1.0.5=py310hdb19cb5_0 17 | - cycler=0.11.0=pyhd8ed1ab_0 18 | - dbus=1.13.18=hb2f20db_0 19 | - expat=2.2.10=h9c3ff4c_0 20 | - fontconfig=2.14.1=hef1e5e3_0 21 | - fonttools=4.25.0=pyhd3eb1b0_0 22 | - freetype=2.10.4=h0708190_1 23 | - giflib=5.2.1=h36c2ea0_2 24 | - glib=2.69.1=h4ff587b_1 25 | - 
gst-plugins-base=1.14.1=h6a678d5_1 26 | - gstreamer=1.14.1=h5eee18b_1 27 | - icu=58.2=hf484d3e_1000 28 | - joblib=1.1.1=py310h06a4308_0 29 | - jpeg=9e=h166bdaf_1 30 | - keyutils=1.6.1=h166bdaf_0 31 | - kiwisolver=1.4.4=py310h6a678d5_0 32 | - krb5=1.19.3=h3790be6_0 33 | - lcms2=2.12=h3be6417_0 34 | - ld_impl_linux-64=2.38=h1181459_1 35 | - lerc=3.0=h295c915_0 36 | - libblas=3.9.0=15_linux64_openblas 37 | - libbrotlicommon=1.0.9=h166bdaf_7 38 | - libbrotlidec=1.0.9=h166bdaf_7 39 | - libbrotlienc=1.0.9=h166bdaf_7 40 | - libcblas=3.9.0=15_linux64_openblas 41 | - libclang=10.0.1=default_hb85057a_2 42 | - libdeflate=1.17=h5eee18b_0 43 | - libedit=3.1.20191231=he28a2e2_2 44 | - libevent=2.1.12=h8f2d780_0 45 | - libffi=3.3=he6710b0_2 46 | - libgcc-ng=11.2.0=h1234567_1 47 | - libgfortran-ng=12.2.0=h69a702a_19 48 | - libgfortran5=12.2.0=h337968e_19 49 | - libgomp=11.2.0=h1234567_1 50 | - liblapack=3.9.0=15_linux64_openblas 51 | - libllvm10=10.0.1=he513fc3_3 52 | - libopenblas=0.3.20=pthreads_h78a6416_0 53 | - libpng=1.6.39=h5eee18b_0 54 | - libpq=12.9=h16c4e8d_3 55 | - libprotobuf=3.20.3=he621ea3_0 56 | - libstdcxx-ng=11.2.0=h1234567_1 57 | - libtiff=4.5.0=h6a678d5_2 58 | - libuuid=1.41.5=h5eee18b_0 59 | - libwebp=1.2.4=h11a3e52_1 60 | - libwebp-base=1.2.4=h5eee18b_1 61 | - libxcb=1.15=h7f8727e_0 62 | - libxkbcommon=1.0.1=hfa300c1_0 63 | - libxml2=2.9.14=h74e7548_0 64 | - libxslt=1.1.35=h4e12654_0 65 | - lz4-c=1.9.3=h9c3ff4c_1 66 | - matplotlib=3.7.0=py310h06a4308_0 67 | - matplotlib-base=3.7.0=py310h1128e8f_0 68 | - munkres=1.1.4=pyh9f0ad1d_0 69 | - ncurses=6.4=h6a678d5_0 70 | - nspr=4.33=h295c915_0 71 | - nss=3.74=h0370c37_0 72 | - openssl=1.1.1s=h7f8727e_0 73 | - packaging=23.0=pyhd8ed1ab_0 74 | - pcre=8.45=h9c3ff4c_0 75 | - pillow=9.4.0=py310h6a678d5_0 76 | - pip=23.0.1=py310h06a4308_0 77 | - ply=3.11=py_1 78 | - protobuf=3.20.3=py310h6a678d5_0 79 | - pyparsing=3.0.9=pyhd8ed1ab_0 80 | - pyqt=5.15.7=py310h6a678d5_1 81 | - python=3.10.0=h12debd9_5 82 | - 
python-dateutil=2.8.2=pyhd8ed1ab_0 83 | - python-dotenv=1.0.0=pyhd8ed1ab_0 84 | - python_abi=3.10=2_cp310 85 | - qt-main=5.15.2=h327a75a_7 86 | - qt-webengine=5.15.9=hd2b0992_4 87 | - qtwebkit=5.212=h4eab89a_4 88 | - readline=8.2=h5eee18b_0 89 | - setuptools=65.6.3=py310h06a4308_0 90 | - sip=6.6.2=py310h6a678d5_0 91 | - six=1.16.0=pyh6c4a22f_0 92 | - sqlite=3.40.1=h5082296_0 93 | - tk=8.6.12=h1ccaba5_0 94 | - toml=0.10.2=pyhd8ed1ab_0 95 | - tornado=6.1=py310h5764c6d_3 96 | - tzdata=2022g=h04d1e81_0 97 | - wheel=0.38.4=py310h06a4308_0 98 | - xz=5.2.10=h5eee18b_1 99 | - zlib=1.2.13=h5eee18b_0 100 | - zstd=1.5.2=ha4553b6_0 101 | - pip: 102 | - edlib==1.3.9 103 | - numpy==1.24.2 104 | - pyqt5-sip==12.11.0 105 | - scipy==1.10.1 106 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/final/runexp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # remember to run `conda activate aligner-evaluation` 3 | set -e 4 | set -o pipefail 5 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 6 | cd $thisfolder 7 | 8 | # executables' absolute paths/commands (make sure they work!) 9 | BADREAD=$thisfolder/../../../tools/Badread/badread-runner.py 10 | GRAPHCHAINER=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 11 | srfaligner=$thisfolder/../../../SRFAligner 12 | srfchainer=$thisfolder/../../../SRFChainer 13 | graphchainer=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 14 | minigraph=$thisfolder/../../../tools/minigraph/minigraph 15 | minichain=$thisfolder/../../../tools/minichain/minichain 16 | usrbintime=/usr/bin/time 17 | 18 | # params 19 | inputgraph=$thisfolder/../input/chr22_iEFG.gfa 20 | coverage=30 21 | threads=64 22 | 23 | # 0. 
setup 24 | mkdir output 25 | echo -n > output/runexp_log.txt 26 | set -a 27 | set +a 28 | 29 | # 1. simulate path and reads 30 | # comment the following 3 lines and uncomment the following ones to generate the reads (again) 31 | #ln -s $thisfolder/../semi-repeat-free/output/sim_reads.fastq output 32 | #ln -s $thisfolder/../semi-repeat-free/output/sim_reads_path.fasta output 33 | #ln -s $thisfolder/../semi-repeat-free/output/sim_reads_path.nodes output 34 | python3 ../scripts/generate_sim_reads.py \ 35 | --graph $inputgraph \ 36 | --fastq output/sim_reads.fastq \ 37 | --seed $(openssl enc -aes-256-ctr -pass pass:"semirepeatfree" -nosalt /dev/null | shuf -i0-4294967295 -n 1 --random-source=/dev/stdin) \ 38 | --path output/sim_reads_path.nodes \ 39 | --fasta output/sim_reads_path.fasta \ 40 | --coverage $coverage \ 41 | 2>> output/runexp_log.txt >> output/runexp_log.txt 42 | 43 | # 2. run the aligners 44 | $usrbintime $srfaligner \ 45 | -t $threads \ 46 | -g $inputgraph \ 47 | -c \ 48 | -f output/sim_reads.fastq \ 49 | -a output/srfaligner_alignments.gaf \ 50 | 2>> output/runexp_log.txt >> output/runexp_log.txt 51 | 52 | $usrbintime $srfchainer \ 53 | -t $threads \ 54 | -g $inputgraph \ 55 | -c \ 56 | -f output/sim_reads.fastq \ 57 | -a output/srfchainer_alignments.gaf \ 58 | 2>> output/runexp_log.txt >> output/runexp_log.txt 59 | 60 | cat output/sim_reads.fastq | cut -d' ' -f1 > output/sim_reads_fixed_header.fastq 61 | $usrbintime $graphchainer \ 62 | -t $threads \ 63 | -g $inputgraph \ 64 | -f output/sim_reads_fixed_header.fastq \ 65 | -a output/graphchainer_alignments.gaf \ 66 | 2>> output/runexp_log.txt >> output/runexp_log.txt 67 | 68 | $usrbintime $minigraph \ 69 | -t $threads \ 70 | -c \ 71 | -x lr \ 72 | $inputgraph \ 73 | output/sim_reads.fastq \ 74 | -o output/minigraph_alignments.gaf \ 75 | 2>> output/runexp_log.txt >> output/runexp_log.txt 76 | 77 | $usrbintime $minichain \ 78 | -t $threads \ 79 | -c $inputgraph \ 80 | output/sim_reads.fastq \ 81 | -o 
output/minichain_alignments.gaf \ 82 | 2>> output/runexp_log.txt >> output/runexp_log.txt 83 | 84 | # 3. pick first alignment 85 | for alignment in "srfaligner_alignments.gaf" "srfchainer_alignments.gaf" "graphchainer_alignments.gaf" "minigraph_alignments.gaf" "minichain_alignments.gaf" 86 | do 87 | awk '{if (found[$1] == "1") \ 88 | {} \ 89 | else 90 | {found[$1]="1"; print}}' \ 91 | output/$alignment > output/best_$alignment 92 | done 93 | 94 | # 4. validate and plot results 95 | for alignment in best_srfaligner best_srfchainer best_graphchainer minigraph minichain 96 | do 97 | python3 ../scripts/compute_summary.py \ 98 | -t 3 \ 99 | --graph $inputgraph \ 100 | --fastq output/sim_reads.fastq \ 101 | --path output/sim_reads_path.nodes \ 102 | --fasta output/sim_reads_path.fasta \ 103 | --alignments output/${alignment}_alignments.gaf \ 104 | --metrics output/metrics_${alignment}.mts 105 | done 106 | wait $(jobs -p) 107 | 108 | python3 ../scripts/compute_metrics.py \ 109 | --output-name output/results \ 110 | --summaries output/*.mts \ 111 | --summaries-names $(basename -s ".mts" -a output/*.mts | sed 's/metrics_//') 112 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/input/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algbio/SRFAligner/c698b10ce1b3f5ec20ef5f218f6d0259eebf7911/experiments/aligner-evaluation/input/.gitkeep -------------------------------------------------------------------------------- /experiments/aligner-evaluation/input/covid19_100_acc.txt: -------------------------------------------------------------------------------- 1 | MT370921 2 | MT358688 3 | NC_045512 4 | MT370999 5 | MT251977 6 | MT345836 7 | MT263424 8 | MT334572 9 | MT246477 10 | MN997409 11 | MT370973 12 | MT344945 13 | MT345880 14 | MT246478 15 | MT114416 16 | MT345801 17 | MT328033 18 | LC542809 19 | MT293218 20 | MT370906 21 | MT259248 22 
| MT334531 23 | MT350263 24 | MT371017 25 | MT350263 26 | MT322407 27 | LC528232 28 | MT246459 29 | MT291832 30 | MT263406 31 | MT358732 32 | MT293207 33 | MT326065 34 | MT326104 35 | MT325580 36 | MT350263 37 | MT345841 38 | MT325571 39 | MN996527 40 | MT370949 41 | MT358734 42 | MT192759 43 | MT370879 44 | MT259228 45 | MT370872 46 | MT358401 47 | MT291835 48 | MT370948 49 | MT358675 50 | MT358680 51 | NC_045512 52 | MT322399 53 | MT326174 54 | MT039887 55 | MT259236 56 | MT246461 57 | MT370886 58 | MT370880 59 | MT334565 60 | MT114419 61 | MT370943 62 | MT358648 63 | MT344959 64 | MT344961 65 | MT370899 66 | MT358652 67 | MT322419 68 | MT325580 69 | MT322404 70 | MT370988 71 | MT259277 72 | MT370911 73 | MT370862 74 | MT370983 75 | MT263423 76 | MT322400 77 | MT370990 78 | MT345803 79 | MT334536 80 | MT263437 81 | MT334573 82 | MT358716 83 | MT344955 84 | MN938384 85 | MT292575 86 | MT358637 87 | MT334557 88 | MT345870 89 | MT350249 90 | MT263391 91 | MT358656 92 | MT350263 93 | MT350236 94 | MT370889 95 | MT308696 96 | MT256924 97 | MT370846 98 | MN988713 99 | MT350237 100 | MT322415 101 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/mems/runexp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # remember to run `conda activate aligner-evaluation` 3 | set -e 4 | set -o pipefail 5 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 6 | cd $thisfolder 7 | 8 | # executables' absolute paths/commands (make sure they work!) 
9 | BADREAD=$thisfolder/../../../tools/Badread/badread-runner.py 10 | GRAPHCHAINER=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 11 | graphaligner=$thisfolder/../../../tools/GraphAligner/bin/GraphAligner 12 | srfaligner=$thisfolder/../../../SRFAligner 13 | efgmemsaligner=$thisfolder/../../../efg-memsAligner 14 | usrbintime=/usr/bin/time 15 | 16 | # params 17 | inputgraph=$thisfolder/../input/covid19_100_iEFG_simplified.gfa 18 | coverage=1000 19 | threads=1 20 | 21 | # 0. setup 22 | mkdir output 23 | echo -n > output/runexp_log.txt 24 | set -a 25 | set +a 26 | 27 | # 1. simulate path and reads 28 | python3 ../scripts/generate_sim_reads.py \ 29 | --graph $inputgraph \ 30 | --fastq output/sim_reads.fastq \ 31 | --seed $(openssl enc -aes-256-ctr -pass pass:"semirepeatfree" -nosalt /dev/null | shuf -i0-4294967295 -n 1 --random-source=/dev/stdin) \ 32 | --path output/sim_reads_path.nodes \ 33 | --fasta output/sim_reads_path.fasta \ 34 | --coverage $coverage \ 35 | 2>> output/runexp_log.txt >> output/runexp_log.txt 36 | 37 | # 2. run the aligners 38 | $usrbintime $srfaligner \ 39 | -t $threads \ 40 | -g $inputgraph \ 41 | -i "N" \ 42 | -f output/sim_reads.fastq \ 43 | -a output/semi_repeat_free_alignments.gaf \ 44 | 2>> output/runexp_log.txt >> output/runexp_log.txt 45 | 46 | $usrbintime $graphaligner \ 47 | -t $threads \ 48 | -x "vg" \ 49 | -g $inputgraph \ 50 | -f output/sim_reads.fastq \ 51 | -a output/graphaligner_alignments.gaf \ 52 | 2>> output/runexp_log.txt >> output/runexp_log.txt 53 | 54 | $usrbintime $efgmemsaligner \ 55 | -g $inputgraph \ 56 | -f output/sim_reads.fastq \ 57 | -a output/efg_mems_alignments.gaf \ 58 | -w output \ 59 | 2>> output/runexp_log.txt >> output/runexp_log.txt 60 | 61 | # 3. 
pick first alignment 62 | for alignment in "semi_repeat_free_alignments.gaf" "graphaligner_alignments.gaf" "efg_mems_alignments.gaf" 63 | do 64 | awk '{if (found[$1] == "1") \ 65 | {} \ 66 | else 67 | {found[$1]="1"; print}}' \ 68 | output/$alignment > output/best_$alignment 69 | done 70 | 71 | # 4. validate and plot results 72 | for alignment in semi_repeat_free graphaligner efg_mems 73 | do 74 | python3 ../scripts/compute_summary.py \ 75 | -t 3 \ 76 | --graph $inputgraph \ 77 | --fastq output/sim_reads.fastq \ 78 | --path output/sim_reads_path.nodes \ 79 | --fasta output/sim_reads_path.fasta \ 80 | --alignments output/best_${alignment}_alignments.gaf \ 81 | --metrics output/metrics_${alignment}.mts 82 | done 83 | wait $(jobs -p) 84 | 85 | python3 ../scripts/compute_metrics.py \ 86 | --output-name output/results \ 87 | --summaries output/*.mts \ 88 | --summaries-names $(basename -s ".mts" -a output/*.mts | sed 's/metrics_//') 89 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/scripts/.env: -------------------------------------------------------------------------------- 1 | BADREAD=../../../tools/Badread/badread-runner.py 2 | GRAPHCHAINER=../../../tools/GraphChainer/bin/GraphChainer 3 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/scripts/compute_metrics.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser, RawTextHelpFormatter 2 | from numpy import linspace 3 | from matplotlib.pyplot import subplots, savefig 4 | 5 | sigma_range = list(linspace(0, 1, 1001)) 6 | 7 | 8 | def populate_metric_dicts(lines, col_idx, accuracy_dict, length_dict, distance=True): 9 | 10 | for sigma in sigma_range: 11 | accuracy_dict[sigma] = 0 12 | length_dict[sigma] = 0 13 | 14 | total_reads = 0 15 | total_length = 0 16 | 17 | for i, line in enumerate(lines): 18 | if i == 0: 19 | 
continue 20 | values = line.strip().split(',') 21 | edit_distance = float(values[col_idx]) 22 | read_len = float(values[-1]) 23 | truth_len = float(values[-2]) 24 | 25 | used_len = read_len if col_idx == 2 else truth_len 26 | total_reads += 1 27 | total_length += read_len 28 | for sigma in sigma_range: 29 | if used_len > 0: 30 | if distance: 31 | if edit_distance / used_len <= sigma: 32 | accuracy_dict[sigma] += 1 33 | length_dict[sigma] += read_len 34 | else: 35 | if edit_distance / used_len >= sigma: 36 | accuracy_dict[sigma] += 1 37 | length_dict[sigma] += read_len 38 | 39 | # converting counts to percentages 40 | for sigma in sigma_range: 41 | accuracy_dict[sigma] = float(accuracy_dict[sigma]) / total_reads * 100 42 | length_dict[sigma] = float(length_dict[sigma]) / total_length * 100 43 | 44 | 45 | def compute_metrics(args): 46 | 47 | # Metrics 48 | metrics = dict() 49 | for aligner, csv in zip(args.summaries_names, args.summaries): 50 | 51 | metrics[aligner] = { 52 | 'overlap': { 53 | 'accuracy': dict(), 54 | 'length': dict() 55 | }, 56 | 'truth': { 57 | 'accuracy': dict(), 58 | 'length': dict() 59 | }, 60 | 'read': { 61 | 'accuracy': dict(), 62 | 'length': dict() 63 | } 64 | } 65 | 66 | lines = open(csv).read().split('\n')[:-1] 67 | populate_metric_dicts( 68 | lines, 4, metrics[aligner]['overlap']['accuracy'], metrics[aligner]['overlap']['length'], distance=False 69 | ) 70 | populate_metric_dicts(lines, 3, metrics[aligner]['truth']['accuracy'], metrics[aligner]['truth']['length']) 71 | populate_metric_dicts(lines, 2, metrics[aligner]['read']['accuracy'], metrics[aligner]['read']['length']) 72 | 73 | # Values for tables (Hardcoded for now) 74 | first_delta = 0.1 75 | second_delta = 0.9500000000000001 76 | first_sigma = 0.1 77 | 78 | # Tables accuracy 79 | 80 | # Compute lines for each aligner 81 | aligners_lines_overlap_accuracy = "" 82 | aligners_lines_truth_read_accuracy = "" 83 | for aligner in args.summaries_names: 84 | aligners_lines_overlap_accuracy += 
f" & {aligner} & {round(metrics[aligner]['overlap']['accuracy'][first_delta], 2)}\\% & {round(metrics[aligner]['overlap']['accuracy'][second_delta], 2)}\\% \\\\\n" 85 | aligners_lines_truth_read_accuracy += f" & {aligner} & {round(metrics[aligner]['truth']['accuracy'][first_sigma], 2)}\\% & {round(metrics[aligner]['read']['accuracy'][first_sigma], 2)}\\% \\\\\n" 86 | 87 | tables_accuracy = f""" 88 | \\begin{{tabular}}{{|c | l | l | l |}} 89 | \\hline 90 | Dataset & \\multicolumn{{1}}{{c|}}{{Aligner}} & \\multicolumn{{2}}{{c|}}{{Correctly aligned}} \\\\ 91 | \\cline{{3-4}} & & \\multicolumn{{1}}{{c|}}{{$\\delta = {round(first_delta, 2)}$}} & \\multicolumn{{1}}{{c|}}{{$\\delta = {round(second_delta, 2)}$}} \\\\ 92 | \\hline\\hline 93 | \\multirow{{{len(args.summaries_names)}}}{{*}}{{{args.output_name}}}{aligners_lines_overlap_accuracy}\\hline 94 | \\end{{tabular}} 95 | 96 | \\begin{{tabular}}{{|c | l | l | l |}} 97 | \\hline 98 | Dataset & \\multicolumn{{1}}{{c|}}{{Aligner}} & \\multicolumn{{2}}{{c|}}{{Correctly aligned}} \\\\ 99 | \\cline{{3-4}} & & \\multicolumn{{1}}{{c|}}{{$\\sigma_{{truth}} = {round(first_sigma, 2)}$}} & \\multicolumn{{1}}{{c|}}{{$\\sigma_{{read}} = {round(first_sigma, 2)}$}} \\\\ 100 | \\hline\\hline 101 | \\multirow{{{len(args.summaries_names)}}}{{*}}{{{args.output_name}}}{aligners_lines_truth_read_accuracy}\\hline 102 | \\end{{tabular}} 103 | """ 104 | 105 | tex_file = open(f'{args.output_name}_accuracy.tex', 'w') 106 | tex_file.write(tables_accuracy) 107 | tex_file.close() 108 | 109 | # Plot accuracy 110 | fig, (overlap, truth, read) = subplots(3, 1) 111 | 112 | overlap.set_xlabel('Overlap criterion ($\delta$)') 113 | overlap.set_ylabel('% Correctly aligned') 114 | truth.set_xlabel('Edit distance criterion ($\sigma_{truth}$)') 115 | truth.set_ylabel('% Correctly aligned') 116 | read.set_xlabel('Edit distance criterion ($\sigma_{read}$)') 117 | read.set_ylabel('% Correctly aligned') 118 | 119 | for aligner in args.summaries_names: 120 | 
overlap.plot(sigma_range, metrics[aligner]['overlap']['accuracy'].values(), label=aligner) 121 | truth.plot(sigma_range, metrics[aligner]['truth']['accuracy'].values(), label=aligner) 122 | read.plot(sigma_range, metrics[aligner]['read']['accuracy'].values(), label=aligner) 123 | 124 | overlap.grid() 125 | overlap.legend() 126 | truth.grid() 127 | truth.legend() 128 | read.grid() 129 | read.legend() 130 | 131 | 132 | fig.set_figwidth(3.87) 133 | fig.set_figheight(11.5) 134 | fig.tight_layout(pad=1.15) 135 | 136 | savefig(f'{args.output_name}_accuracy.pdf') 137 | 138 | 139 | # Tables length 140 | 141 | # Compute lines for each aligner 142 | aligners_lines_overlap_length = "" 143 | aligners_lines_truth_read_length = "" 144 | for aligner in args.summaries_names: 145 | aligners_lines_overlap_length += f" & {aligner} & {round(metrics[aligner]['overlap']['length'][first_delta], 2)}\\% & {round(metrics[aligner]['overlap']['length'][second_delta], 2)}\\% \\\\\n" 146 | aligners_lines_truth_read_length += f" & {aligner} & {round(metrics[aligner]['truth']['length'][first_sigma], 2)}\\% & {round(metrics[aligner]['read']['length'][first_sigma], 2)}\\% \\\\\n" 147 | 148 | tables_length = f""" 149 | \\begin{{tabular}}{{|c | l | l | l |}} 150 | \\hline 151 | Dataset & \\multicolumn{{1}}{{c|}}{{Aligner}} & \\multicolumn{{2}}{{c|}}{{Good length}} \\\\ 152 | \\cline{{3-4}} & & \\multicolumn{{1}}{{c|}}{{$\\delta = {round(first_delta, 2)}$}} & \\multicolumn{{1}}{{c|}}{{$\\delta = {round(second_delta, 2)}$}} \\\\ 153 | \\hline\\hline 154 | \\multirow{{{len(args.summaries_names)}}}{{*}}{{{args.output_name}}}{aligners_lines_overlap_length}\\hline 155 | \\end{{tabular}} 156 | 157 | \\begin{{tabular}}{{|c | l | l | l |}} 158 | \\hline 159 | Dataset & \\multicolumn{{1}}{{c|}}{{Aligner}} & \\multicolumn{{2}}{{c|}}{{Good length}} \\\\ 160 | \\cline{{3-4}} & & \\multicolumn{{1}}{{c|}}{{$\\sigma_{{truth}} = {round(first_sigma, 2)}$}} & \\multicolumn{{1}}{{c|}}{{$\\sigma_{{read}} = 
{round(first_sigma, 2)}$}} \\\\ 161 | \\hline\\hline 162 | \\multirow{{{len(args.summaries_names)}}}{{*}}{{{args.output_name}}}{aligners_lines_truth_read_length}\\hline 163 | \\end{{tabular}} 164 | """ 165 | 166 | tex_file = open(f'{args.output_name}_length.tex', 'w') 167 | tex_file.write(tables_length) 168 | tex_file.close() 169 | 170 | 171 | 172 | # Plot length 173 | fig, (overlap, truth, read) = subplots(3, 1) 174 | 175 | overlap.set_xlabel('Overlap criterion ($\delta$)') 176 | overlap.set_ylabel('% Good length') 177 | truth.set_xlabel('Edit distance criterion ($\sigma_{truth}$)') 178 | truth.set_ylabel('% Good length') 179 | read.set_xlabel('Edit distance criterion ($\sigma_{read}$)') 180 | read.set_ylabel('% Good length') 181 | 182 | for aligner in args.summaries_names: 183 | overlap.plot(sigma_range, metrics[aligner]['overlap']['length'].values(), label=aligner) 184 | truth.plot(sigma_range, metrics[aligner]['truth']['length'].values(), label=aligner) 185 | read.plot(sigma_range, metrics[aligner]['read']['length'].values(), label=aligner) 186 | 187 | overlap.grid() 188 | overlap.legend() 189 | truth.grid() 190 | truth.legend() 191 | read.grid() 192 | read.legend() 193 | 194 | fig.set_figwidth(3.87) 195 | fig.set_figheight(11.5) 196 | fig.tight_layout(pad=1.15) 197 | 198 | savefig(f'{args.output_name}_length.pdf') 199 | 200 | 201 | if __name__ == '__main__': 202 | 203 | parser = ArgumentParser( 204 | description=''' 205 | Computes distance and overlap metrics based on summary files. 206 | Outputs plots and tables. 
207 | ''', 208 | formatter_class=RawTextHelpFormatter 209 | ) 210 | 211 | requiredNamed = parser.add_argument_group('required arguments') 212 | requiredNamed.add_argument('-n', '--output-name', type=str, help='Output base name', required=True) 213 | requiredNamed.add_argument( 214 | '-s', '--summaries', type=str, 215 | help='Input summary (csv( files used to compute metrics, one per aligner/configuration', 216 | required=True, nargs='+' 217 | ) 218 | requiredNamed.add_argument( 219 | '-sn', '--summaries-names', type=str, help='Name of aligners/configurations, used in plot/table legends', 220 | required=True, nargs='+' 221 | ) 222 | 223 | compute_metrics(parser.parse_args()) 224 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/scripts/compute_summary.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser, RawTextHelpFormatter 2 | from gzip import GzipFile 3 | from google.protobuf.internal.decoder import _DecodeVarint32 4 | from vg_pb2 import Alignment 5 | from numpy import cumsum 6 | from joblib import Parallel, delayed 7 | from edlib import align 8 | from bisect import bisect 9 | from re import split 10 | 11 | 12 | def load_graph(gfa_graph): 13 | 14 | vertex_labels, edges = dict(), dict() 15 | 16 | for line in open(gfa_graph).read().split('\n')[:-1]: 17 | if line[0] == 'S': 18 | str_id, label = line[1:].strip().split() 19 | vertex_labels[str_id] = label 20 | if line[0] == 'L': 21 | tail_str_id, _, head_str_id, _, _ = line[1:].strip().split() 22 | tail_id, head_id = tail_str_id, head_str_id 23 | if tail_id not in edges: 24 | edges[tail_id] = list() 25 | edges[tail_id].append(head_id) 26 | 27 | return vertex_labels, edges 28 | 29 | 30 | def get_read_info(read_label, read_header): 31 | 32 | s, t = map(int, read_header.split()[1].split(',')[-1].split('-')) 33 | is_reverse_comp = '-strand' in read_header.split()[1].split(',') 34 | 35 | 
return read_label, is_reverse_comp, s, t 36 | 37 | 38 | def load_reads_and_ref(fastq, fasta, path): 39 | 40 | fastq_lines = open(fastq).read().split('\n')[:-1] 41 | 42 | ref = open(fasta).readlines()[-1].strip() if fasta else '' 43 | ref_rev_comp = ''.join({'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}[b] for b in ref[::-1]) 44 | path = [s.strip().split()[1] for s in open(path).readlines()] if path else list() 45 | 46 | if path: 47 | reads = { 48 | read_header.strip().split()[0][1:]: get_read_info(read_label.strip(), read_header.strip()) 49 | for read_header, read_label in zip(fastq_lines[::4], fastq_lines[1::4]) 50 | } 51 | else: 52 | reads = { 53 | read_header.strip().split()[0][1:]: (read_label.strip(), False, -1, -1) 54 | for read_header, read_label in zip(fastq_lines[::4], fastq_lines[1::4]) 55 | } 56 | 57 | return reads, ref, ref_rev_comp, path 58 | 59 | 60 | def read_gam(gam_filename): 61 | 62 | with open(gam_filename, 'rb') as f: 63 | buf = GzipFile(fileobj=f).read() 64 | n = 0 65 | while n < len(buf): 66 | an, n = _DecodeVarint32(buf, n) 67 | for i in range(an): 68 | msg_len, n = _DecodeVarint32(buf, n) 69 | msg_buf = buf[n:n + msg_len] 70 | n += msg_len 71 | aln = Alignment() 72 | aln.ParseFromString(msg_buf) 73 | yield aln 74 | 75 | 76 | def parse_gam(raw_gam, vertex_labels): 77 | 78 | name = raw_gam.name.split()[0] 79 | rev_cnt = 0 80 | path = list() 81 | idx, n = 0, len(raw_gam.path.mapping) 82 | first_node_off, last_node_off = 0, 0 83 | 84 | seqs = list() 85 | 86 | for x in raw_gam.path.mapping: 87 | 88 | node_name = x.position.name 89 | if node_name =='': 90 | node_name = str(x.position.node_id) 91 | ll = vertex_labels[node_name] 92 | original_len = len(ll) 93 | 94 | if x.position.is_reverse: 95 | rev_cnt += 1 96 | if idx == 0: 97 | if rev_cnt > 0: 98 | first_node_off = original_len - x.position.offset 99 | else: 100 | first_node_off = x.position.offset 101 | 102 | if idx == n - 1: 103 | suma = sum(i.from_length for i in x.edit) 104 | if rev_cnt 
> 0: 105 | last_node_off = original_len - suma - (x.position.offset if idx == 0 else 0) 106 | else: 107 | last_node_off = suma + (x.position.offset if idx == 0 else 0) 108 | 109 | if idx == 0 and idx == n - 1: 110 | if rev_cnt > 0: 111 | ll = ll[last_node_off:first_node_off] 112 | else: 113 | ll = ll[first_node_off:last_node_off] 114 | elif idx == 0: 115 | if rev_cnt > 0: 116 | ll = ll[:first_node_off] 117 | else: 118 | ll = ll[first_node_off:] 119 | elif idx == n - 1: 120 | if rev_cnt > 0: 121 | ll = ll[last_node_off:] 122 | else: 123 | ll = ll[:last_node_off] 124 | 125 | if rev_cnt > 0: 126 | ll = ''.join({'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}[b] for b in ll[::-1]) 127 | 128 | path.append(node_name) 129 | seqs.append(ll) 130 | idx += 1 131 | 132 | seq = ''.join(seqs) 133 | 134 | return name, seq, len(path), rev_cnt, len(seq), path, first_node_off, last_node_off 135 | 136 | 137 | def load_gam(gam_filename, vertex_labels): 138 | 139 | ret = dict() 140 | 141 | for raw_gam in read_gam(gam_filename): 142 | a = parse_gam(raw_gam, vertex_labels) 143 | if a[0] not in ret: 144 | ret[a[0]] = a 145 | else: 146 | # take the longer one when multiple alignments 147 | if a[-4] > ret[a[0]][-4]: 148 | ret[a[0]] = a 149 | 150 | return ret 151 | 152 | 153 | def read_gaf(gaf_filename): 154 | 155 | for line in open(gaf_filename, 'r').read().split('\n')[:-1]: 156 | items = line.split('\t') 157 | raw_path = items[5] 158 | path = split('<|>', raw_path)[1:] 159 | rev = '<' in raw_path 160 | 161 | yield items[0], path, rev, int(items[7]), int(items[8]) 162 | 163 | 164 | def parse_gaf(raw_gaf, vertex_labels): 165 | 166 | id, path, rev, f_o, l_o = raw_gaf 167 | 168 | if rev: 169 | rev_cnt = len(path) 170 | first_node_off, last_node_off = len(vertex_labels[path[0]]) - f_o, len(vertex_labels[path[-1]]) - l_o 171 | else: 172 | rev_cnt = 0 173 | first_node_off, last_node_off = f_o, l_o 174 | 175 | 176 | seqs = list() 177 | n = len(path) 178 | 179 | for idx, node_id in 
enumerate(path): 180 | 181 | ll = vertex_labels[node_id] 182 | original_length = len(ll) 183 | 184 | if idx == 0 and idx == n - 1: 185 | if rev_cnt > 0: 186 | ll = ll[last_node_off:first_node_off] 187 | else: 188 | ll = ll[first_node_off:last_node_off] 189 | elif idx == 0: 190 | if rev_cnt > 0: 191 | ll = ll[:first_node_off] 192 | else: 193 | ll = ll[first_node_off:] 194 | elif idx == n - 1: 195 | if rev_cnt > 0: 196 | ll = ll[last_node_off:] 197 | else: 198 | ll = ll[:last_node_off] 199 | 200 | if rev_cnt > 0: 201 | ll = ''.join({'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}[b] for b in ll[::-1]) 202 | if idx < n - 1: 203 | last_node_off += original_length 204 | else: 205 | if idx < n - 1: 206 | last_node_off -= original_length 207 | 208 | seqs.append(ll) 209 | 210 | seq = ''.join(seqs) 211 | 212 | return id, seq, n, rev_cnt, len(seq), path, first_node_off, last_node_off 213 | 214 | 215 | def load_gaf(gaf_filename, vertex_labels): 216 | 217 | ret = dict() 218 | 219 | for raw_gaf in read_gaf(gaf_filename): 220 | a = parse_gaf(raw_gaf, vertex_labels) 221 | if a[0] not in ret: 222 | ret[a[0]] = a 223 | else: 224 | # take the longer one when multiple alignments 225 | if a[-4] > ret[a[0]][-4]: 226 | ret[a[0]] = a 227 | 228 | return ret 229 | 230 | 231 | def compute_overlap(read, node): 232 | return max(0, min(read[1], node[1]) - max(read[0], node[0])) 233 | 234 | 235 | def compute_summary(args): 236 | 237 | vertex_labels, edges = load_graph(args.graph) 238 | reads, ref_seq, ref_seq_rev_comp, ref_path = load_reads_and_ref(args.fastq, args.fasta, args.path) 239 | node_limits = list(cumsum([len(vertex_labels[v]) for v in ref_path])) 240 | 241 | for alignment_filename, metric in zip(args.alignments, args.metrics): 242 | if alignment_filename.endswith('.gam'): 243 | alignments = load_gam(alignment_filename, vertex_labels) 244 | elif alignment_filename.endswith('.gaf'): 245 | alignments = load_gaf(alignment_filename, vertex_labels) 246 | 247 | 248 | csv = open(metric, 
'w') 249 | csv.write('id,len_aln,ed_read,ed_true,overlap,len_truth,len_read\n') 250 | 251 | def compute_edit(read_id): 252 | 253 | read_label, reverse_read, s, t = reads[read_id] 254 | t = min(t, len(ref_seq)) 255 | true_seq = '' 256 | 257 | if s > 0 and t > 0: # SIMULATED READ 258 | 259 | if reverse_read: 260 | true_seq = ref_seq_rev_comp[s: t] 261 | s, t = len(ref_seq) - t, len(ref_seq) - s ## Transform to original coordinates 262 | else: 263 | true_seq = ref_seq[s: t] 264 | 265 | first_node_index, last_node_index = bisect(node_limits, s), bisect(node_limits, t - 1) 266 | read_path = ref_path[first_node_index:last_node_index + 1] 267 | read_path_intervals = [[0, len(vertex_labels[n])] for n in read_path] 268 | read_path_intervals[0][0] = s - (0 if first_node_index == 0 else node_limits[first_node_index - 1]) 269 | read_path_intervals[-1][-1] = t - (0 if last_node_index == 0 else node_limits[last_node_index - 1]) 270 | read_path_nodes = {read_path[i]: read_path_intervals[i] for i in range(len(read_path))} 271 | 272 | aln_seq = '' 273 | overlap = 0 274 | 275 | if read_id in alignments: 276 | 277 | a = alignments[read_id] 278 | aln_seq = a[1] 279 | 280 | if s > 0 and t > 0: 281 | 282 | first_node_off = a[-2] 283 | last_node_off = a[-1] 284 | reverse_alignment = a[3] > 0 285 | 286 | if reverse_alignment == reverse_read: # Only consider overlap if same direction paths 287 | 288 | alignment_path = a[5] 289 | alignment_path_intervals = [[0, len(vertex_labels[n])] for n in alignment_path] 290 | if reverse_alignment: 291 | alignment_path_intervals[0][-1] = first_node_off 292 | alignment_path_intervals[-1][0] = last_node_off 293 | else: 294 | alignment_path_intervals[0][0] = first_node_off 295 | alignment_path_intervals[-1][-1] = last_node_off 296 | 297 | 298 | alignment_path_nodes = {alignment_path[i]: alignment_path_intervals[i] for i in 299 | range(len(alignment_path))} 300 | overlap = sum(compute_overlap(read_path_nodes[x], alignment_path_nodes[x]) for x in a[5] if 301 
| x in read_path_nodes) 302 | 303 | row = list() 304 | row.append(read_id) 305 | row.append(len(aln_seq)) 306 | row.append(align(read_label, aln_seq, mode='NW')['editDistance']) 307 | row.append(align(true_seq, aln_seq, mode='NW')['editDistance']) 308 | row.append(overlap) 309 | row.append(len(true_seq)) 310 | row.append(len(read_label)) 311 | 312 | return row 313 | 314 | reads_ids = [id for id in reads] 315 | reads_n = len(reads_ids) 316 | block_size = 200 317 | n_blocks = reads_n // block_size + 1 318 | 319 | def compute_edit_kernel(tid): 320 | l, r = tid * reads_n // n_blocks, (tid + 1) * reads_n // n_blocks 321 | tmp = [] 322 | for i in range(l, r): 323 | tmp.append(compute_edit(reads_ids[i])) 324 | return tmp 325 | 326 | tmp_list = Parallel(n_jobs=args.threads, prefer="threads")( 327 | delayed(compute_edit_kernel)(t_idx) for t_idx in range(n_blocks)) 328 | processed_list = list() 329 | for c in tmp_list: 330 | processed_list += c 331 | 332 | for row in processed_list: 333 | csv.write(','.join(list(map(str, row))) + '\n') 334 | csv.close() 335 | 336 | 337 | if __name__ == '__main__': 338 | 339 | parser = ArgumentParser( 340 | description=''' 341 | Computes distances between alignments and input reads. 342 | If read are simulated it also computes overlaps and distances to truth. 
343 | ''', 344 | formatter_class=RawTextHelpFormatter 345 | ) 346 | 347 | requiredNamed = parser.add_argument_group('required arguments') 348 | requiredNamed.add_argument('-g', '--graph', type=str, help='Input gfa file', required=True) 349 | requiredNamed.add_argument('-fq', '--fastq', type=str, help='Input fastq file', required=True) 350 | requiredNamed.add_argument( 351 | '-als', '--alignments', type=str, help='Output gam/gaf files (with extension, each)', required=True, nargs='+' 352 | ) 353 | requiredNamed.add_argument( 354 | '-mts', '--metrics', type=str, help='Output csv files with metrics', required=True, nargs='+' 355 | ) 356 | 357 | parser.add_argument('-p', '--path', type=str, help='Output path file (node ids of selected path)') 358 | parser.add_argument('-fa', '--fasta', type=str, help='Output fasta of original path') 359 | parser.add_argument('-t', '--threads', type=int, help='Number of threads', default=30) 360 | 361 | compute_summary(parser.parse_args()) 362 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/scripts/generate_sim_reads.py: -------------------------------------------------------------------------------- 1 | from os import getenv 2 | from dotenv import load_dotenv 3 | from subprocess import run 4 | from argparse import ArgumentParser, RawTextHelpFormatter 5 | 6 | 7 | def generate_sim_reads(args): 8 | 9 | load_dotenv() 10 | badread = getenv('BADREAD') 11 | graphchainer = getenv('GRAPHCHAINER') 12 | 13 | run( 14 | f'{graphchainer} --generate-path --generate-path-seed {args.seed} -g {args.graph} -f {args.fasta} -a tmp.gam' 15 | .split() 16 | ) 17 | 18 | run(f'mv tmp.gam {args.path}'.split()) 19 | 20 | run( 21 | f'python {badread} simulate --identity 95,99,2.5 --seed {args.seed} --reference {args.fasta} ' 22 | f'--quantity {args.coverage}x --length 15000,13000 --error_model nanopore2023 --qscore_model nanopore2023 --junk_reads 0 --random_reads 0 ' 23 | f'--chimeras 0'.split(), 
24 | stdout=open(args.fastq, 'wb') 25 | ) 26 | 27 | 28 | if __name__ == '__main__': 29 | 30 | parser = ArgumentParser( 31 | description=''' 32 | Generates simulated reads from a random path of an input vg/gfa file using the Badread simulator. 33 | Badread parameters are fixed to --identity 95,99,2.5 --length 15000,13000 --error_model nanopore2023 34 | --qscore_model nanopore2023 --junk_reads 0 --random_reads 0 --chimeras 0. 35 | ''', 36 | formatter_class=RawTextHelpFormatter 37 | ) 38 | 39 | requiredNamed = parser.add_argument_group('required arguments') 40 | requiredNamed.add_argument('-g', '--graph', type=str, help='Input vg/gfa file', required=True) 41 | requiredNamed.add_argument('-fq', '--fastq', type=str, help='Output fastq file', required=True) 42 | 43 | parser.add_argument('-s', '--seed', type=int, help='Seed for random path generator and Badread', default=0) 44 | parser.add_argument( 45 | '-p', '--path', type=str, help='Output path file (node ids of selected path)', default='tmp.path' 46 | ) 47 | parser.add_argument('-fa', '--fasta', type=str, help='Output fasta of original path', default='tmp.fasta') 48 | parser.add_argument('-c', '--coverage', type=int, help='Coverage value given to Badread', default=15) 49 | 50 | generate_sim_reads(parser.parse_args()) 51 | 52 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/scripts/run_experiment.py: -------------------------------------------------------------------------------- 1 | from os import getenv 2 | from dotenv import load_dotenv 3 | from subprocess import run 4 | from argparse import ArgumentParser, RawTextHelpFormatter 5 | 6 | 7 | def run_experiment(args): 8 | load_dotenv() 9 | 10 | srfaligner = getenv('SRFALIGNER') 11 | run( 12 | f'{srfaligner} -t {args.threads} -f {args.fastq} -g {args.graph} ' 13 | f'-a {args.alignment}_srfaligner.gaf -w output'.split() 14 | ) 15 | 16 | # graphaligner = getenv('GRAPHALIGNER') 17 | # run( 18 | # 
f'{graphaligner} -t {args.threads} -x vg -f {args.fastq} -g {args.graph} --verbose ' 19 | # f'-a {args.alignment}_graphaligner.gaf'.split() 20 | # ) 21 | 22 | # minigraph = getenv('MINIGRAPH') 23 | # run( 24 | # f'{minigraph} -t {args.threads} -c {args.graph} {args.fastq}'.split(), 25 | # stdout=open(f'{args.alignment}_minigraph.gaf', 'wb') 26 | # ) 27 | 28 | # srfchainer = getenv('SRFCHAINER') 29 | # run( 30 | # f'{srfchainer} -t {args.threads} -f {args.fastq} -g {args.graph} ' 31 | # f'-a {args.alignment}_srfchainer.gaf -w output'.split() 32 | # ) 33 | 34 | # graphchainer = getenv('GRAPHCHAINER') 35 | # run( 36 | # f'{graphchainer} -t {args.threads} -f {args.fastq} -g {args.graph} ' 37 | # f'-a {args.alignment}_graphchainer.gam '.split() 38 | # ) 39 | 40 | # minichain = getenv('MINICHAIN') 41 | # run( 42 | # f'{minichain} -t {args.threads} -c {args.graph} {args.fastq}'.split(), 43 | # stdout=open(f'{args.alignment}_minichain.gaf', 'wb') 44 | # ) 45 | 46 | 47 | if __name__ == '__main__': 48 | 49 | parser = ArgumentParser( 50 | description=''' 51 | Run aligners GraphAligner, minigraph, and srfaligner on the vg(?)/gfa and fastq files specified. 
52 | ''', 53 | formatter_class=RawTextHelpFormatter 54 | ) 55 | 56 | requiredNamed = parser.add_argument_group('required arguments') 57 | requiredNamed.add_argument('-g', '--graph', type=str, help='Input vg/gfa file', required=True) 58 | requiredNamed.add_argument('-fq', '--fastq', type=str, help='Input fastq file', required=True) 59 | requiredNamed.add_argument( 60 | '-a', '--alignment', type=str, help='Output gam/gaf files (without extension)', required=True 61 | ) 62 | 63 | parser.add_argument('-t', '--threads', type=int, help='Number of threads', default=30) 64 | 65 | run_experiment(parser.parse_args()) 66 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/semi-repeat-free/runexp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # remember to run `conda activate aligner-evaluation` 3 | set -e 4 | set -o pipefail 5 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 6 | cd $thisfolder 7 | 8 | # executable's absolute paths/commands (make sure they work!) 9 | BADREAD=$thisfolder/../../../tools/Badread/badread-runner.py 10 | GRAPHCHAINER=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 11 | graphaligner=$thisfolder/../../../tools/GraphAligner/bin/GraphAligner 12 | srfaligner=$thisfolder/../../../SRFAligner 13 | usrbintime=/usr/bin/time 14 | 15 | # params 16 | inputgraph=$thisfolder/../input/chr22_iEFG.gfa 17 | coverage=30 18 | threads=64 19 | 20 | # 0. setup 21 | mkdir output 22 | echo -n > output/runexp_log.txt 23 | set -a 24 | set +a 25 | 26 | # 1. 
simulate path and reads 27 | python3 ../scripts/generate_sim_reads.py \ 28 | --graph $inputgraph \ 29 | --fastq output/sim_reads.fastq \ 30 | --seed $(openssl enc -aes-256-ctr -pass pass:"semirepeatfree" -nosalt /dev/null | shuf -i0-4294967295 -n 1 --random-source=/dev/stdin) \ 31 | --path output/sim_reads_path.nodes \ 32 | --fasta output/sim_reads_path.fasta \ 33 | --coverage $coverage \ 34 | 2>> output/runexp_log.txt >> output/runexp_log.txt 35 | 36 | # 2. run the aligners 37 | $usrbintime $srfaligner \ 38 | -t $threads \ 39 | -g $inputgraph \ 40 | -f output/sim_reads.fastq \ 41 | -a output/semi_repeat_free_alignments.gaf \ 42 | 2>> output/runexp_log.txt >> output/runexp_log.txt 43 | 44 | $usrbintime $graphaligner \ 45 | -t $threads \ 46 | -x "vg" \ 47 | -g $inputgraph \ 48 | -f output/sim_reads.fastq \ 49 | -a output/graphaligner_alignments.gaf \ 50 | 2>> output/runexp_log.txt >> output/runexp_log.txt 51 | 52 | for o in 1 5 10 50 100 53 | do 54 | $usrbintime $srfaligner \ 55 | -t $threads \ 56 | -g $inputgraph \ 57 | -o $o \ 58 | -f output/sim_reads.fastq \ 59 | -a output/srf_edge_longest_${o}_alignments.gaf \ 60 | 2>> output/runexp_log.txt >> output/runexp_log.txt 61 | done 62 | 63 | # 3. pick first alignment 64 | for alignment in "semi_repeat_free_alignments.gaf" "graphaligner_alignments.gaf" 65 | do 66 | awk '{if (found[$1] == "1") \ 67 | {} \ 68 | else 69 | {found[$1]="1"; print}}' \ 70 | output/$alignment > output/best_$alignment 71 | done 72 | for o in 1 5 10 50 100 73 | do 74 | awk '{if (found[$1] == "1") \ 75 | {} \ 76 | else 77 | {found[$1]="1"; print}}' \ 78 | output/srf_edge_longest_${o}_alignments.gaf > output/best_srf_edge_longest_${o}_alignments.gaf 79 | done 80 | 81 | # 4. 
validate and plot results 82 | for alignment in semi_repeat_free graphaligner 83 | do 84 | python3 ../scripts/compute_summary.py \ 85 | -t 3 \ 86 | --graph $inputgraph \ 87 | --fastq output/sim_reads.fastq \ 88 | --path output/sim_reads_path.nodes \ 89 | --fasta output/sim_reads_path.fasta \ 90 | --alignments output/best_${alignment}_alignments.gaf \ 91 | --metrics output/metrics_${alignment}.mts 92 | done 93 | for o in 1 5 10 50 100 94 | do 95 | python3 ../scripts/compute_summary.py \ 96 | -t 3 \ 97 | --graph $inputgraph \ 98 | --fastq output/sim_reads.fastq \ 99 | --path output/sim_reads_path.nodes \ 100 | --fasta output/sim_reads_path.fasta \ 101 | --alignments output/best_srf_edge_longest_${o}_alignments.gaf \ 102 | --metrics output/metrics_srf_edge_longest_${o}.mts 103 | done 104 | wait $(jobs -p) 105 | 106 | python3 ../scripts/compute_metrics.py \ 107 | --output-name output/results \ 108 | --summaries output/*.mts \ 109 | --summaries-names $(basename -s ".mts" -a output/*.mts | sed 's/metrics_//') 110 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/vg-comparison/runexp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # remember to run `conda activate aligner-evaluation` 3 | set -e 4 | set -o pipefail 5 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 6 | cd $thisfolder 7 | 8 | # executable's absolute paths/commands (make sure they work!) 
9 | BADREAD=$thisfolder/../../../tools/Badread/badread-runner.py 10 | GRAPHCHAINER=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 11 | graphaligner=$thisfolder/../../../tools/GraphAligner/bin/GraphAligner 12 | usrbintime=/usr/bin/time 13 | 14 | # params 15 | inputgraphs=($thisfolder/../input/chr22_iEFG.gfa $thisfolder/../input/chr22_vg.gfa $thisfolder/../input/chr22_vg_msa.gfa) 16 | graphnames=(iefg vg vgmsa) 17 | coverage=30 18 | threads=64 19 | 20 | # 0. setup 21 | mkdir output 22 | echo -n > output/runexp_log.txt 23 | 24 | # 1. simulate path and reads 25 | for ((g = 0 ; g < ${#inputgraphs[@]} ; g++)) 26 | do 27 | # TODO maybe parallelize this? 28 | inputgraph="${inputgraphs[$g]}" 29 | graphname="${graphnames[$g]}" 30 | python3 ../scripts/generate_sim_reads.py \ 31 | --graph $inputgraph \ 32 | --fastq output/sim_reads_$graphname.fastq \ 33 | --seed $(openssl enc -aes-256-ctr -pass pass:"semirepeatfree" -nosalt /dev/null | shuf -i0-4294967295 -n 1 --random-source=/dev/stdin) \ 34 | --path output/sim_reads_path_$graphname.nodes \ 35 | --fasta output/sim_reads_path_$graphname.fasta \ 36 | --coverage $coverage \ 37 | 2>> output/runexp_log.txt >> output/runexp_log.txt 38 | done 39 | 40 | # 3. 
run the aligners on each dataset 41 | for ((g1 = 0 ; g1 < ${#inputgraphs[@]} ; g1++)) 42 | do 43 | for ((g2 = 0 ; g2 < ${#inputgraphs[@]} ; g2++)) 44 | do 45 | inputgraph="${inputgraphs[$g1]}" 46 | graphname="${graphnames[$g1]}" 47 | datasetname="${graphnames[$g2]}" 48 | reads=output/sim_reads_$datasetname.fastq 49 | $usrbintime $graphaligner \ 50 | -t $threads \ 51 | -x vg \ 52 | -g $inputgraph \ 53 | -f $reads \ 54 | -a output/${graphname}_graph_${datasetname}_reads_alignments.gaf \ 55 | 2>> output/runexp_log.txt >> output/runexp_log.txt 56 | 57 | # pick first alignment 58 | awk '{if (found[$1] == "1") \ 59 | {} \ 60 | else 61 | {found[$1]="1"; print}}' \ 62 | output/${graphname}_graph_${datasetname}_reads_alignments.gaf \ 63 | > output/best_${graphname}_graph_${datasetname}_reads_alignments.gaf 64 | done 65 | done 66 | 67 | # 4. validate and plot results 68 | for ((g1 = 0 ; g1 < ${#inputgraphs[@]} ; g1++)) 69 | do 70 | for ((g2 = 0 ; g2 < ${#inputgraphs[@]} ; g2++)) 71 | do 72 | inputgraph="${inputgraphs[$g1]}" 73 | graphname="${graphnames[$g1]}" 74 | if [ $g1 -eq $g2 ] 75 | then 76 | # we have ground truth 77 | reads=output/sim_reads_$graphname.fastq 78 | path=output/sim_reads_path_$graphname.nodes 79 | fasta=output/sim_reads_path_$graphname.fasta 80 | alignments=output/best_${graphname}_graph_${graphname}_reads_alignments.gaf 81 | python3 ../scripts/compute_summary.py \ 82 | -t 3 \ 83 | --graph $inputgraph \ 84 | --fastq $reads \ 85 | --path $path \ 86 | --fasta $fasta \ 87 | --alignments $alignments \ 88 | --metrics output/metrics_${graphname}_graph_${graphname}_reads.mts 89 | else 90 | # we do not have ground truth 91 | datasetname="${graphnames[$g2]}" 92 | reads=output/sim_reads_$datasetname.fastq 93 | alignments=output/best_${graphname}_graph_${datasetname}_reads_alignments.gaf 94 | python3 ../scripts/compute_summary.py \ 95 | -t 3 \ 96 | --graph $inputgraph \ 97 | --fastq $reads \ 98 | --alignments $alignments \ 99 | --metrics 
output/metrics_${graphname}_graph_${datasetname}_reads.mts 100 | fi 101 | done 102 | done 103 | wait $(jobs -p) 104 | 105 | for ((g = 0 ; g < ${#inputgraphs[@]} ; g++)) 106 | do 107 | graphname="${graphnames[$g]}" 108 | python3 ../scripts/compute_metrics.py \ 109 | --output-name output/results_${graphname}_graph \ 110 | --summaries output/metrics_${graphname}_graph_*.mts \ 111 | --summaries-names $(basename -s ".mts" -a output/metrics_${graphname}_graph_*.mts | sed 's/metrics_//') 112 | done 113 | -------------------------------------------------------------------------------- /experiments/aligner-evaluation/vg-unchop/runexp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # remember to run `conda activate aligner-evaluation` 3 | set -e 4 | set -o pipefail 5 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 6 | cd $thisfolder 7 | 8 | # executables' absolute paths/commands (make sure they work!) 9 | BADREAD=$thisfolder/../../../tools/Badread/badread-runner.py 10 | GRAPHCHAINER=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 11 | graphaligner=$thisfolder/../../../tools/GraphAligner/bin/GraphAligner 12 | graphchainer=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 13 | minigraph=$thisfolder/../../../tools/minigraph/minigraph 14 | minichain=$thisfolder/../../../tools/minichain/minichain 15 | usrbintime=/usr/bin/time 16 | 17 | # params 18 | inputgraph=$thisfolder/../input/chr22_vg_unchop.gfa 19 | coverage=30 20 | threads=64 21 | 22 | ## 0. setup 23 | mkdir output 24 | echo -n > output/runexp_log.txt 25 | 26 | # 1. 
simulate path and reads 27 | python3 ../scripts/generate_sim_reads.py \ 28 | --graph $inputgraph \ 29 | --fastq output/sim_reads.fastq \ 30 | --seed $(openssl enc -aes-256-ctr -pass pass:"semirepeatfree" -nosalt /dev/null | shuf -i0-4294967295 -n 1 --random-source=/dev/stdin) \ 31 | --path output/sim_reads_path.nodes \ 32 | --fasta output/sim_reads_path.fasta \ 33 | --coverage $coverage \ 34 | 2>> output/runexp_log.txt >> output/runexp_log.txt 35 | 36 | # 2. run the aligners 37 | cat output/sim_reads.fastq | cut -d' ' -f1 > output/sim_reads_fixed_header.fastq 38 | $usrbintime $graphaligner \ 39 | -t $threads \ 40 | -x "vg" \ 41 | -g $inputgraph \ 42 | -f output/sim_reads_fixed_header.fastq \ 43 | -a output/graphaligner_alignments.gaf \ 44 | 2>> output/runexp_log.txt >> output/runexp_log.txt 45 | 46 | $usrbintime $graphchainer \ 47 | -t $threads \ 48 | -g $inputgraph \ 49 | -f output/sim_reads_fixed_header.fastq \ 50 | -a output/graphchainer_alignments.gaf \ 51 | 2>> output/runexp_log.txt >> output/runexp_log.txt 52 | 53 | $usrbintime $minigraph \ 54 | -t $threads \ 55 | -c \ 56 | -x lr \ 57 | $inputgraph \ 58 | output/sim_reads.fastq \ 59 | -o output/minigraph_alignments.gaf \ 60 | 2>> output/runexp_log.txt >> output/runexp_log.txt 61 | 62 | $usrbintime $minichain \ 63 | -t $threads \ 64 | -c $inputgraph \ 65 | output/sim_reads.fastq \ 66 | -o output/minichain_alignments.gaf \ 67 | 2>> output/runexp_log.txt >> output/runexp_log.txt 68 | 69 | # 3. pick first alignment 70 | for alignment in "graphaligner_alignments.gaf" "graphchainer_alignments.gaf" 71 | do 72 | awk '{if (found[$1] == "1") \ 73 | {} \ 74 | else 75 | {found[$1]="1"; print}}' \ 76 | output/$alignment > output/best_$alignment 77 | done 78 | 79 | # 4. 
validate and plot results 80 | for alignment in best_graphaligner best_graphchainer minigraph minichain 81 | do 82 | python3 ../scripts/compute_summary.py \ 83 | -t 8 \ 84 | --graph $inputgraph \ 85 | --fastq output/sim_reads.fastq \ 86 | --path output/sim_reads_path.nodes \ 87 | --fasta output/sim_reads_path.fasta \ 88 | --alignments output/${alignment}_alignments.gaf \ 89 | --metrics output/metrics_${alignment}.mts & 90 | done 91 | wait $(jobs -p) 92 | 93 | python3 ../scripts/compute_metrics.py \ 94 | --output-name output/results \ 95 | --summaries output/*.mts \ 96 | --summaries-names $(basename -s ".mts" -a output/*.mts | sed 's/metrics_//') 97 | -------------------------------------------------------------------------------- /experiments/graph-statistics/README.md: -------------------------------------------------------------------------------- 1 | # Graph statistics 2 | Analisys of iEFGs or DAGs in GFA format containing only forward `L` links. Command `./run.sh graph.gfa` computes: 3 | 4 | - the number of nodes, edges, and base pairs of the graph 5 | - the ⌈N50⌉ metric, that is, the smallest k such that ≥50% of the bases are covered by segments of length ≤k 6 | - the length of the longest node 7 | - the maximum number H of nodes in a block, if `graph.gfa` is an iEFG 8 | - the width (size of smallest path set covering the nodes, using `GraphChainer`) 9 | - the number of branching nodes (outdegree ≥ 2), choices (sum of outdegrees ≥ 2), the branching factor (maximum number of branching nodes in any path), and the number of maximal paths 10 | 11 | ## Prerequisites 12 | The scripts used depend on [`octave-cli`](https://octave.org/), `gawk`, `awk`, and `GraphChainer`: the last one is expected to be found in folder `tools/GraphChainer/bin` from the root of this repository, and can be obtained with command 13 | ```console 14 | git submodule update --init --recursive ../../tools/GraphChainer 15 | ``` 16 | and by following the compilation instructions in its README. 
17 | 18 | ## Limitations 19 | The computation of the ⌈N50⌉ is quite slow and inefficient. The computation of the number of maximal paths is also not efficient and keeps in memory many large numbers. 20 | -------------------------------------------------------------------------------- /experiments/graph-statistics/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 3 | then 4 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 5 | fi 6 | 7 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 8 | 9 | echo -e "nodes:\t$($thisfolder/scripts/compute-nodes.sh $1)" 10 | echo -e "edges:\t$($thisfolder/scripts/compute-edges.sh $1)" 11 | echo -e "bases:\t$($thisfolder/scripts/compute-bps.sh $1)" 12 | echo -e "N50:\t$($thisfolder/scripts/compute-N.sh $1 50 2>> /dev/null)" 13 | echo -e "longest node:\t$($thisfolder/scripts/compute-longest-node.sh $1)" 14 | echo -e "H:\t$($thisfolder/scripts/compute-efg-H.sh $1)" 15 | echo -e "width:\t$($thisfolder/scripts/compute-width.sh $1)" 16 | echo -e "branching nodes:\t$($thisfolder/scripts/compute-branching-nodes.sh $1)" 17 | echo -e "choices:\t$($thisfolder/scripts/compute-choices.sh $1)" 18 | echo -e "branching factor:\t$($thisfolder/scripts/compute-branching-factor.sh $1 2>> /dev/null)" 19 | echo -e "paths:\t$($thisfolder/scripts/compute-paths.sh $1 2>> /dev/null)" 20 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-N.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 2 ]] || [[ $# -gt 2 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa [0-100]" ; exit 1 6 | fi 7 | 8 | # Configuration 9 | quantiles="$(echo $2 / 100 | 
bc -l)" 10 | 11 | lengths=$(mktemp) 12 | >&2 echo "generated tmp file $lengths (should be removed automatically on exit)..." 13 | trap '{ rm -f -- "$lengths"; }' EXIT 14 | 15 | grep "^S" $1 | awk '{for (i = 1 ; i <= length($3) ; i++) {print length($3)}}' | sort -n > $lengths 16 | 17 | octaveout=$(octave-cli --eval "format long; x = dlmread('$lengths'); q = quantile(x, [0.00 $quantiles]); N = ceil(q([2:length(q)])); disp(N)" | tr -s " " "\t" | cut -f2-) 18 | 19 | echo -e "$octaveout" 20 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-bps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 6 | fi 7 | 8 | grep "^S" $1 | cut -f3 | tr -d "\n" | wc -c 9 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-branching-factor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) dag.gfa" ; exit 1 6 | fi 7 | 8 | tempdir=$(mktemp -d) 9 | >&2 echo "generated temp dir $tempdir (should be removed automatically on exit)..." 
10 | trap '{ rm -rf -- "$tempdir"; }' EXIT 11 | 12 | # sort nodes and edges in topological order 13 | grep "^L" $1 | awk '{print $2,$4}' > "$tempdir/edges" 14 | tsort "$tempdir/edges" > "$tempdir/topological_nodes" 15 | 16 | awk '(NR == FNR) { inneighbors[$2]=inneighbors[$2] FS $1; next } { \ 17 | n=split(inneighbors[$1],array); \ 18 | for (i=1;i<=n;++i) \ 19 | print array[i],$1}' \ 20 | "$tempdir/edges" "$tempdir/topological_nodes" > "$tempdir/topological_edges" 21 | 22 | # find all branching nodes 23 | awk '{outdegree[$1]+=1;} \ 24 | END \ 25 | { \ 26 | for (key in outdegree) { \ 27 | if (outdegree[key] > 1) {print key} \ 28 | }; \ 29 | };' "$tempdir/edges" > "$tempdir/branching" 30 | 31 | # use branching nodes and topological edges to compute the branching factor 32 | gawk --bignum '(NR == FNR) { branching[$1]=1; next } { \ 33 | if (bfactor[$1] + branching[$1] > bfactor[$2]) \ 34 | {bfactor[$2]=bfactor[$1] + branching[$1]} \ 35 | } END { \ 36 | max=0 ; \ 37 | for (key in bfactor) { \ 38 | if (bfactor[key] > max) { \ 39 | max=bfactor[key] \ 40 | } \ 41 | } ; 42 | print max 43 | }' "$tempdir/branching" "$tempdir/topological_edges" 44 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-branching-nodes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 6 | fi 7 | 8 | grep "^L" $1 | \ 9 | awk '{outdegree[$2]+=1;} \ 10 | END \ 11 | { \ 12 | for (key in outdegree) { \ 13 | if (outdegree[key] > 1) {result+=1} \ 14 | }; \ 15 | print result \ 16 | };' 17 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-choices.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 
1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 6 | fi 7 | 8 | grep "^L" $1 | \ 9 | awk '{outdegree[$2]+=1;} \ 10 | END \ 11 | { \ 12 | for (key in outdegree) { \ 13 | if (outdegree[key] > 1) {result+=outdegree[key]} \ 14 | }; \ 15 | print result \ 16 | };' 17 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-edges.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 6 | fi 7 | 8 | grep "^L" $1 | wc -l 9 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-efg-H.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) efg.gfa" ; exit 1 6 | fi 7 | 8 | head -n 3 $1 | grep "^B" | tr "\t" "\n" | tail -n +2 | sort -nr | head -n 1 9 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-longest-node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 6 | fi 7 | 8 | grep "^S" $1 | awk '{if (length($3) > max) {max = length($3)}} END {print max}' 9 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-nodes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 6 | fi 7 | 8 | 
grep "^S" $1 | wc -l 9 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-paths.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) connected-dag.gfa" ; exit 1 6 | fi 7 | 8 | tempdir=$(mktemp -d) 9 | >&2 echo "generated temp dir $tempdir (should be removed automatically on exit)..." 10 | trap '{ rm -rf -- "$tempdir"; }' EXIT 11 | 12 | # compute topological nodes and edges 13 | grep "^L" $1 | awk '{print $2,$4}' > "$tempdir/edges" 14 | tsort "$tempdir/edges" > "$tempdir/topological_nodes" 15 | 16 | awk '(NR == FNR) { inneighbors[$2]=inneighbors[$2] FS $1; next } { \ 17 | n=split(inneighbors[$1],array); \ 18 | for (i=1;i<=n;++i) \ 19 | print array[i],$1}' \ 20 | "$tempdir/edges" "$tempdir/topological_nodes" > "$tempdir/topological_edges" 21 | 22 | # find sinks in the graph 23 | awk '{nodes[$2]=1; nonsinks[$1]=1} \ 24 | END \ 25 | { \ 26 | for (key in nodes) { \ 27 | if (nonsinks[key] != 1) {print key} \ 28 | }; \ 29 | };' "$tempdir/edges" > "$tempdir/sinks" 30 | 31 | # use edges and sinks to compute paths to each node 32 | gawk --bignum '(NR == FNR) { sinks[$1]=1; next } { \ 33 | if (paths[$1] > 0) \ 34 | {paths[$2]+=paths[$1]} \ 35 | else \ 36 | {paths[$2]+=1}} 37 | END { \ 38 | result=0 ; \ 39 | for (key in sinks) { \ 40 | result+=paths[key]} ; \ 41 | printf "%e", result ; \ 42 | print ""\ 43 | }' "$tempdir/sinks" "$tempdir/topological_edges" 44 | -------------------------------------------------------------------------------- /experiments/graph-statistics/scripts/compute-width.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 1 ]] || [[ $# -gt 1 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) graph.gfa" ; exit 1 6 | fi 7 | 8 | thisfolder=$( cd -- 
"$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 9 | graphchainer=$thisfolder/../../../tools/GraphChainer/bin/GraphChainer 10 | 11 | $graphchainer --graph-statistics -g $1 -f fakereads.fastq -a fakealns.gaf 2>&1 | tail -n 1 | cut -d' ' -f8 12 | -------------------------------------------------------------------------------- /experiments/msa-validation/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation of sequence-to-graph aligners on simulated reads 2 | Script to compare the output of `vcf2multialign -H` on a chromosome of the [T2T-CHM13v2.0](https://github.com/marbl/CHM13) + [Phased T2T 1KGP panel](https://zenodo.org/records/7612953#.Y-8VD3bMJPY) dataset with that of [`bcftools consensus`](https://samtools.github.io/bcftools/howtos/consensus-sequence.html). Specifically, each sequence of the MSA generated with `vcf2multialign` is stripped of gaps and compared to the corresponding output of `bcftools consensus` using [`edlib-aligner`](https://github.com/Martinsos/edlib). Each Levenshtein distance (NW) is collected and the min, max, and average distance are computed. 3 | 4 | ## Prerequisites 5 | The script depends on `awk` and `bcftools`. 
It expects to find executable `edlib-aligner` and `seqtk` in folders `tools/edlib/meson-build` and `tools/seqtk/seqtk` from the root of the repository, which can be compiled as follows: 6 | ```console 7 | git submodule update --init ../../tools/edlib 8 | make -C ../../tools/edlib 9 | git submodule update --init ../../tools/seqtk 10 | make -C ../../tools/seqtk 11 | ``` 12 | 13 | ## Usage 14 | Usage is as follows: 15 | ```console 16 | ./validate.sh MSA.fasta reference.fasta variations.vcf.gz threads output_stats.txt 17 | ``` 18 | where `MSA.fasta` is the MSA computed by `vcf2multialign -H`, `reference.fasta` contains the reference chromosome, `variations.vcf.gz` contains the variations to the chromosome, threads is a positive number of threads, and `output_stats.txt` is the desired output file. For the chromosome 22 built in experiment `vcf-to-hapl-to-efg`, the command to run is 19 | ```console 20 | ./validate.sh ../vcf-to-hapl-to-efg/output/sampled_haplotypes.a2m ../vcf-to-hapl-to-efg/chr22_uppercase.fasta ../vcf-to-hapl-to-efg/1KGP.CHM13v2.0.chr22.recalibrated.snp_indel.pass.phased.vcf.gz 64 output_stats.txt 21 | ``` 22 | -------------------------------------------------------------------------------- /experiments/msa-validation/validate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | if [[ $# -lt 5 ]] || [[ $# -gt 5 ]] 4 | then 5 | >&2 echo "Usage: $(basename $0) MSA.fasta reference.fasta variations.vcf.gz threads output_stats.txt" ; exit 1 6 | fi 7 | 8 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 9 | export bcftools=bcftools 10 | export edlib=../../tools/edlib/meson-build/edlib-aligner 11 | export seqtk=../../tools/seqtk/seqtk 12 | 13 | msa=$1 14 | export ref=$2 15 | export vcf=$3 16 | threads=$4 17 | outputfile=$5 18 | 19 | if 
[ -f "$outputfile" ] 20 | then 21 | >&2 echo "$outputfile already exists!"; exit 1 22 | fi 23 | 24 | function align_destroy () { # $1 is temp file containing gapless fasta sequence, to be destroyed 25 | header=$(head -n 1 $1) 26 | sample=$(echo ${header:1} | cut -d'-' -f1) 27 | haplot=$(echo ${header:1} | cut -d'-' -f2) 28 | echo -en "$sample-$haplot\t" 29 | $edlib \ 30 | $1 \ 31 | <($bcftools consensus -f $ref -s $sample -H $haplot $vcf 2>> /dev/null | $seqtk seq -U /dev/stdin) \ 32 | | grep "^#0" | cut -d' ' -f2 33 | rm $1 34 | } 35 | # make align_destroy visible to GNU parallel 36 | export SHELL=$(type -p bash) 37 | export -f align_destroy 38 | 39 | while read -u 3 header 40 | do 41 | while [[ $(find . -type f -name "tmp*" | wc -l) -gt $threads ]] 42 | do 43 | sleep 1s 44 | done 45 | read -u 3 entry 46 | sample=$(echo ${header:1} | cut -d'-' -f1) 47 | haplot=$(echo ${header:1} | cut -d'-' -f2) 48 | if [ "$sample" == "REF" ] ; then continue ; fi 49 | gaplessentry=$(echo "$entry" | tr -d "-") 50 | 51 | tempfile=$(mktemp -p $thisfolder) 52 | echo "$header" > $tempfile 53 | echo "$gaplessentry" >> $tempfile 54 | echo "$tempfile" 55 | done 3<$msa | parallel --keep-order --jobs $threads align_destroy > $outputfile 56 | 57 | awk 'BEGIN { min = 2^1024 ; max = 0 } \ 58 | { sum += $2 ; count++ ; \ 59 | if ($2 < min) min = $2 ; \ 60 | if ($2 > max) max = $2 } \ 61 | END { print "average:\t"sum / count ; print " min:\t"min ; print " max:\t"max }' \ 62 | $outputfile >> $outputfile 63 | 64 | tail -n 3 $outputfile 65 | 66 | -------------------------------------------------------------------------------- /experiments/short-read-exact-match/README.md: -------------------------------------------------------------------------------- 1 | # Exact match of short reads on the chr22 iEFG 2 | We compare the short-read exact matching solution of `efg-locate` on the chromosome 22 iEFG built with the pipeline at `experiments/vcf-to-hapl-to-efg` to that of `bwa` on the T2T-CHM13 linear 
reference for chromosome 22. After checking out the *Prerequisites* and *Datasets* sections, run the script `runexp.sh` (requires 42G of disk space) and check `output/runexp_log.txt` for the results. 3 | 4 | ## Prerequisites 5 | Script `runexp.sh` expects `efg-locate`, `bwa`, and `seqtk` to be located in folder `tools/efg-locate`, `tools/bwa`, and `tools/seqtk` from the root of this repository. You can download and compile them with the following commands (executed from this folder): 6 | ```console 7 | make -C ../../tools/efg-locate 8 | git submodule update --init ../../tools/{bwa,seqtk} 9 | make -C ../../tools/bwa 10 | make -C ../../tools/seqtk 11 | ``` 12 | 13 | ## Datasets 14 | Obtain the graph (262M) with command 15 | ```console 16 | wget https://zenodo.org/records/15112649/files/chr22_iEFG.gfa.gz?download=1 --output-document=input/chr22_iEFG.gfa.gz && gunzip input/chr22_iEFG.gfa.gz 17 | ``` 18 | the reads (24GB) with 19 | ```console 20 | wget 'https://cs.helsinki.fi/group/gsa/panvc-founders/scalability-experiment/reads/ERR1025645_sample05_1.fq.gz' --output-document=input/ERR1025645_sample05_1.fq.gz 21 | ``` 22 | and the chromosome 22 reference (1GB) with 23 | ```console 24 | wget "https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0.fa.gz" --output-document=input/chm13v2.0.fa.gz 25 | seqtk subseq input/chm13v2.0.fa.gz <(echo "chr22") | seqtk seq -U - > input/chr22_uppercase.fasta 26 | ``` 27 | -------------------------------------------------------------------------------- /experiments/short-read-exact-match/input/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/algbio/SRFAligner/c698b10ce1b3f5ec20ef5f218f6d0259eebf7911/experiments/short-read-exact-match/input/.gitkeep -------------------------------------------------------------------------------- /experiments/short-read-exact-match/runexp.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -o pipefail 4 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 5 | cd $thisfolder 6 | 7 | # executable's absolute paths/commands (make sure they work!) 8 | bwa=$thisfolder/../../tools/bwa/bwa 9 | efglocate=$thisfolder/../../tools/efg-locate/efg-locate 10 | seqtk=$thisfolder/../../tools/seqtk/seqtk 11 | usrbintime=/usr/bin/time 12 | 13 | # params 14 | inputgraph=$thisfolder/input/chr22_iEFG.gfa 15 | inputreference=$thisfolder/input/chr22_uppercase.fasta 16 | inputreads=$thisfolder/input/ERR1025645_sample05_1.fq.gz 17 | threads=16 18 | 19 | # 0. setup 20 | mkdir output 21 | echo -n > output/runexp_log.txt 22 | 23 | echo "# 1. match in the chr22 iEFG (efg-locate)" >> output/runexp_log.txt 24 | /usr/bin/time $efglocate \ 25 | --reverse-complement \ 26 | --threads $threads \ 27 | $inputgraph \ 28 | <(seqtk seq -A $inputreads) \ 29 | output/efg_locate_matches.gaf \ 30 | >> output/runexp_log.txt 2>> output/runexp_log.txt 31 | 32 | echo "# 2. match in the chr22 reference (bwa)" >> output/runexp_log.txt 33 | ln -s $inputreference $thisfolder/output/ref.fasta 34 | /usr/bin/time $bwa index output/ref.fasta 2>> output/runexp_log.txt 35 | /usr/bin/time $bwa aln -n 0 -k 0 -l 100 -t $threads \ 36 | output/ref.fasta \ 37 | $inputreads \ 38 | > output/bwa_matches.sai 2>> output/runexp_log.txt 39 | /usr/bin/time $bwa samse \ 40 | output/ref.fasta \ 41 | output/bwa_matches.sai \ 42 | $inputreads \ 43 | > output/bwa_matches.sam 2>> output/runexp_log.txt 44 | 45 | echo "# 3. 
compute stats" >> output/runexp_log.txt 46 | echo -n "efg-locate took" $(grep system output/runexp_log.txt | head -n 1 | cut -d' ' -f3 | cut -d'e' -f1) >> output/runexp_log.txt 47 | echo " and matched" $(cut -f1 output/efg_locate_matches.gaf | uniq | sort | uniq | wc -l) "reads" >> output/runexp_log.txt 48 | 49 | echo -n "bwa took" $(grep system output/runexp_log.txt | tail -n 3 | cut -d' ' -f3 | cut -d'e' -f1 | tr "\n" " ") >> output/runexp_log.txt 50 | echo " and matched" $(cat output/bwa_matches.sam | awk '{if ($6 == "100M") {print}}' | cut -f1 | uniq | sort | uniq | wc -l) "reads" >> output/runexp_log.txt 51 | -------------------------------------------------------------------------------- /experiments/vcf-to-hapl-to-efg/README.md: -------------------------------------------------------------------------------- 1 | # vcf-to-hapl-to-efg experiment 2 | Pipeline to build an indexable Elastic Founder Graph from a VCF file plus reference. After checking out the 'Prerequisites' and 'Datasets and obtaining the input data' sections of this document, you can build the chromosome 22 iEFG with command 3 | ```console 4 | /usr/bin/time ./sample-and-build-efg-heuristic.sh -f chr22_uppercase.fasta -v 1KGP.CHM13v2.0.chr22.recalibrated.snp_indel.pass.phased.vcf.gz -c chr22 -s 2504 -M 8 -t 64 5 | ``` 6 | This requires ~600 GB of disk space and ~600 GB of RAM. 
If you want to generate an iEFG from fewer haplotypes, for example 20, run command 7 | ```console 8 | /usr/bin/time ./sample-and-build-efg-heuristic.sh -f chr22_uppercase.fasta -v 1KGP.CHM13v2.0.chr22.recalibrated.snp_indel.pass.phased.vcf.gz -c chr22 -s 20 -t 64 9 | ``` 10 | Finally, the iEFG can be stripped of its paths with command 11 | ```console 12 | grep -v "^P" output/efg-unsimplified.gfa > chr22_iEFG.gfa 13 | ``` 14 | 15 | ## Prerequisites 16 | The pipeline expects [`bcftools`](https://www.htslib.org/download/) and [`vcf2multialign`](https://github.com/tsnorri/vcf2multialign) to be found in the search path variable `PATH`, and expects `founderblockgraph` to be in folder `tools/founderblockgraph` from the root of this repository. You can get and compile `founderblockgraph` with 17 | ```console 18 | git submodule update --init --recursive ../../tools/founderblockgraphs 19 | make -C ../../tools/founderblockgraphs 20 | ``` 21 | 22 | In case you obtain `bcftools` and `vcf2multialign` in a different way, modify lines 11-13 of `sample-and-build-efg-heuristic.sh` accordingly. To manipulate FASTQ files in the next section, we also use [`seqtk`](https://github.com/lh3/seqtk). 23 | 24 | ## Datasets and obtaining the input data 25 | We use the [phased T2T 1KGP panel](https://zenodo.org/records/7612953) (Version 1.0) by Joseph Lalli, based on [T2T-CHM13v2.0](https://github.com/marbl/CHM13). 
We can easily obtain chromosome 22 with 26 | ```console 27 | wget "https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0.fa.gz" 28 | seqtk subseq chm13v2.0.fa.gz <(echo "chr22") | seqtk seq -U - > chr22_uppercase.fasta 29 | ``` 30 | and obtain the chr22 variations using commands 31 | ```console 32 | wget "https://zenodo.org/records/7612953/files/phased_T2T_panel.tar" 33 | tar -xvf phased_T2T_panel.tar phased_T2T_panel/1KGP.CHM13v2.0.chr22.recalibrated.snp_indel.pass.phased.vcf.gz --strip-components 1 34 | tar -xvf phased_T2T_panel.tar phased_T2T_panel/1KGP.CHM13v2.0.chr22.recalibrated.snp_indel.pass.phased.vcf.gz.tbi --strip-components 1 35 | ``` 36 | 37 | ## iEFG validation 38 | You can validate that the iEFG was correctly built from the MSA rows with `efg-locate`: 39 | ```console 40 | ../../tools/efg-locate/efg-locate -t 64 --overwrite \ 41 | output/efg-unsimplified.gfa \ 42 | <(awk '{if (substr($0, 0, 1) == ">") {print} else {gsub(/-/, "", $0); print $0}}' \ 43 | output/sampled_haplotypes.a2m) \ 44 | /dev/null 45 | ``` 46 | 47 | ## Versions of the software used 48 | | Tool | Version | 49 | | ----------------- | ---------------- | 50 | | founderblockgraph | 439ef67 (GitHub) | 51 | | vcf2multialign | 1.2.2 23f3f42 | 52 | | seqtk | 1.4-r130-dirty | 53 | | bcftools | 1.20 | 54 | 55 | ## todo 56 | - [] gzip some of the intermediate files 57 | -------------------------------------------------------------------------------- /experiments/vcf-to-hapl-to-efg/sample-and-build-efg-heuristic.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # sample some haplotypes from reference + VCF file, build MSA with vcf2multialign, and build iEFG with founderblockgraph 3 | set -e 4 | set -o pipefail 5 | 6 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # 
https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 7 | 8 | # 9 | # executables 10 | # 11 | founderblockgraph=$thisfolder/../../tools/founderblockgraphs/founderblockgraph 12 | vcf2multialign=vcf2multialign 13 | bcftools=bcftools 14 | 15 | # 16 | # setup 17 | # 18 | threads=8 19 | heuristicsubset="" 20 | 21 | # parsing command line options 22 | print_help() 23 | { 24 | echo "usage: $0 [-f reference.fa] [-v variation.vcf] [-c chromosome] [-s nhapl] [-M Mrows] [-t threads]" 25 | echo " -h --help: show this screen" 26 | echo " -f reference: reference (FASTA format) used to generate MSA" 27 | echo " -v variation: variation (VCF format, possibly gzipped) used to generate MSA" 28 | echo " -c chromosome: chromosome name used to generate MSA (see vcf2multialign)" 29 | echo " -s nhapl: sample the haplotype list and keep 'nhapl' random haplotypes" 30 | echo " -M Mrows: set parameter --heuristic-subset='Mrows' for iEFG construction (see founderblockgraph)" 31 | echo " -t threads: threads used in iEFG construction (see founderblockgraph)" 32 | } 33 | 34 | # https://stackoverflow.com/questions/12022592/how-can-i-use-long-options-with-the-bash-getopts-builtin 35 | for arg in "$@"; do 36 | shift 37 | case "$arg" in 38 | '--help') set -- "$@" '-h' ;; 39 | *) set -- "$@" "$arg" ;; 40 | esac 41 | done 42 | 43 | argf=false ; argc=false ; argv=false ; args=false ; argM=false ; 44 | OPTIND=1 45 | while getopts "hf:v:c:r:s:M:t:" option; do 46 | case $option in 47 | f) # fasta + vcf input : fasta 48 | argf=true 49 | reference="$(realpath $OPTARG)" ;; 50 | v) # fasta + vcf input : vcf 51 | argv=true 52 | vcf="$(realpath $OPTARG)" ;; 53 | c) # fasta + vcf input : chromosome for vcf2multialign 54 | argc=true 55 | chromosome="$OPTARG" ;; 56 | s) # number of samples 57 | args=true 58 | nhapl="$OPTARG" ;; 59 | M) # heuristic subset 60 | argM=true 61 | heuristicsubset="--heuristic-subset $OPTARG" ;; 62 | t) # threads 63 | 
argt=true 64 | threads="$OPTARG" ;; 65 | h) # display help 66 | print_help 67 | exit;; 68 | \?) # invalid option 69 | echo "Error: Invalid option" 70 | exit;; 71 | esac 72 | done 73 | shift $(expr $OPTIND - 1) # remove options from positional parameters 74 | 75 | if [ "$argf" = false ] || [ "$argc" = false ] || [ "$argv" = false ] || [ "$args" = false ] 76 | then 77 | print_help 78 | exit 79 | fi 80 | 81 | outputfolder=$thisfolder/output 82 | mkdir $outputfolder 83 | if [[ $? -gt 0 ]] ; then echo "Output directory $outputfolder already exists!" ; exit 1 ; fi 84 | log=$outputfolder/log.txt 85 | stats=$outputfolder/stats.txt 86 | cd $outputfolder 87 | 88 | # 89 | # randomness source 90 | # 91 | # https://www.gnu.org/software/coreutils/manual/html_node/Random-sources.html#Random-sources 92 | get_seeded_random() 93 | { 94 | seed="$1" 95 | openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \ 96 | /dev/null 97 | } 98 | 99 | # 100 | # 1. sampling the vcf 101 | # 102 | echo -n "Sampling the vcf..." 103 | # TODO I am expecting the haplotypes to be after the 9th field specified by the VCF header, is this always correct? 104 | $bcftools query -l $vcf > haplotypes 105 | cat haplotypes | shuf -n $nhapl --random-source=<(get_seeded_random "semi-repeat-free") > sampled_haplotypes 106 | $bcftools view --samples-file sampled_haplotypes $vcf > sampled_haplotypes.vcf 107 | # FIX for the T2T 1KGP data and vcf2multialign, see https://github.com/tsnorri/vcf2multialign/issues/5 108 | sed -i 's/e+06//g' sampled_haplotypes.vcf 109 | sed -i 's/INFO= MSA 114 | # 115 | echo -n "Computing the MSA..." 
116 | /usr/bin/time $vcf2multialign \ 117 | --founder-sequences=50 \ 118 | --input-reference=$reference \ 119 | --input-variants=sampled_haplotypes.vcf \ 120 | --chromosome $chromosome \ 121 | --output-graph variant.graph >> $log 2>> $log 122 | 123 | /usr/bin/time $vcf2multialign \ 124 | --input-reference=$reference \ 125 | --input-graph variant.graph \ 126 | -H \ 127 | -s sampled_haplotypes.a2m >> $log 2>> $log 128 | echo " done." 129 | 130 | # 131 | # 3. MSA -> iEFG 132 | # 133 | echo -n "Building the indexable Elastic Founder Graph..." 134 | /usr/bin/time $founderblockgraph --elastic --gfa --ignore-chars="N" --output-paths --threads=$threads --input=sampled_haplotypes.a2m --output=efg-unsimplified.gfa $heuristicsubset >> $log 2>> $log 135 | echo " done." 136 | -------------------------------------------------------------------------------- /test/graph1.gfa: -------------------------------------------------------------------------------- 1 | X 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 2 | B 1 3 4 2 2 1 2 2 5 2 2 1 1 3 4 1 1 2 1 1 1 3 1 2 2 5 1 1 1 1 1 1 6 1 2 1 3 1 2 1 1 3 | S 484932 AAGATTGTGCCACTGCACTCCAGGC 4 | S 484965 CCTGTCCTAGGCCAG 5 | S 484998 GTGGAAGACAGAAAT 6 | S 484989 AAACCCAAAGCAGAC 7 | S 484992 AAAGCTCCTGAAGGG 8 | S 484988 GGGGAAACAGCCAC 9 | S 484951 GAGGCTTCTACAC 10 | S 484963 CCGGTGAGAAGATTAG 11 | S 484975 AGAAGGCCGAGGCAGAGAAT 12 | S 484978 TGCTTGAACCTGGGAGGTGGAGGTTGCAGTGAGCCAAGATCGTGCCACTGC 13 | S 484947 AATCTGATTAATTGCGAGGAGTCTTTG 14 | S 484954 GAAAGCAATATTAT 15 | S 484984 AATTTATTAAATACTCACTGTG 16 | S 484964 GTCATCAAAACCTG 17 | S 484948 AATCTGATGAATTGCGAGGAGTCGTTG 18 | S 484952 GAGGCTTCCACAC 19 | S 484934 GAAATAATAATAAAAAAAAAAG 20 | S 484967 CTGCAGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCTGAGGCGGGTGGATCAC 21 | S 484941 GATAATACTGAGAAG 22 | S 484999 GTGGAAAACAGAAAAT 23 | S 484985 AATATATTAAATACTCACTGTG 24 | S 484955 CAGAATTGAATTTAA 25 | S 484968 
CTGCAGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGTGGATCAC 26 | S 484939 GAAATAATAATATAAAAAAAAG 27 | S 484931 AAGATCGTGCCACTGCACTCCAGGC 28 | S 484983 CCTCCAGCCTGGGCAGCAGATCGAGACTCCATCTCAAAAA 29 | S 484949 AATCTGTTGAATTGCGAGGAGTCTTTG 30 | S 484950 AATCTGATGAATTGCGAGGAGTCTTTA 31 | S 484943 TCATTTGGGGAGGCT 32 | S 484929 GAAAATTAGCTGGGCATGGTGGCGGGCGCCTATAGTCC 33 | S 484940 ATAAAGATCTTAA 34 | S 484991 AAAGCTCCTCAAGGG 35 | S 484969 CTGCAGTGGCTCACACCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGTGGATCAC 36 | S 484979 CCTCCAGCCTGGGCAACAGATCGAGACTCCGTCTCAAAAA 37 | S 484996 GTTTGATTAATAATG 38 | S 484994 GTTTACTTAATAATG 39 | S 484987 GGCAAATACCATTTT 40 | S 484922 TTACTTTTTTAAAGAT 41 | S 484923 AAAGATCTTGAC 42 | S 484982 CCTCCAGCCTGGGCAACAGATTGAGACTCCGTCTCAAAAA 43 | S 484995 GTTTGCTGAATAATG 44 | S 484990 AAACCCAAAACAGAC 45 | S 484958 GCAAATTTATTTGGGCA 46 | S 484936 GAAATAATAATAAAAAAAAAAAG 47 | S 484938 GAAATAATAATAATAAAAAAAG 48 | S 484980 CCTCCAGCCTGGGCAACAGATCGAGACTCCATCTCAAAAA 49 | S 484953 GAAAGCAATATTGT 50 | S 484942 GTGTAGGTTGAGG 51 | S 484927 GAAAATTAGCTGGGCATGGTGGCGGGCGCCTGTAGTCC 52 | S 484981 CCTCCAGCCTGGGCAACAGATCGAGACTCCGTCTCAAAAAA 53 | S 484962 CCGGTGAGAAGTTTAG 54 | S 484937 GAAATAATAATAAAAAAAAAG 55 | S 484924 CGGGCGCGGTGGCTCACACCTGTAATCCCAGCATTTTGGGAGGCCGAGGC 56 | S 484973 TCAAAAATCAGCTGGGTGTGGTGGCG 57 | S 484970 GAGGTCAGGAGTTCAAGACCAGCCTGGCCAAGATGGTGAAACCCCATCTCTACTAAAAA 58 | S 484925 CGGGCGCGGTGGCTCACACCTGTAATCCCAGCATTTAGGGAGGCCGAGGC 59 | S 484976 AGAAGGCTGAGGCAGAGAAT 60 | S 484926 AGGCGGATCACGAGGTCAGGAGATCGAGACCATCCTGGCTAATACGGTGAAACCCTGTCTCTACTAAAAATACA 61 | S 484945 CAGGATACAGACTT 62 | S 484957 GCAAATTCAATTGGGCA 63 | S 484961 CCAGTTTGGAAGTC 64 | S 484997 GTGGAAGACAGAAAAT 65 | S 484930 CAGGCAGAGCTTGCAGTGAGCA 66 | S 484933 TGGGCAACTGAGCGAGACTCCATCT 67 | S 484946 AATCTGATGAATTGCGAGGAGTCTTTG 68 | S 484974 AGCACCTGTAATCCCATCTACTC 69 | S 484956 GCAAATTTAATTGGGCA 70 | S 484971 GAGGTCAGGAGTCAAGACCAGCCTGGCCAAGATGGTGAAACCCCATCTCTACTAAAAA 71 | S 484972 
GAGGTCAGGAATTCAAGACCAGCCTGGCCAAGATGGTGAAACCCCATCTCTACTAAAAA 72 | S 484966 CTGCAGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGTGGATCAC 73 | S 484959 GAGGTTTAGTCTGT 74 | S 484993 GTTTGCTTAATAATG 75 | S 484935 GAAATAATAATAATAAAAAAAAG 76 | S 484944 TTGAATGTTAAGTT 77 | S 484986 GGCAAATACAATTTT 78 | S 484977 TGCTTGAACCTGGGAGGTGGAGGTTGCAGTGAGCCAAGATCGTGTCACTGC 79 | S 484960 ATTGGATATGGGGAA 80 | S 484928 GAAAATTAGCTGGGCATGGTGGCGGGCGCTTGTAGTCC 81 | S 485000 AAATGCATATAATCAATA 82 | L 484922 + 484923 + 0M 83 | L 484923 + 484924 + 0M 84 | L 484923 + 484925 + 0M 85 | L 484924 + 484926 + 0M 86 | L 484925 + 484926 + 0M 87 | L 484926 + 484927 + 0M 88 | L 484926 + 484928 + 0M 89 | L 484926 + 484929 + 0M 90 | L 484927 + 484930 + 0M 91 | L 484928 + 484930 + 0M 92 | L 484929 + 484930 + 0M 93 | L 484930 + 484931 + 0M 94 | L 484930 + 484932 + 0M 95 | L 484931 + 484933 + 0M 96 | L 484932 + 484933 + 0M 97 | L 484933 + 484934 + 0M 98 | L 484933 + 484935 + 0M 99 | L 484933 + 484936 + 0M 100 | L 484933 + 484937 + 0M 101 | L 484933 + 484938 + 0M 102 | L 484933 + 484939 + 0M 103 | L 484934 + 484940 + 0M 104 | L 484935 + 484940 + 0M 105 | L 484936 + 484940 + 0M 106 | L 484937 + 484940 + 0M 107 | L 484938 + 484940 + 0M 108 | L 484939 + 484940 + 0M 109 | L 484940 + 484941 + 0M 110 | L 484941 + 484942 + 0M 111 | L 484942 + 484943 + 0M 112 | L 484943 + 484944 + 0M 113 | L 484944 + 484945 + 0M 114 | L 484945 + 484946 + 0M 115 | L 484945 + 484947 + 0M 116 | L 484945 + 484948 + 0M 117 | L 484945 + 484949 + 0M 118 | L 484945 + 484950 + 0M 119 | L 484946 + 484951 + 0M 120 | L 484946 + 484952 + 0M 121 | L 484947 + 484951 + 0M 122 | L 484948 + 484951 + 0M 123 | L 484949 + 484951 + 0M 124 | L 484950 + 484951 + 0M 125 | L 484951 + 484953 + 0M 126 | L 484951 + 484954 + 0M 127 | L 484952 + 484953 + 0M 128 | L 484953 + 484955 + 0M 129 | L 484954 + 484955 + 0M 130 | L 484955 + 484956 + 0M 131 | L 484955 + 484957 + 0M 132 | L 484955 + 484958 + 0M 133 | L 484956 + 484959 + 0M 134 | L 484957 + 484959 + 0M 135 | L 
484958 + 484959 + 0M 136 | L 484959 + 484960 + 0M 137 | L 484960 + 484961 + 0M 138 | L 484961 + 484962 + 0M 139 | L 484961 + 484963 + 0M 140 | L 484962 + 484964 + 0M 141 | L 484963 + 484964 + 0M 142 | L 484964 + 484965 + 0M 143 | L 484965 + 484966 + 0M 144 | L 484965 + 484967 + 0M 145 | L 484965 + 484968 + 0M 146 | L 484965 + 484969 + 0M 147 | L 484966 + 484970 + 0M 148 | L 484966 + 484971 + 0M 149 | L 484966 + 484972 + 0M 150 | L 484967 + 484970 + 0M 151 | L 484968 + 484970 + 0M 152 | L 484969 + 484970 + 0M 153 | L 484970 + 484973 + 0M 154 | L 484971 + 484973 + 0M 155 | L 484972 + 484973 + 0M 156 | L 484973 + 484974 + 0M 157 | L 484974 + 484975 + 0M 158 | L 484974 + 484976 + 0M 159 | L 484975 + 484977 + 0M 160 | L 484975 + 484978 + 0M 161 | L 484976 + 484977 + 0M 162 | L 484977 + 484979 + 0M 163 | L 484977 + 484980 + 0M 164 | L 484977 + 484982 + 0M 165 | L 484977 + 484983 + 0M 166 | L 484978 + 484981 + 0M 167 | L 484979 + 484984 + 0M 168 | L 484980 + 484984 + 0M 169 | L 484981 + 484985 + 0M 170 | L 484982 + 484984 + 0M 171 | L 484983 + 484984 + 0M 172 | L 484984 + 484986 + 0M 173 | L 484984 + 484987 + 0M 174 | L 484985 + 484986 + 0M 175 | L 484986 + 484988 + 0M 176 | L 484987 + 484988 + 0M 177 | L 484988 + 484989 + 0M 178 | L 484988 + 484990 + 0M 179 | L 484989 + 484991 + 0M 180 | L 484989 + 484992 + 0M 181 | L 484990 + 484991 + 0M 182 | L 484991 + 484993 + 0M 183 | L 484991 + 484994 + 0M 184 | L 484991 + 484995 + 0M 185 | L 484991 + 484996 + 0M 186 | L 484992 + 484993 + 0M 187 | L 484993 + 484997 + 0M 188 | L 484993 + 484998 + 0M 189 | L 484993 + 484999 + 0M 190 | L 484994 + 484997 + 0M 191 | L 484995 + 484997 + 0M 192 | L 484996 + 484997 + 0M 193 | L 484997 + 485000 + 0M 194 | L 484998 + 485000 + 0M 195 | L 484999 + 485000 + 0M 196 | -------------------------------------------------------------------------------- /test/graph2.gfa: -------------------------------------------------------------------------------- 1 | X 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 
19 20 21 22 23 24 25 26 27 28 29 30 2 | B 1 1 1 1 1 6 1 1 1 1 1 1 1 1 1 1 3 2 1 2 1 1 1 2 1 1 1 1 1 1 1 3 | S 252283 GCATTTCGTCATTTCA 4 | S 252282 TTCTTCTCATTGTT 5 | S 252290 TTCATTTCATTTCATCATTTCATCTTTTCATCTC 6 | S 252285 ATCATCTCATTTCATCTT 7 | S 252299 TCATTTCACTTCATCATTTCATTTC 8 | S 252287 TTCATCTCATTTCATCATTTCATCTTTTCATCTC 9 | S 252284 TCATTTCATCATTTCACTTCATCTCATCATTTCATCTCATGATTTCATCTC 10 | S 252296 TTCATCATTTCATCATTTCATATCATTT 11 | S 252294 CATCATTTCATTTCATTATTTCATCATTTA 12 | S 252297 CTTCATTTCACCAT 13 | S 252289 TTCATCTCATTTCATCTTTTCATCTC 14 | S 252298 TTGATCTTCTCATT 15 | S 252293 CAATTCGTCATTT 16 | S 252295 ATCATTTAACTTCAT 17 | S 252292 GTCATTTCATTTGATCATTTCATTTCAT 18 | S 252281 TATTTCATTTCAGCAT 19 | S 252291 TTCATCTCATTTCATTTCATCATTCATCTTTTCATCTC 20 | S 252286 TTCATCTCATTTCATTTCATCATTTCATCTTTTCATCTC 21 | S 252288 TTCATCTCATCATTTCATCTTTTCATCTC 22 | S 252307 TTTTGTTTCATTATA 23 | S 252317 CCTTTTCATTTCATCATTTCATTTC 24 | S 252309 CCTTTTCATTTCATTTCATCAT 25 | S 252304 TATTTCATCATTCCATTTCATCATTTCA 26 | S 252310 TTCATTTCATATCATTTCCTCA 27 | S 252302 TATTTCATCATTCCATTTCATCATTTCATTACATTTCATCATTTCA 28 | S 252316 CGTTTCATTACA 29 | S 252318 ATCATTTCATATCATTTCCTCA 30 | S 252300 ATCATTTCATTTCCTCATTTCATTTC 31 | S 252315 TTTCATCTCATCATTTCATCTTTTAATCTCATTTCATTTAATCATTT 32 | S 252311 ATTCATTATTTCATC 33 | S 252319 ATTCATCATTTCATCTTTTCATTTC 34 | S 252303 TATTTCATCATTCCATTTCATTACATTTCATCATTTCA 35 | S 252306 CTTCATCTCATCATTTCATCTTTTAATCTCATTTCATTTAATCA 36 | S 252305 CTTCATCTCATCATTTCATCTCATGATTTCATTTCATCATTTCATCTTTTAATCTCATTTCATTTAATCA 37 | S 252313 CATTTCATCATTCCATTTCATCATTTCATTACATTTCATCATTTCACTTCATCTCATCATTTCATCTCATGATTTCA 38 | S 252308 CCTTTTCATTTCATTTCATCATTTCAT 39 | S 252321 CTTCATTATTTCATTTCATTTCA 40 | S 252312 TTTTCATTTCATTTCAC 41 | S 252320 ATCATTTCATCATTTCATTTCATCATTTCA 42 | S 252301 ACCATTTCATCATTT 43 | S 252314 CATTTCATCATTCCATTTCATCATTTCATTACATTTCATCATTTCACTTCATCTCATCATTTCATCTCATGA 44 | L 252281 + 252282 + 0M 45 | L 252282 + 252283 + 0M 46 | L 252283 + 252284 + 0M 47 
| L 252284 + 252285 + 0M 48 | L 252285 + 252286 + 0M 49 | L 252285 + 252287 + 0M 50 | L 252285 + 252288 + 0M 51 | L 252285 + 252289 + 0M 52 | L 252285 + 252290 + 0M 53 | L 252285 + 252291 + 0M 54 | L 252286 + 252292 + 0M 55 | L 252287 + 252292 + 0M 56 | L 252288 + 252292 + 0M 57 | L 252289 + 252292 + 0M 58 | L 252290 + 252292 + 0M 59 | L 252291 + 252292 + 0M 60 | L 252292 + 252293 + 0M 61 | L 252293 + 252294 + 0M 62 | L 252294 + 252295 + 0M 63 | L 252295 + 252296 + 0M 64 | L 252296 + 252297 + 0M 65 | L 252297 + 252298 + 0M 66 | L 252298 + 252299 + 0M 67 | L 252299 + 252300 + 0M 68 | L 252300 + 252301 + 0M 69 | L 252301 + 252302 + 0M 70 | L 252301 + 252303 + 0M 71 | L 252301 + 252304 + 0M 72 | L 252302 + 252305 + 0M 73 | L 252302 + 252306 + 0M 74 | L 252303 + 252305 + 0M 75 | L 252304 + 252305 + 0M 76 | L 252305 + 252307 + 0M 77 | L 252306 + 252307 + 0M 78 | L 252307 + 252308 + 0M 79 | L 252307 + 252309 + 0M 80 | L 252308 + 252310 + 0M 81 | L 252309 + 252310 + 0M 82 | L 252310 + 252311 + 0M 83 | L 252311 + 252312 + 0M 84 | L 252312 + 252313 + 0M 85 | L 252312 + 252314 + 0M 86 | L 252313 + 252315 + 0M 87 | L 252314 + 252315 + 0M 88 | L 252315 + 252316 + 0M 89 | L 252316 + 252317 + 0M 90 | L 252317 + 252318 + 0M 91 | L 252318 + 252319 + 0M 92 | L 252319 + 252320 + 0M 93 | L 252320 + 252321 + 0M 94 | -------------------------------------------------------------------------------- /test/graph3.gfa: -------------------------------------------------------------------------------- 1 | X 0 1 2 3 4 5 6 7 8 9 10 2 | B 1 1 1 1 1 1 1 1 1 1 1 3 | S 319767 ATGCTTCTGTCTAGATTTGATATGAAGATATTCCCGTTTCCAACGAAATCTTCAAATCTATCCAAATGTCCACTTGCAGATTCAACAAAAAGTGTTTTTCAGAACTGCTCTATCAAAAGAAAGATCCACCTCTGTTAGCTGAGTTCACTCATCACAAACAAGTTTATGAGAATGCTTCTGTCTAGTTTTTATTTGAAGATATTTCCTTTCTCACCATAGACCTGAAAGCTGTCCTAATGTTCACTTCCAGATACTACAGAAAGAGTGTTTCAAAACTGCTGTACGAAAGGGAATGTTCAACTCTGTGACTTGAATGCACACATCACAAAGAAGTTTCTGAGGATGCTGCTGTCTACTTTTTATACGTAATCCCGTTTCCAACGAAATCCTCCAA 4 | S 319769 
ACAAGGAAGATTCTGAGATTGCTTCTGTCTAGTTTTTATGGGAAGATATTT 5 | S 319773 AAGGCCTCAGAGCGCTCCAAATATCCACTTGCACATACTACAAAAAGAGTGCCTCAAAGCTGCTCTCTGAAACGGAATGTTCAACTCTATGAGTTGAATGCAAACATCGCAAAGACGTTTCTGAGAATGCTTCTGTCTAGATTTGATATGAAGATATTCCCGTTTCCAACGAAATCTTCAAATCTATCCAAATGTCCACTTGCAGATTCAACAAAAAGTGTTTTTCAGAACTGCTCTATCAAAAGAAAGATCCACCTCTGTTAGCTGAGTTCACACATCACAAACAAGTTTATGAGAATGCTTCTGTCTAGTTTTTATTTGAAGATATTTCCTTTCTCACCATAGACCTGAAAGCTGTCCTAATGTTCACTTCCAGATACTACAGAAAGAGTGTTTCAAAACTGCTGTACGAAAGGGAATGTTCAACTCTGTGACTTGAATGCACACATCACAAAGAAGTTTCTGAGGATGCTGCTGTCT 6 | S 319766 TTTCAACTCTGTGACTTGAATGCAGACATCACAGAGCAGTTTCTGAGAATGCTTCTGTCTAGATTTTATAGGAAGATATTCCCGTTTCCAACGAAATCTTCACAGCTATCCAAATATCCACTTGCAGATTCTACAAAAAGAGTGTATCAAAACTGCTCTGTCAAAAGGAAGGTTCTTCTCTGTTAGGTGAGTGCATACGTCATAAAGGAGTTTCTGAGAATGTTTCTGTCTAGTGGTTATGGGAAGATATTTGCTTTTTCACCGTAGGCCTCAGAGCGCTCCAAATATCCACTTGCACATACTACAAAAAGAGTGCTTCAAAGCTGCTCTCTGAAACGGAATGTTCAACTCTATGAGTTGAATGCAAACATCACAAAGACGTTTCTGAGA 7 | S 319771 CGCAGATTCTACAAAAAGAGTGTATCAAAACTGCTCTGTCAAAAGGAAGGTTCTTCTCTGTTAGGTGAGTGCATACGTCATAAAGGAGTTTCTGAGAATGTTTCTGTCTAGTGGTTATGGGAAGATATTTGCTTTTTCACCGTAGGCCTCAGAGCGCTCCAAATATCCACTTGCACATACTACAAAAAGAGTGCCTCAAAGCTGCTCTCTGAAACGGAATGTTCAACTCTATGAGTTGAATGCAAACATCACAAAGACGTTTCTGAGAATGCTTCTGTCTAGATTTGATATGAAGATATTCCCGTTTCCAACGAAATCTTCCAATCTATCCAAATGTCCACTTGCAGATTCAACAAAAAGTGTTTTTCAGAACTGCTCTATCAAAAGAAAGATCCACCTCTGTTAGCTGAGTTCACACATCACAAACAAGTTTATGAGAATGCTTCTGTCTAGTTTTTATTTGAAGATATTTCCTTTCTCACCATAGACCTGAAAGCTGTCCTAATGTTCACTTCCAGATACTACAGAAAGAGTGTTTTAAAACTGCTGTACGAAAGGGAATGTTCAACTCTGTGACTTGAATGCACACATCACAAAGAAGTTTCTGAGGATGCTGCTGTCTACTTTTTATACGTAATCCCGTTTCCAACGAAATCCTCCAAGCTATCCAAATATCCACTTGCAGATTCCACAGAAAGAGTGTTTCAAAACAGCTCTGTCAATAGAAAGGTTCAACTCTGTTAGCTGCGTGCATATATCCCAAAGAAGATTCTGAGATTGCTTCTGTCTAGTTTTTATGGAAGATATTTCCCTTTTCACCGTAGGCGTCAAGGCGCTCCAAATGTCCACTTCCAGATACTACAAAAAGAGTGTTTCAAACCTACTCTGTGAAAGGGAATATTCAACTCTGTGACTTGAATGCAGATATCACAAAGAAGTTTCTGAGAATGCTTCTGTCGAGATTTTATATGAAGATATTCCCGTTTCCAACGAAATCCTG 8 | S 319776 
ACTACAAAAAGAGTGTTTCAAACCTACTCTGTGAAAGGGAATATTCAACTCTGTGACTTGAATGCACATATCACAAAGAAGTTTCTGAGAATGCTTCTGTCGAGATTTTATATGAAGATATTCCCGTTTCCAACGAAATCCTGAAATCTATCCAAATATCCCCTCGCAGATTCTACAAAAAGAGTGTTTCAAAACTGCTCTGTAAAAAGAAAGGTTCAACTCTGTTACTTCAGTACACACATCACAAACAAGTTTCACAGAATGCTTCTTTCTAGCTTGTAGGGGAAGATATTCCCTTTATCACCATGGGCCTCAAACCGTCCGAA 9 | S 319770 GCCTTTTCACCGTAGGCGTCAAGTCGCTCCAAATGTCCACTTCCAGATACTACAAAAAGAGTGTTTCAAACCTACTCTGTGAAAGGGAATATTCAACTCTGTGACTTGAATGCAGATATCACAAAGAAGTTTCTGAGAATGCTTCTGTCGAGATTTTATATGAAGATATTCCCGTTTCCAACGAAATCCTGAAATCTCTCCAAATATCCCCT 10 | S 319774 TCTTTTTATACGTAATCCCGTTTCCAATGAAATCCTCCAAGCTATCCAAATATCCACTTGCAGATTCCACAGAAAGACTGTTTCAAAA 11 | S 319768 TCTATCCAAATATCCACTTGCAGATTCCACAGAAAGACTGTTTCAAAACTGCTCTGTCAATAGAAACGTTCAACTCTGTTAGCTGCGTGCATATATC 12 | S 319775 CTGCTCTGTCAATAAAAAGGTTCAACTCTGTTAGCTGCGTGCATATATCCCAAAGAAGATTCTGAGATTGCTTCTGTCTAGTTTTTATGGGAAGATATTTCCCTTTTCACCGTAGGCGTCAAGGCGCTCCAAATGTCCACTTCCAGAT 13 | S 319772 AAATCTATCCAAATATCCCCTCGCAGATTCTACAAAAAGAGTGTTTCAAAACTGCTCTGTAAAAAGAAAGGTTCAACTCTGTTAGTTGAGTACACACATCACAAACAAGTTTCACAGAATGCTTCTTTCTAGCTTGTAGGGGAAGATATTCCCTTTATCACCATGGGCCTCCAACCGTCCGAAACTTCCACTTCCATATACTACAAAAAGAGCGTTTCAAACCTGCTCTATGAAAGGCAATGTTCAACTCTGTGACTTGAATGCAGACATCACAGAGCAGTTTCTGAGAATGCTTCTGTCTAGATTTTATAGGAAGATATTCCCGTTTCCAACGAAATCTTCACAGCTATCCAAATATCCACTTGCAGATTCTACAAAAAGAGTGTATCAAAACTGCTCTGTCAAAAGGAAGGTTCTTCTCTGTTAGGTGAGTGCATACGTCATAAAGGAGTTTCTGAGAATGTTTCCGTCTAGTGGTTATGGGAAGATATTTGCTTTTTCACCG 14 | L 319766 + 319767 + 0M 15 | L 319767 + 319768 + 0M 16 | L 319768 + 319769 + 0M 17 | L 319769 + 319770 + 0M 18 | L 319770 + 319771 + 0M 19 | L 319771 + 319772 + 0M 20 | L 319772 + 319773 + 0M 21 | L 319773 + 319774 + 0M 22 | L 319774 + 319775 + 0M 23 | L 319775 + 319776 + 0M 24 | -------------------------------------------------------------------------------- /test/read1.fastq: -------------------------------------------------------------------------------- 1 | @reverse_strand_read 2 | 
AGTAGATGGGTCGCACCATCTTGGCAGGCTGGTCTTGAACTCCTGACCTCGTGATCCACCCGCCTCGGCCTCCCAGAGTGCTGGGATTACAGGCGTGAGCCACTGCAGCTGGCGTAGGACAGGCAGGTTTTGATGACCTGAATCTTCTCACCGGGACTTCCAAACTGGTTCCCCATATCCAATACAGACTAAACCTCTGCCCAATCGAATTTGCTTGAATTTCAATTCTGACAATATTGCTGGTGTAGAAGCCTCTAAAGACTCCTGGCAATTCATCAGATTAAGTCTGTATCCTGAACTTAACATTCAAAGCCTCCCCAAATGACCTCAACCTACACCTTCTCAGTATTATTTTAAGATCTTTATCTTTTTTTATTATTATTATTTCAGATGGAGTCGCTCATTGCCCAGCCTGGAGTGCAGTGGCACGATCTTTGCTCACTGCAAGCTCTGCCTGGGACTATCGGCGCCCGCCACCATGCCCAGCTAATTTTTCTGTATTTTTAGTAGAGACGGGTTTCACCGTATTAGCCAGGATGGTTACCGATCTCCTGACCTCGT 3 | + 4 | FUDLLF@ADCJBIRBD9F4NE&)',:,2'DJ;AB=<:+*{JNC'DW2GJ/<@19FLDFF:DHAG:/CI?EBDCF%*%('&PBHH7C4@OJ?9F>E?{2==E@DJDIC@A7590,1+)37FF4@LE@*I>C(K@KE{HH/,-*CDJ6@@L@G3?>@EJ1C;IGHEHCGJEK{90/5@EALC>9EBF=>3GLN?>?)8,)%')&*J{-JD]P78,C=E>IE-C;N7XJAJ?8E<=%56A>?a?BN=4MZ7;%`8A3EK?BALKE+B-3AH1ME0GIGF*@=1)&(,)(0'()'CKD47DNC(:JEIGLG:GDL,SGEO>HD:P(A5BLGF/C&EMM0S{2X>?D,I9@9;9%,%3{C6ELDJ9QJ:.E{HE-{D=B0:U,5*.)7.HQ'D;=GJ6G(3<>4*-*VO7;M@K-HI@6NBG?0;@/'1*$.&%&((*F;SK:IBD@=={ 5 | -------------------------------------------------------------------------------- /test/read2.fastq: -------------------------------------------------------------------------------- 1 | @forward_read 2 | CTTTTCATCTCGTCATTTCATTTGATCATTCAGTTTCATCAATTCGTCATTTCATCATTTAATTTCATTATTTCATCAATTAATCATCTAACTTCATTTCATCATTTCATCATTTTCATATCATTTCTTCATACCACGTTTGATCTATCTCTAGTTCATTTCACTTCGTCATTATTTCATCATTTCATTTACTCATTTCATTTGACCATTTAATCATTTTTATTTCAACACCCCATTTCATTACATTTCATCATTTCACTTTCATCTCATCATTTCATGTCATGATTTCATTTCATCAGTTCATCTTTTAATCTCATTTATTTTAATCATTTTGTTCATTATACCTTTCATTTCATTTCATGATTTCATTTGGTTTGATATCATTTCCTCAATACATTATTTCATCTTTTCATTTCATTTCACCATTTCATAATTCCATTTCATCATT 3 | + 4 | 
&'&%/C8-D?EEK;50J9MCA1CJKI*,B<&))2)*JXAGTFHLM)HA-6@>LLD{-0@52.+2)/L=AADJDN3?B1%()0/5==;,,<((.?B@LB3:A:NE(L3GCD4:13?0+%&DJ6A@L,).=%)'*(%(('//DB(-%+(&2&4''24FFKB966=31&0.,9).1LH9:JNC(/7-/-4G=+2-C-*+*27(D&(1(/):'.::'+'6;'&7+)%('.R/JIJ+3HG=UCEE>A9DFHL=4.++'*,+=E?CY>4:<4:7+2('3B,J@BH{>+B)2/%&C(3*:@=PK4795'0+&HB{-@)=;0)6)0,M,H1.7;'&-5,3+D,@,.*'1%)*)/O)UN?L<)11<.&&4**E7*GD@D;:M>1KAIG:L4:L?PN:FA8FNC?BDHBN<:3KU;7?BAM:83KDE=GLAEC(F?H.5:2-0,&)**){K)G?>H@668IC)CGDJ)DH:2?3E8?A9A>H=BDJL5D>F:A7@J49E;'7+3(ENBA@{IICI=J7P{FHAE:PIBD3@NBDE3E4EHB8FEKD8&S>F@*4:?2:@,,.1-(EbLKQM7HA:ECCJ:89{TEF{F@XI9>@P@@E=*P@J/?F(H@?2D7DEMEE5{JG?@GBETA/L{{H@LGE:4+0646(3AA;M9HGAEIISRCB,L7BCHKQHIAI+A(.&.')K79KIKIJ{FC=AEBF;LCU6MIGHA 5 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/Makefile: -------------------------------------------------------------------------------- 1 | CPPFLAGS=-Ofast -march=native --std=c++20 -pthread 2 | #CPPFLAGS=-g -O0 --std=c++20 -pthread 3 | HEADERS=-I ../sdsl-lite-v3/include -I ../concurrentqueue 4 | 5 | all : chainx-block-graph 6 | 7 | chainx-block-graph : chainx-block-graph.cpp chainx-block-graph.hpp chaining.hpp efg.hpp command-line-parsing/cmdline.h command-line-parsing/cmdline.c 8 | g++ $(CPPFLAGS) $(HEADERS) \ 9 | chainx-block-graph.cpp command-line-parsing/cmdline.c \ 10 | -o chainx-block-graph 11 | 12 | # uncomment for development 13 | #command-line-parsing/cmdline%c command-line-parsing/cmdline%h : command-line-parsing/config.ggo 14 | # gengetopt \ 15 | # --input=./command-line-parsing/config.ggo \ 16 | # --output-dir=./command-line-parsing/ \ 17 | # --unnamed-opts 18 | 19 | test : chainx-block-graph test/test.sh 20 | test/test.sh 21 | 22 | .PHONY : clean all test cleanall 23 | 24 | #clean : 25 | # rm -Rf command-line-parsing/cmdline.c command-line-parsing/cmdline.h test/output* 26 | #cleanall : 27 | # rm -Rf chainx-block-graph command-line-parsing/cmdline.c command-line-parsing/cmdline.h test/output* 28 | 
clean : 29 | rm -Rf test/output* 30 | cleanall : 31 | rm -Rf chainx-block-graph test/output* 32 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/README.md: -------------------------------------------------------------------------------- 1 | # chainx-block-graph 2 | Program to perform co-linear chaining on the Elastic Degenerate String relaxation of Elastic Founder Graphs. 3 | 4 | ## todo 5 | - always collect statistics 6 | - docs 7 | - more tests trying complex chains 8 | - get secondary chains with fancy backtracking 9 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/chaining.hpp: -------------------------------------------------------------------------------- 1 | //code adapted from https://github.com/at-cg/ChainX 2 | //Chirag Jain, Daniel Gibney and Sharma Thankachan. Algorithms for Colinear Chaining with Overlaps and Gap Costs. Journal of Computational Biology, 2022 3 | //license: https://www.apache.org/licenses/LICENSE-2.0.txt 4 | 5 | #ifndef CHAIN_HPP 6 | #define CHAIN_HPP 7 | 8 | #include "efg.hpp" 9 | 10 | //#define CHAIN_HPP_DEBUG 11 | 12 | using std::vector, std::swap; 13 | 14 | namespace chainx_block_graph { 15 | /** 16 | * Compute optimal chain based on anchor-restricted edit distance using 17 | * strong precedence criteria optimized to run faster using engineering 18 | * trick(s), comparison mode: global. 19 | * We assume the anchors are sorted by the starting positions in the 20 | * linear text and that there are two dummy anchors marking the 21 | * beginning and end of the references. 
22 | * graph.init_eds_support() must have been called before this function 23 | **/ 24 | vector chain_global_eds(vector &anchors, const Elasticfoundergraph &graph, const int initial_guess, const double ramp_up_factor, Stats &stats, const bool removesol = false) 25 | { 26 | //graph.init_eds_support(); 27 | 28 | int n = anchors.size(); 29 | vector costs(n, 0); 30 | vector backtrack(n, 0); 31 | 32 | int bound_redit = initial_guess; //distance assumed to be <= initial_guess 33 | int revisions = 0; 34 | //with this assumption on upper bound of distance, a gap of >bound_redit will not be allowed between adjacent anchors 35 | 36 | while (true) { 37 | int inner_loop_start = 0; 38 | 39 | for(int j=1; j::max(); 42 | int backtrack_min_cost = std::numeric_limits::max(); 43 | 44 | // anchor i < anchor j 45 | 46 | while (anchors[inner_loop_start].gap_query(anchors[j]) > bound_redit) 47 | inner_loop_start++; 48 | 49 | for(int i=j-1; i>=inner_loop_start; i--) { 50 | if (costs[i] < std::numeric_limits::max() and are_colinear_eds(anchors[i], anchors[j], graph)) { 51 | int g = max_gap_eds(anchors[i], anchors[j], graph); 52 | int o = overlap_eds(anchors[i], anchors[j], graph); 53 | #ifdef CHAIN_HPP_DEBUG 54 | std::cerr << "anchors[" << i << "] -> anchors[" << j << "]: g = " << g << ", o = " << o << std::endl; 55 | #endif 56 | if (costs[i] + g + o < find_min_cost) { 57 | find_min_cost = costs[i] + g + o; 58 | backtrack_min_cost = i; 59 | } 60 | } 61 | } 62 | //save optimal cost at offset j 63 | costs[j] = find_min_cost; 64 | backtrack[j] = backtrack_min_cost; 65 | } 66 | 67 | if (costs[n-1] > bound_redit) { 68 | bound_redit = bound_redit * ramp_up_factor; 69 | revisions++; 70 | } else { 71 | break; 72 | } 73 | } 74 | 75 | #ifdef CHAIN_HPP_DEBUG 76 | std::cerr << "Cost DP array = "; 77 | for (auto c : costs) 78 | std::cerr << c << " "; 79 | std::cerr << std::endl; 80 | std::cerr << "Backtrack array = "; 81 | for (auto b : backtrack) 82 | std::cerr << b << " "; 83 | std::cerr << 
std::endl; 84 | std::cerr << "Chaining cost computed " << revisions + 1 << " times" << "\n"; 85 | #endif 86 | 87 | //TODO: consider freeing here the space of costs array 88 | // backtrack optimal solution 89 | vector solution; 90 | for (int j = backtrack[n - 1]; j > 0; j = backtrack[j]) 91 | { 92 | solution.push_back(anchors[j]); 93 | } 94 | std::reverse(solution.begin(), solution.end()); 95 | if (removesol) 96 | { 97 | vector newanchors; 98 | newanchors.reserve(anchors.size() - solution.size()); 99 | for (int i = 0, j = 0; i < anchors.size(); i += 1) 100 | { 101 | if (j < solution.size() and anchors[i] == solution[j]) 102 | { 103 | j += 1; 104 | } else { 105 | newanchors.push_back(anchors[i]); 106 | } 107 | } 108 | swap(anchors, newanchors); 109 | } 110 | 111 | //std::cout << "distance = " << costs[n-1] << std::endl; 112 | if (solution.size() > 0) { 113 | stats.maxiterations = std::max(stats.maxiterations, revisions); 114 | stats.miniterations = std::min(stats.miniterations, revisions); 115 | stats.totaliterations += revisions; 116 | stats.maxcost = std::max(stats.maxcost, costs[n-1]); 117 | stats.mincost = std::min(stats.mincost, costs[n-1]); 118 | stats.totalcost += costs[n-1]; 119 | const double relativecost = (double)costs[n-1] / solution.at(0).get_query_length(); 120 | stats.maxrelativecost = std::max(stats.maxrelativecost, relativecost); 121 | stats.minrelativecost = std::min(stats.minrelativecost, relativecost); 122 | stats.totalrelativecost += relativecost; 123 | } 124 | 125 | return solution; 126 | } 127 | 128 | /** 129 | * See chain_global_eds, comparison mode: semiglobal. 
130 | **/ 131 | vector chain_semiglobal_eds(vector &anchors, const Elasticfoundergraph &graph, const int initial_guess, const double ramp_up_factor, Stats &stats, const bool removesol = false) 132 | { 133 | //graph.init_eds_support(); 134 | 135 | int n = anchors.size(); 136 | vector costs(n, 0); 137 | vector backtrack(n, 0); 138 | 139 | int bound_redit = initial_guess; //distance assumed to be <= initial_guess 140 | int revisions = 0; 141 | //with this assumption on upper bound of distance, a gap of >bound_redit will not be allowed between adjacent anchors 142 | 143 | while (true) { 144 | int inner_loop_start = 0; 145 | 146 | for(int j=1; j::max(); 149 | int backtrack_min_cost = std::numeric_limits::max(); 150 | 151 | // anchor i < anchor j 152 | 153 | while (anchors[inner_loop_start].gap_query(anchors[j]) > bound_redit) 154 | inner_loop_start++; 155 | 156 | { 157 | //always consider the first dummy anchor 158 | //connection to first dummy anchor is done with modified cost to allow free gaps 159 | //int i_d = std::get<1>(anchors[0]) + std::get<2>(anchors[0]) - 1; 160 | //int qry_gap = j_c - i_d - 1; 161 | int queryg = GAFHit::gap_query(anchors[0], anchors[j]); 162 | find_min_cost = std::min(find_min_cost, costs[0] + queryg); 163 | backtrack_min_cost = 0; 164 | #ifdef CHAIN_HPP_DEBUG 165 | std::cerr << "anchors[" << "0" << "] -> anchors[" << j << "]: queryg = " << queryg << std::endl; 166 | #endif 167 | } 168 | 169 | //process all anchors in array for the final last dummy anchor 170 | if (j == n-1) 171 | inner_loop_start=0; 172 | 173 | for(int i=j-1; i>=inner_loop_start; i--) { 174 | if (costs[i] < std::numeric_limits::max() and are_colinear_eds(anchors[i], anchors[j], graph)) { 175 | int g; 176 | if (j == n-1) //modified cost for the last dummy anchor to allow free gaps 177 | g = GAFHit::gap_query(anchors[i], anchors[j]); 178 | else 179 | g = max_gap_eds(anchors[i], anchors[j], graph); 180 | 181 | int o = overlap_eds(anchors[i], anchors[j], graph); 182 | #ifdef 
CHAIN_HPP_DEBUG 183 | std::cerr << "anchors[" << i << "] -> anchors[" << j << "]: g = " << g << ", o = " << o << std::endl; 184 | #endif 185 | if (costs[i] + g + o < find_min_cost) { 186 | find_min_cost = costs[i] + g + o; 187 | backtrack_min_cost = i; 188 | } 189 | } 190 | } 191 | //save optimal cost at offset j 192 | costs[j] = find_min_cost; 193 | backtrack[j] = backtrack_min_cost; 194 | } 195 | 196 | if (costs[n-1] > bound_redit) { 197 | bound_redit = bound_redit * ramp_up_factor; 198 | revisions++; 199 | } else { 200 | break; 201 | } 202 | } 203 | 204 | #ifdef CHAIN_HPP_DEBUG 205 | std::cerr << "Cost DP array = "; 206 | for (auto c : costs) 207 | std::cerr << c << " "; 208 | std::cerr << std::endl; 209 | std::cerr << "Backtrack array = "; 210 | for (auto b : backtrack) 211 | std::cerr << b << " "; 212 | std::cerr << std::endl; 213 | std::cerr << "Chaining cost computed " << revisions + 1 << " times" << "\n"; 214 | #endif 215 | 216 | //TODO: consider freeing here the space of costs array 217 | // backtrack optimal solution 218 | vector solution; 219 | for (int j = backtrack[n - 1]; j > 0; j = backtrack[j]) 220 | { 221 | solution.push_back(anchors[j]); 222 | } 223 | std::reverse(solution.begin(), solution.end()); 224 | if (removesol) 225 | { 226 | vector newanchors; 227 | newanchors.reserve(anchors.size() - solution.size()); 228 | for (int i = 0, j = 0; i < anchors.size(); i += 1) 229 | { 230 | if (j < solution.size() and anchors[i] == solution[j]) 231 | { 232 | j += 1; 233 | } else { 234 | newanchors.push_back(anchors[i]); 235 | } 236 | } 237 | swap(anchors, newanchors); 238 | } 239 | 240 | //std::cerr << "distance = " << costs[n-1] << std::endl; 241 | //std::cerr << "length - distance = " << anchors[0].get_query_length() - costs[n-1] << std::endl; 242 | if (solution.size() > 0) { 243 | stats.maxiterations = std::max(stats.maxiterations, revisions); 244 | stats.miniterations = std::min(stats.miniterations, revisions); 245 | stats.totaliterations += revisions; 
246 | stats.maxcost = std::max(stats.maxcost, costs[n-1]); 247 | stats.mincost = std::min(stats.mincost, costs[n-1]); 248 | stats.totalcost += costs[n-1]; 249 | const double relativecost = (double)costs[n-1] / solution.at(0).get_query_length(); 250 | stats.maxrelativecost = std::max(stats.maxrelativecost, relativecost); 251 | stats.minrelativecost = std::min(stats.minrelativecost, relativecost); 252 | stats.totalrelativecost += relativecost; 253 | } 254 | 255 | return solution; 256 | } 257 | 258 | } // Namespace chainx_block_graph 259 | 260 | #endif //CHAIN_HPP 261 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/chaining_hpp_license.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2015 Georgia Institute of Technology 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/chainx-block-graph.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CHAINX_BLOCK_GRAPH_HPP 2 | #define CHAINX_BLOCK_GRAPH_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | using std::string, std::ifstream, std::ofstream; 9 | 10 | namespace chainx_block_graph { 11 | struct Params { 12 | ifstream graphfs; 13 | ifstream anchorsfs; 14 | ofstream outputfs; 15 | string ignorechars; 16 | bool unsorted_anchors; 17 | bool global; 18 | bool semiglobal; 19 | bool nosplit; 20 | bool splitgraphaligner; 21 | int threads; 22 | int alternativealignments; 23 | int initialguess; 24 | double initialguesscov; 25 | double rampupfactor; 26 | }; 27 | 28 | struct Stats { 29 | unsigned long long seeds = 0; 30 | unsigned long long reads = 0; 31 | int maxiterations = 0; 32 | int miniterations = std::numeric_limits::max(); 33 | unsigned long long totaliterations = 0; 
34 | 35 | int maxcost = 0; 36 | int mincost = std::numeric_limits::max(); 37 | unsigned long long totalcost = 0; 38 | 39 | double maxrelativecost = 0; 40 | double minrelativecost = std::numeric_limits::max(); 41 | double totalrelativecost = 0; 42 | }; 43 | 44 | struct Stats mergestats(const struct Stats &s1, const struct Stats &s2) 45 | { 46 | struct Stats s; 47 | s.seeds = s1.seeds + s2.seeds; 48 | s.reads = s1.reads + s2.reads; 49 | s.maxiterations = std::max(s1.maxiterations, s2.maxiterations); 50 | s.miniterations = std::min(s1.miniterations, s2.miniterations); 51 | s.totaliterations = s1.totaliterations + s2.totaliterations; 52 | s.maxcost = std::max(s1.maxcost, s2.maxcost); 53 | s.mincost = std::min(s1.mincost, s2.mincost); 54 | s.totalcost = s1.totalcost + s2.totalcost; 55 | s.maxrelativecost = std::max(s1.maxrelativecost, s2.maxrelativecost); 56 | s.minrelativecost = std::min(s1.minrelativecost, s2.minrelativecost); 57 | s.totalrelativecost = s1.totalrelativecost + s2.totalrelativecost; 58 | return s; 59 | } 60 | 61 | } 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/command-line-parsing/cmdline.h: -------------------------------------------------------------------------------- 1 | /** @file cmdline.h 2 | * @brief The header file for the command line option parser 3 | * generated by GNU Gengetopt version 2.23 4 | * http://www.gnu.org/software/gengetopt. 5 | * DO NOT modify this file, since it can be overwritten 6 | * @author GNU Gengetopt */ 7 | 8 | #ifndef CMDLINE_H 9 | #define CMDLINE_H 10 | 11 | /* If we use autoconf. 
*/ 12 | #ifdef HAVE_CONFIG_H 13 | #include "config.h" 14 | #endif 15 | 16 | #include /* for FILE */ 17 | 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif /* __cplusplus */ 21 | 22 | #ifndef CMDLINE_PARSER_PACKAGE 23 | /** @brief the program name (used for printing errors) */ 24 | #define CMDLINE_PARSER_PACKAGE "chainx-block-graph" 25 | #endif 26 | 27 | #ifndef CMDLINE_PARSER_PACKAGE_NAME 28 | /** @brief the complete program name (used for help and version) */ 29 | #define CMDLINE_PARSER_PACKAGE_NAME "chainx-block-graph" 30 | #endif 31 | 32 | #ifndef CMDLINE_PARSER_VERSION 33 | /** @brief the program version */ 34 | #define CMDLINE_PARSER_VERSION "0.0" 35 | #endif 36 | 37 | /** @brief Where the command line options are stored */ 38 | struct gengetopt_args_info 39 | { 40 | const char *help_help; /**< @brief Print help and exit help description. */ 41 | const char *full_help_help; /**< @brief Print help, including hidden options, and exit help description. */ 42 | const char *version_help; /**< @brief Print version and exit help description. */ 43 | int chain_to_eds_flag; /**< @brief Perform chaining on the Elastic Degenerate String relaxation of the graph (currently the only implemented chaining solution) (default=on). */ 44 | const char *chain_to_eds_help; /**< @brief Perform chaining on the Elastic Degenerate String relaxation of the graph (currently the only implemented chaining solution) help description. */ 45 | int global_flag; /**< @brief Chain between the whole query and any maximal graph path (default=off). */ 46 | const char *global_help; /**< @brief Chain between the whole query and any maximal graph path help description. */ 47 | int semi_global_flag; /**< @brief Chain between the whole query and any graph subpath (default=off). */ 48 | const char *semi_global_help; /**< @brief Chain between the whole query and any graph subpath help description. 
*/ 49 | int unsorted_input_flag; /**< @brief Do not assume the input GAF anchors to be sorted (at the cost of loading all anchors) (default=off). */ 50 | const char *unsorted_input_help; /**< @brief Do not assume the input GAF anchors to be sorted (at the cost of loading all anchors) help description. */ 51 | int no_split_output_matches_flag; /**< @brief Do not split edge matches into node matches in the output chains (default=off). */ 52 | const char *no_split_output_matches_help; /**< @brief Do not split edge matches into node matches in the output chains help description. */ 53 | long initial_guess_arg; /**< @brief Fix a constant starting guess for the cost of the optimal chain (default='100'). */ 54 | char * initial_guess_orig; /**< @brief Fix a constant starting guess for the cost of the optimal chain original value given at command line. */ 55 | const char *initial_guess_help; /**< @brief Fix a constant starting guess for the cost of the optimal chain help description. */ 56 | double initial_guess_coverage_arg; /**< @brief Have the starting guess for the optimal cost be a fraction of the inverse coverage of the read (GUESS * (read length - read coverage)) instead of a constant (by default this is disabled) (default='0'). */ 57 | char * initial_guess_coverage_orig; /**< @brief Have the starting guess for the optimal cost be a fraction of the inverse coverage of the read (GUESS * (read length - read coverage)) instead of a constant (by default this is disabled) original value given at command line. */ 58 | const char *initial_guess_coverage_help; /**< @brief Have the starting guess for the optimal cost be a fraction of the inverse coverage of the read (GUESS * (read length - read coverage)) instead of a constant (by default this is disabled) help description. */ 59 | double ramp_up_factor_arg; /**< @brief At each chaining iteration, multiply by RAMPUP the guess for the cost of the optimal chain (default='4.0'). 
*/ 60 | char * ramp_up_factor_orig; /**< @brief At each chaining iteration, multiply by RAMPUP the guess for the cost of the optimal chain original value given at command line. */ 61 | const char *ramp_up_factor_help; /**< @brief At each chaining iteration, multiply by RAMPUP the guess for the cost of the optimal chain help description. */ 62 | long alternative_chains_arg; /**< @brief Chain N+1 times, removing the used anchors after each execution, and output all chains (default='0'). */ 63 | char * alternative_chains_orig; /**< @brief Chain N+1 times, removing the used anchors after each execution, and output all chains original value given at command line. */ 64 | const char *alternative_chains_help; /**< @brief Chain N+1 times, removing the used anchors after each execution, and output all chains help description. */ 65 | long threads_arg; /**< @brief Max # threads (default='-1'). */ 66 | char * threads_orig; /**< @brief Max # threads original value given at command line. */ 67 | const char *threads_help; /**< @brief Max # threads help description. */ 68 | int overwrite_flag; /**< @brief Overwrite the output file, if it exists (default=off). */ 69 | const char *overwrite_help; /**< @brief Overwrite the output file, if it exists help description. */ 70 | int split_output_matches_graphaligner_flag; /**< @brief Filter out node matches of length 1 for use in GraphAligner (default=off). */ 71 | const char *split_output_matches_graphaligner_help; /**< @brief Filter out node matches of length 1 for use in GraphAligner help description. */ 72 | 73 | unsigned int help_given ; /**< @brief Whether help was given. */ 74 | unsigned int full_help_given ; /**< @brief Whether full-help was given. */ 75 | unsigned int version_given ; /**< @brief Whether version was given. */ 76 | unsigned int chain_to_eds_given ; /**< @brief Whether chain-to-eds was given. */ 77 | unsigned int global_given ; /**< @brief Whether global was given. 
*/ 78 | unsigned int semi_global_given ; /**< @brief Whether semi-global was given. */ 79 | unsigned int unsorted_input_given ; /**< @brief Whether unsorted-input was given. */ 80 | unsigned int no_split_output_matches_given ; /**< @brief Whether no-split-output-matches was given. */ 81 | unsigned int initial_guess_given ; /**< @brief Whether initial-guess was given. */ 82 | unsigned int initial_guess_coverage_given ; /**< @brief Whether initial-guess-coverage was given. */ 83 | unsigned int ramp_up_factor_given ; /**< @brief Whether ramp-up-factor was given. */ 84 | unsigned int alternative_chains_given ; /**< @brief Whether alternative-chains was given. */ 85 | unsigned int threads_given ; /**< @brief Whether threads was given. */ 86 | unsigned int overwrite_given ; /**< @brief Whether overwrite was given. */ 87 | unsigned int split_output_matches_graphaligner_given ; /**< @brief Whether split-output-matches-graphaligner was given. */ 88 | 89 | char **inputs ; /**< @brief unnamed options (options without names) */ 90 | unsigned inputs_num ; /**< @brief unnamed options number */ 91 | } ; 92 | 93 | /** @brief The additional parameters to pass to parser functions */ 94 | struct cmdline_parser_params 95 | { 96 | int override; /**< @brief whether to override possibly already present options (default 0) */ 97 | int initialize; /**< @brief whether to initialize the option structure gengetopt_args_info (default 1) */ 98 | int check_required; /**< @brief whether to check that all required options were provided (default 1) */ 99 | int check_ambiguity; /**< @brief whether to check for options already specified in the option structure gengetopt_args_info (default 0) */ 100 | int print_errors; /**< @brief whether getopt_long should print an error message for a bad option (default 1) */ 101 | } ; 102 | 103 | /** @brief the purpose string of the program */ 104 | extern const char *gengetopt_args_info_purpose; 105 | /** @brief the usage string of the program */ 106 | extern 
const char *gengetopt_args_info_usage; 107 | /** @brief the description string of the program */ 108 | extern const char *gengetopt_args_info_description; 109 | /** @brief all the lines making the help output */ 110 | extern const char *gengetopt_args_info_help[]; 111 | /** @brief all the lines making the full help output (including hidden options) */ 112 | extern const char *gengetopt_args_info_full_help[]; 113 | 114 | /** 115 | * The command line parser 116 | * @param argc the number of command line options 117 | * @param argv the command line options 118 | * @param args_info the structure where option information will be stored 119 | * @return 0 if everything went fine, NON 0 if an error took place 120 | */ 121 | int cmdline_parser (int argc, char **argv, 122 | struct gengetopt_args_info *args_info); 123 | 124 | /** 125 | * The command line parser (version with additional parameters - deprecated) 126 | * @param argc the number of command line options 127 | * @param argv the command line options 128 | * @param args_info the structure where option information will be stored 129 | * @param override whether to override possibly already present options 130 | * @param initialize whether to initialize the option structure my_args_info 131 | * @param check_required whether to check that all required options were provided 132 | * @return 0 if everything went fine, NON 0 if an error took place 133 | * @deprecated use cmdline_parser_ext() instead 134 | */ 135 | int cmdline_parser2 (int argc, char **argv, 136 | struct gengetopt_args_info *args_info, 137 | int override, int initialize, int check_required); 138 | 139 | /** 140 | * The command line parser (version with additional parameters) 141 | * @param argc the number of command line options 142 | * @param argv the command line options 143 | * @param args_info the structure where option information will be stored 144 | * @param params additional parameters for the parser 145 | * @return 0 if everything went fine, NON 0 if 
an error took place 146 | */ 147 | int cmdline_parser_ext (int argc, char **argv, 148 | struct gengetopt_args_info *args_info, 149 | struct cmdline_parser_params *params); 150 | 151 | /** 152 | * Save the contents of the option struct into an already open FILE stream. 153 | * @param outfile the stream where to dump options 154 | * @param args_info the option struct to dump 155 | * @return 0 if everything went fine, NON 0 if an error took place 156 | */ 157 | int cmdline_parser_dump(FILE *outfile, 158 | struct gengetopt_args_info *args_info); 159 | 160 | /** 161 | * Save the contents of the option struct into a (text) file. 162 | * This file can be read by the config file parser (if generated by gengetopt) 163 | * @param filename the file where to save 164 | * @param args_info the option struct to save 165 | * @return 0 if everything went fine, NON 0 if an error took place 166 | */ 167 | int cmdline_parser_file_save(const char *filename, 168 | struct gengetopt_args_info *args_info); 169 | 170 | /** 171 | * Print the help 172 | */ 173 | void cmdline_parser_print_help(void); 174 | /** 175 | * Print the full help (including hidden options) 176 | */ 177 | void cmdline_parser_print_full_help(void); 178 | /** 179 | * Print the version 180 | */ 181 | void cmdline_parser_print_version(void); 182 | 183 | /** 184 | * Initializes all the fields a cmdline_parser_params structure 185 | * to their default values 186 | * @param params the structure to initialize 187 | */ 188 | void cmdline_parser_params_init(struct cmdline_parser_params *params); 189 | 190 | /** 191 | * Allocates dynamically a cmdline_parser_params structure and initializes 192 | * all its fields to their default values 193 | * @return the created and initialized cmdline_parser_params structure 194 | */ 195 | struct cmdline_parser_params *cmdline_parser_params_create(void); 196 | 197 | /** 198 | * Initializes the passed gengetopt_args_info structure's fields 199 | * (also set default values for options that have a 
default) 200 | * @param args_info the structure to initialize 201 | */ 202 | void cmdline_parser_init (struct gengetopt_args_info *args_info); 203 | /** 204 | * Deallocates the string fields of the gengetopt_args_info structure 205 | * (but does not deallocate the structure itself) 206 | * @param args_info the structure to deallocate 207 | */ 208 | void cmdline_parser_free (struct gengetopt_args_info *args_info); 209 | 210 | /** 211 | * Checks that all the required options were specified 212 | * @param args_info the structure to check 213 | * @param prog_name the name of the program that will be used to print 214 | * possible errors 215 | * @return 216 | */ 217 | int cmdline_parser_required (struct gengetopt_args_info *args_info, 218 | const char *prog_name); 219 | 220 | 221 | #ifdef __cplusplus 222 | } 223 | #endif /* __cplusplus */ 224 | #endif /* CMDLINE_H */ 225 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/command-line-parsing/config.ggo: -------------------------------------------------------------------------------- 1 | version "0.0" 2 | package "chainx-block-graph" 3 | purpose "Program to perform colinear chaining on Elastic Founder Graphs" 4 | usage "chainx-block-graph (--global|--semi-global) graph.gfa anchors.gaf output.gaf" 5 | 6 | description "The program takes in input an Elastic Founder Graph (xGFA) and exact matches between text queries and the graph (GAF), it computes the anchor-restricted edit distance between the queries and the (relaxation of the) graph, and outputs the corresponding chain in GAF format." 
7 | 8 | option "chain-to-eds" - "Perform chaining on the Elastic Degenerate String relaxation of the graph (currently the only implemented chaining solution)" flag on 9 | option "global" g "Chain between the whole query and any maximal graph path" flag off 10 | option "semi-global" s "Chain between the whole query and any graph subpath" flag off 11 | option "unsorted-input" - "Do not assume the input GAF anchors to be sorted (at the cost of loading all anchors)" flag off 12 | option "no-split-output-matches" - "Do not split edge matches into node matches in the output chains" flag off 13 | option "initial-guess" - "Fix a constant starting guess for the cost of the optimal chain" long typestr = "GUESS" default="100" optional 14 | option "initial-guess-coverage" - "Have the starting guess for the optimal cost be a fraction of the inverse coverage of the read (GUESS * (read length - read coverage)) instead of a constant (by default this is disabled)" double typestr = "GUESS" default="0" optional 15 | option "ramp-up-factor" - "At each chaining iteration, multiply by RAMPUP the guess for the cost of the optimal chain" double typestr = "RAMPUP" default="4.0" optional 16 | option "alternative-chains" a "Chain N+1 times, removing the used anchors after each execution, and output all chains" long typestr = "N" default="0" optional 17 | 18 | option "threads" t "Max # threads" long typestr = "THREADNUM" default = "-1" optional 19 | option "overwrite" - "Overwrite the output file, if it exists" flag off 20 | 21 | option "split-output-matches-graphaligner" - "Filter out node matches of length 1 for use in GraphAligner" flag off hidden 22 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/test/correctoutput/anchors-1-global.gaf: -------------------------------------------------------------------------------- 1 | q1 8 1 3 + >n2 4 1 3 0 0 255 2 | q1 8 3 5 + >n3 3 0 2 0 0 255 3 | q1 8 5 7 + >n5 4 1 3 0 0 255 4 | q2 8 0 1 + 
>n2 4 3 4 0 0 255 5 | q2 8 1 4 + >n4 3 0 3 0 0 255 6 | q2 8 1 4 + >n4 3 0 3 0 0 255 7 | q2 8 4 8 + >n5 4 0 4 0 0 255 8 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/test/correctoutput/anchors-1-semi-global.gaf: -------------------------------------------------------------------------------- 1 | q2 8 0 1 + >n2 4 3 4 0 0 255 2 | q2 8 1 4 + >n4 3 0 3 0 0 255 3 | q2 8 1 4 + >n4 3 0 3 0 0 255 4 | q2 8 4 8 + >n5 4 0 4 0 0 255 5 | q1 8 1 3 + >n3 3 0 2 0 0 255 6 | q1 8 5 7 + >n5 4 1 3 0 0 255 7 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/test/input/anchors-1.gaf: -------------------------------------------------------------------------------- 1 | q1 8 0 1 + >n4 3 0 1 0 0 255 2 | q1 8 7 8 + >n4 3 0 1 0 0 255 3 | q1 8 1 3 + >n2 4 1 3 0 0 255 4 | q1 8 1 3 + >n3 3 0 2 0 0 255 5 | q1 8 1 3 + >n5 4 1 3 0 0 255 6 | q1 8 3 5 + >n2 4 1 3 0 0 255 7 | q1 8 3 5 + >n3 3 0 2 0 0 255 8 | q1 8 3 5 + >n5 4 1 3 0 0 255 9 | q1 8 5 7 + >n2 4 1 3 0 0 255 10 | q1 8 5 7 + >n3 3 0 2 0 0 255 11 | q1 8 5 7 + >n5 4 1 3 0 0 255 12 | q2 8 0 4 + >n2>n4 7 3 7 0 0 255 13 | q2 8 1 8 + >n4>n5 7 0 7 0 0 255 14 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/test/input/graph.gfa: -------------------------------------------------------------------------------- 1 | M 2 12 2 | X 1 2 6 9 3 | B 1 1 2 1 4 | S n1 A 5 | S n2 TACT 6 | S n3 ACT 7 | S n4 TTT 8 | S n5 AACT 9 | L n1 + n2 + 0M 10 | L n2 + n3 + 0M 11 | L n2 + n4 + 0M 12 | L n3 + n5 + 0M 13 | L n4 + n5 + 0M 14 | -------------------------------------------------------------------------------- /tools/ChainX-block-graph/test/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # 
https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 4 | outputfolder=$thisfolder/output-$(date -Iminutes) 5 | logfile=$outputfolder/log 6 | chainxblockgraph=$thisfolder/../chainx-block-graph 7 | 8 | mkdir $outputfolder 9 | echo -n > $logfile 10 | 11 | for testfile in $thisfolder/input/anchors-1.gaf 12 | do 13 | echo "$testfile : " >> $logfile 14 | basename=$(basename $testfile) 15 | outfileg=$outputfolder/${basename%.*}-global.gaf 16 | correctg=$thisfolder/correctoutput/${basename%.*}-global.gaf 17 | $chainxblockgraph --unsorted-input --global $thisfolder/input/graph.gfa $testfile $outfileg \ 18 | >> $logfile 2>> $logfile 19 | diff <(sort $outfileg) <(sort $correctg) > /dev/null 2>/dev/null 20 | exitcode=$? ; if [ $exitcode -ne 0 ] ; then 21 | echo "Test failed on file $testfile!" | tee -a $logfile 22 | exit 1 23 | fi 24 | 25 | outfilesg=$outputfolder/${basename%.*}-semi-global.gaf 26 | correctsg=$thisfolder/correctoutput/${basename%.*}-semi-global.gaf 27 | 28 | $chainxblockgraph --unsorted-input --semi-global $thisfolder/input/graph.gfa $testfile $outfilesg \ 29 | >> $logfile 2>> $logfile 30 | 31 | diff <(sort $outfilesg) <(sort $correctsg) > /dev/null 2>/dev/null 32 | exitcode=$? ; if [ $exitcode -ne 0 ] ; then 33 | echo "Test failed on file $testfile!" 
/// Read the whole file at `filename` and return its lines as owned
/// `String`s (line terminators stripped by `BufRead::lines`).
/// Panics if the file cannot be opened ("no such file") or if a line
/// is not valid UTF-8 ("Could not parse line").
fn lines_from_file(filename: impl AsRef<Path>) -> Vec<String> {
    let reader = BufReader::new(File::open(filename).expect("no such file"));
    reader
        .lines()
        .map(|line| line.expect("Could not parse line"))
        .collect()
}
31 | let mut printed = false; 32 | for (read,readid) in reads.into_iter().zip(rids.into_iter()) { 33 | let readlen = read.len(); 34 | let it = automaton.find_overlapping_iter(read); 35 | 36 | for arg in it { 37 | if !printed { 38 | printed = true; 39 | } else { 40 | print!("\n"); 41 | } 42 | print!("{}\t{}\t{}\t{}\t+\t>{}\t{}\t0\t{}\t0\t0\t255", readid, readlen, arg.start(), arg.end(), ids[arg.value()], nodeslen[arg.value()], nodeslen[arg.value()]); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /tools/efg-ahocorasick/src/efg-locate.hpp: -------------------------------------------------------------------------------- 1 | #ifndef EFG_LOCATE_HPP 2 | #define EFG_LOCATE_HPP 3 | 4 | #include 5 | #include 6 | 7 | using std::string, std::ifstream, std::ofstream; 8 | 9 | struct Params { 10 | ifstream graphfs; 11 | ifstream patternsfs; 12 | ofstream outputfs; 13 | string ignorechars; 14 | bool reversecompl; 15 | int threads; 16 | int mincoverage; 17 | bool reportstats; 18 | bool renamereversecomplement; 19 | bool splitoutputmatches; 20 | bool splitoutputmatchesgraphaligner; 21 | int edgemincount; 22 | bool edgemincountheuristic; 23 | }; 24 | #endif 25 | -------------------------------------------------------------------------------- /tools/efg-ahocorasick/src/extractor.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "efg.hpp" 8 | 9 | using namespace std; 10 | 11 | int main(int argc, char * argv[]) 12 | { 13 | if (argc < 2) 14 | { 15 | cout << "usage: " << argv[0] << " graph.gfa" << std::endl; 16 | return 1; 17 | } 18 | 19 | // open graph file 20 | std::filesystem::path graphpath {argv[1]}; 21 | std::ifstream graphfs = std::ifstream {graphpath}; 22 | if (!graphfs) {std::cerr << "Error opening graph file " << graphpath << "." 
<< std::endl; exit(1);}; 23 | 24 | Elasticfoundergraph graph(graphfs); 25 | vector is_source(graph.ordered_node_ids.size() + 1, true); 26 | vector is_sink(graph.ordered_node_ids.size() + 1, true); 27 | for (int i = 0; i < graph.ordered_node_ids.size(); i++) { 28 | for (int j : graph.edges[i]) { 29 | is_sink[i] = false; 30 | is_source[j] = false; 31 | } 32 | } 33 | 34 | for (int i = 0; i < graph.ordered_node_ids.size(); i++) { 35 | if (!is_source[i] and !is_sink[i]) { 36 | std::cout << graph.ordered_node_labels[i] << "\n"; 37 | std::cerr << graph.ordered_node_ids[i] << "\n"; 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /tools/efg-gaf-splitter/Makefile: -------------------------------------------------------------------------------- 1 | CPPFLAGS=-Ofast -march=native --std=c++20 2 | #CPPFLAGS=-g -O0 --std=c++20 3 | 4 | all : efg-gaf-splitter 5 | 6 | efg-gaf-splitter : efg-gaf-splitter.cpp efg.hpp command-line-parsing/cmdline.h command-line-parsing/cmdline.c 7 | g++ $(CPPFLAGS) \ 8 | efg-gaf-splitter.cpp command-line-parsing/cmdline.c \ 9 | -o efg-gaf-splitter 10 | 11 | # uncomment for development 12 | #command-line-parsing/cmdline%c command-line-parsing/cmdline%h : command-line-parsing/config.ggo 13 | # gengetopt \ 14 | # --input=./command-line-parsing/config.ggo \ 15 | # --output-dir=./command-line-parsing/ \ 16 | # --unnamed-opts 17 | 18 | .PHONY : clean all cleanall 19 | 20 | # uncomment for development 21 | #clean : 22 | # rm -Rf command-line-parsing/cmdline.c command-line-parsing/cmdline.h 23 | #cleanall : 24 | # rm -Rf efg-gaf-splitter command-line-parsing/cmdline.c command-line-parsing/cmdline.h 25 | cleanall : 26 | rm -Rf efg-gaf-splitter 27 | -------------------------------------------------------------------------------- /tools/efg-gaf-splitter/README.md: -------------------------------------------------------------------------------- 1 | # efg-gaf-splitter 2 | Program to split GAF exact matches 
into node matches that are valid GraphAligner seeds and, if requested, flip them to the reverse complement representation.
*/ 46 | unsigned int version_given ; /**< @brief Whether version was given. */ 47 | unsigned int sort_given ; /**< @brief Whether sort was given. */ 48 | 49 | char **inputs ; /**< @brief unnamed options (options without names) */ 50 | unsigned inputs_num ; /**< @brief unnamed options number */ 51 | } ; 52 | 53 | /** @brief The additional parameters to pass to parser functions */ 54 | struct cmdline_parser_params 55 | { 56 | int override; /**< @brief whether to override possibly already present options (default 0) */ 57 | int initialize; /**< @brief whether to initialize the option structure gengetopt_args_info (default 1) */ 58 | int check_required; /**< @brief whether to check that all required options were provided (default 1) */ 59 | int check_ambiguity; /**< @brief whether to check for options already specified in the option structure gengetopt_args_info (default 0) */ 60 | int print_errors; /**< @brief whether getopt_long should print an error message for a bad option (default 1) */ 61 | } ; 62 | 63 | /** @brief the purpose string of the program */ 64 | extern const char *gengetopt_args_info_purpose; 65 | /** @brief the usage string of the program */ 66 | extern const char *gengetopt_args_info_usage; 67 | /** @brief the description string of the program */ 68 | extern const char *gengetopt_args_info_description; 69 | /** @brief all the lines making the help output */ 70 | extern const char *gengetopt_args_info_help[]; 71 | 72 | /** 73 | * The command line parser 74 | * @param argc the number of command line options 75 | * @param argv the command line options 76 | * @param args_info the structure where option information will be stored 77 | * @return 0 if everything went fine, NON 0 if an error took place 78 | */ 79 | int cmdline_parser (int argc, char **argv, 80 | struct gengetopt_args_info *args_info); 81 | 82 | /** 83 | * The command line parser (version with additional parameters - deprecated) 84 | * @param argc the number of command line options 85 | * 
@param argv the command line options 86 | * @param args_info the structure where option information will be stored 87 | * @param override whether to override possibly already present options 88 | * @param initialize whether to initialize the option structure my_args_info 89 | * @param check_required whether to check that all required options were provided 90 | * @return 0 if everything went fine, NON 0 if an error took place 91 | * @deprecated use cmdline_parser_ext() instead 92 | */ 93 | int cmdline_parser2 (int argc, char **argv, 94 | struct gengetopt_args_info *args_info, 95 | int override, int initialize, int check_required); 96 | 97 | /** 98 | * The command line parser (version with additional parameters) 99 | * @param argc the number of command line options 100 | * @param argv the command line options 101 | * @param args_info the structure where option information will be stored 102 | * @param params additional parameters for the parser 103 | * @return 0 if everything went fine, NON 0 if an error took place 104 | */ 105 | int cmdline_parser_ext (int argc, char **argv, 106 | struct gengetopt_args_info *args_info, 107 | struct cmdline_parser_params *params); 108 | 109 | /** 110 | * Save the contents of the option struct into an already open FILE stream. 111 | * @param outfile the stream where to dump options 112 | * @param args_info the option struct to dump 113 | * @return 0 if everything went fine, NON 0 if an error took place 114 | */ 115 | int cmdline_parser_dump(FILE *outfile, 116 | struct gengetopt_args_info *args_info); 117 | 118 | /** 119 | * Save the contents of the option struct into a (text) file. 
120 | * This file can be read by the config file parser (if generated by gengetopt) 121 | * @param filename the file where to save 122 | * @param args_info the option struct to save 123 | * @return 0 if everything went fine, NON 0 if an error took place 124 | */ 125 | int cmdline_parser_file_save(const char *filename, 126 | struct gengetopt_args_info *args_info); 127 | 128 | /** 129 | * Print the help 130 | */ 131 | void cmdline_parser_print_help(void); 132 | /** 133 | * Print the version 134 | */ 135 | void cmdline_parser_print_version(void); 136 | 137 | /** 138 | * Initializes all the fields a cmdline_parser_params structure 139 | * to their default values 140 | * @param params the structure to initialize 141 | */ 142 | void cmdline_parser_params_init(struct cmdline_parser_params *params); 143 | 144 | /** 145 | * Allocates dynamically a cmdline_parser_params structure and initializes 146 | * all its fields to their default values 147 | * @return the created and initialized cmdline_parser_params structure 148 | */ 149 | struct cmdline_parser_params *cmdline_parser_params_create(void); 150 | 151 | /** 152 | * Initializes the passed gengetopt_args_info structure's fields 153 | * (also set default values for options that have a default) 154 | * @param args_info the structure to initialize 155 | */ 156 | void cmdline_parser_init (struct gengetopt_args_info *args_info); 157 | /** 158 | * Deallocates the string fields of the gengetopt_args_info structure 159 | * (but does not deallocate the structure itself) 160 | * @param args_info the structure to deallocate 161 | */ 162 | void cmdline_parser_free (struct gengetopt_args_info *args_info); 163 | 164 | /** 165 | * Checks that all the required options were specified 166 | * @param args_info the structure to check 167 | * @param prog_name the name of the program that will be used to print 168 | * possible errors 169 | * @return 170 | */ 171 | int cmdline_parser_required (struct gengetopt_args_info *args_info, 172 | const 
char *prog_name); 173 | 174 | 175 | #ifdef __cplusplus 176 | } 177 | #endif /* __cplusplus */ 178 | #endif /* CMDLINE_H */ 179 | -------------------------------------------------------------------------------- /tools/efg-gaf-splitter/command-line-parsing/config.ggo: -------------------------------------------------------------------------------- 1 | version "devel" 2 | package "efg-gaf-splitter" 3 | purpose "Program to split GAF exact matches into node matches that are valid GraphAligner seeds and eventually flip to the reverse complement representation" 4 | usage "efg-gaf-splitter graph.gfa seeds.gaf" 5 | 6 | description "The program takes in input a GFA graph and a set of GAF exact matches, and outputs in stdout the matches split into node matches, filtering node matches of length 1. If the query id in the GAF entries starts with prefix 'rev_', the match is considered to be between the reverse complement of the read and the graph: such prefix is removed and the GAF entries in output are flipped to be between the forward strand of the read and the reverse complement nodes of the graph." 
7 | 8 | option "sort" - "Gather and sort the anchors by read" flag off 9 | -------------------------------------------------------------------------------- /tools/efg-gaf-splitter/efg-gaf-splitter.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "command-line-parsing/cmdline.h" // gengetopt-generated parser 11 | #include "efg.hpp" 12 | 13 | using std::string; 14 | 15 | int main(int argc, char* argv[]) 16 | { 17 | gengetopt_args_info argsinfo; 18 | if (cmdline_parser(argc, argv, &argsinfo) != 0) exit(1); 19 | 20 | if (argsinfo.inputs_num == 0) 21 | {cmdline_parser_print_help(); exit(1);}; 22 | if (argsinfo.inputs_num == 1) 23 | {std::cerr << argv[0] << ": missing GAF file" << std::endl; exit(1);}; 24 | if (argsinfo.inputs_num > 2) 25 | {std::cerr << argv[0] << ": too many arguments" << std::endl; exit(1);}; 26 | 27 | // open files 28 | std::filesystem::path graphpath {argsinfo.inputs[0]}; 29 | std::ifstream graphfs {graphpath}; 30 | if (!graphfs) {std::cerr << "Error opening graph file " << graphpath << "." << std::endl; exit(1);}; 31 | 32 | std::filesystem::path gafpath {argsinfo.inputs[1]}; 33 | std::ifstream gaffs {gafpath}; 34 | if (!gaffs) {std::cerr << "Error opening GAF file " << gafpath << "." << std::endl; exit(1);}; 35 | 36 | std::cerr << "Reading the graph..." << std::flush; 37 | Elasticfoundergraph graph(graphfs); 38 | std::cerr << " done." << std::endl; 39 | 40 | if (argsinfo.sort_flag) { 41 | std::cerr << "Reading the seeds..." << std::flush; 42 | vector> seeds = read_gaf(gaffs, graph); 43 | std::cerr << " done." << std::endl; 44 | 45 | std::cerr << "Splitting the seeds..." 
<< std::flush; 46 | for (auto &patternseeds : seeds) { 47 | for (auto &a : patternseeds) { 48 | for (auto &b : a.split_single_graphaligner(graph)) { 49 | if (b.get_query_id().find("rev_") != std::string::npos) { 50 | b.reverse(); 51 | } 52 | std::cout << b.to_string(graph) << std::endl; 53 | } 54 | } 55 | } 56 | std::cerr << " done." << std::endl; 57 | } else { 58 | std::cerr << "Reading and splitting the seeds..." << std::flush; 59 | GAFAnchor seed; 60 | while (read_gaf_single(gaffs, graph, seed)) { 61 | for (auto &b : seed.split_single_graphaligner(graph)) { 62 | if (b.get_query_id().find("rev_") != std::string::npos) { 63 | b.reverse(); 64 | } 65 | std::cout << b.to_string(graph) << std::endl; 66 | } 67 | } 68 | std::cerr << " done." << std::endl; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /tools/efg-locate/Makefile: -------------------------------------------------------------------------------- 1 | CPPFLAGS=-Ofast -march=native --std=c++20 -pthread 2 | #CPPFLAGS=-g -O0 --std=c++20 -pthread 3 | HEADERS=-I ../sdsl-lite-v3/include -I ../concurrentqueue 4 | 5 | all : efg-locate 6 | 7 | efg-locate : efg-locate.cpp efg.hpp algo.cpp command-line-parsing/cmdline.h command-line-parsing/cmdline.c 8 | g++ $(CPPFLAGS) $(HEADERS) \ 9 | efg-locate.cpp command-line-parsing/cmdline.c \ 10 | -o efg-locate 11 | 12 | # uncomment for development 13 | #command-line-parsing/cmdline%c command-line-parsing/cmdline%h : command-line-parsing/config.ggo 14 | # gengetopt \ 15 | # --input=./command-line-parsing/config.ggo \ 16 | # --output-dir=./command-line-parsing/ \ 17 | # --unnamed-opts 18 | 19 | .PHONY : clean all cleanall 20 | 21 | #clean : 22 | # rm -Rf command-line-parsing/cmdline.c command-line-parsing/cmdline.h 23 | #cleanall : 24 | # rm -Rf efg-locate command-line-parsing/cmdline.c command-line-parsing/cmdline.h 25 | cleanall : 26 | rm -Rf efg-locate 27 | 
-------------------------------------------------------------------------------- /tools/efg-locate/README.md: -------------------------------------------------------------------------------- 1 | # efg-locate 2 | Perform exact or approximate pattern matching on Elastic Founder Graphs. 3 | 4 | ## TODO 5 | - documentation 6 | - investigate I/O bottleneck with high thread number (short-read-exact-match experiment) 7 | - more tests 8 | -------------------------------------------------------------------------------- /tools/efg-locate/command-line-parsing/config.ggo: -------------------------------------------------------------------------------- 1 | version "0.1" 2 | package "efg-locate" 3 | purpose "Program to perform exact and approximate pattern matching on indexable Elastic Founder Graphs." 4 | usage "efg-locate graph.gfa patterns.fasta {paths.gaf,seeds.gaf}" 5 | 6 | description "The program takes in input an indexable Elastic Founder Graph (xGFA) and a set of patterns in FASTA format. In normal mode, the program searches for an exact occurrence of the patterns in the graph, the output is in GFA path format, and the exit value is 0 if all patterns occur and 1 otherwise. In approximate mode (--approximate), the program greedily searches for semi-repeat-free seeds between the patterns and the graph, and the output is in GAF format." 
7 | 8 | option "ignore-chars" - "Ignore these characters for the indexability property/pattern matching, breaking up each pattern into maximal strings of non-ignore characters" string optional 9 | option "approximate" - "Approximate pattern matching by greedily matching the pattern in the graph and starting over when the matching fails; output only the recognized matches spanning at least a full node" flag off 10 | option "approximate-edge-match-min-count" - "Consider any approximate occurrence valid if the pattern substring occurs at most COUNT times in the edges" int typestr = "COUNT" default = "0" optional 11 | option "approximate-edge-match-longest" - "Consider the COUNT longest substrings of the pattern appearing in the edges valid" int typestr = "COUNT" default = "0" optional 12 | option "approximate-edge-match-longest-max-count" - "Consider the COUNT longest substrings valid only if they appear less than N times in the edges" int typestr = "COUNT" default = "1000" optional 13 | option "approximate-min-coverage" - "Consider approximate occurrences as valid if they cover at least PERC % of the pattern" int typestr = "PERC" default = "0" optional hidden 14 | option "approximate-stats" - "Output statistics for each read in stdout" flag off 15 | option "reverse-complement" - "Match also the reverse complement of the patterns and output the results as a reverse graph path" flag off 16 | option "rename-reverse-complement" - "When matching the reverse complement of patterns, consider them as distinct patterns by prepending 'rev_' to their names" flag off 17 | option "split-output-matches" - "In approximate mode (--approximate), split long matches into node matches" flag off 18 | option "split-output-matches-graphaligner" - "Same as --split-output-matches, but filter out node matches of length 1 (for use with GraphAligner --extend)" flag off 19 | option "split-keep-edge-matches" - "In approximate mode and using option --split-output-matches or
--split-output-matches-graphaligner, do not split edge matches" flag off 20 | option "threads" t "Number of compute threads" long typestr = "THREADNUM" default = "-1" optional 21 | option "overwrite" - "Overwrite the output file, if it exists" flag off 22 | -------------------------------------------------------------------------------- /tools/efg-locate/efg-locate.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "efg-locate.hpp" 11 | #include "command-line-parsing/cmdline.h" // gengetopt-generated parser 12 | #include "efg.hpp" 13 | #include "algo.cpp" 14 | 15 | //#define LOCATE_DEBUG 16 | 17 | using namespace efg_locate; 18 | using std::string, std::max; 19 | 20 | int main(int argc, char* argv[]) 21 | { 22 | gengetopt_args_info argsinfo; 23 | if (cmdline_parser(argc, argv, &argsinfo) != 0) exit(1); 24 | 25 | if (argsinfo.inputs_num == 0) 26 | {cmdline_parser_print_help(); exit(1);}; 27 | if (argsinfo.inputs_num == 1) 28 | {std::cerr << argv[0] << ": missing patterns file" << std::endl; exit(1);}; 29 | if (argsinfo.inputs_num == 2) 30 | {std::cerr << argv[0] << ": missing output file" << std::endl; exit(1);}; 31 | if (argsinfo.inputs_num > 3) 32 | {std::cerr << argv[0] << ": too many arguments" << std::endl; exit(1);}; 33 | 34 | Params params; 35 | params.ignorechars = ((argsinfo.ignore_chars_arg != NULL) ? 
string(argsinfo.ignore_chars_arg): ""); 36 | params.reversecompl = argsinfo.reverse_complement_flag; 37 | params.threads = argsinfo.threads_arg; 38 | params.mincoverage = argsinfo.approximate_min_coverage_arg; 39 | params.reportstats = argsinfo.approximate_stats_flag; 40 | params.renamereversecomplement = argsinfo.rename_reverse_complement_flag; 41 | params.splitoutputmatches = argsinfo.split_output_matches_flag; 42 | params.splitoutputmatchesgraphaligner = argsinfo.split_output_matches_graphaligner_flag; 43 | params.splitkeepedgematches = argsinfo.split_keep_edge_matches_flag; 44 | params.edgemincount = argsinfo.approximate_edge_match_min_count_arg; 45 | params.edgelongestcount = argsinfo.approximate_edge_match_longest_arg; 46 | params.edgelongestcountmax = argsinfo.approximate_edge_match_longest_max_count_arg; 47 | 48 | // open graph file 49 | std::filesystem::path graphpath {argsinfo.inputs[0]}; 50 | params.graphfs = std::ifstream {graphpath}; 51 | if (!params.graphfs) {std::cerr << "Error opening graph file " << graphpath << "." << std::endl; exit(1);}; 52 | 53 | // check and open output file 54 | std::filesystem::path outputpath {argsinfo.inputs[2]}; 55 | if (std::filesystem::exists(outputpath)) { 56 | if (argsinfo.overwrite_flag) { 57 | params.outputfs = std::ofstream(outputpath, std::ios::out | std::ios::trunc); 58 | } else { 59 | std::cerr << "Error: output file already exists." << std::endl; 60 | exit(1); 61 | } 62 | } else { 63 | params.outputfs = std::ofstream(outputpath); 64 | } 65 | if (!params.outputfs) {std::cerr << "Error opening output file " << outputpath << "." << std::endl; exit(1);}; 66 | 67 | std::cerr << "Reading the graph..." << std::flush; 68 | Elasticfoundergraph graph(params.graphfs); 69 | std::cerr << " done." << std::endl; 70 | 71 | std::cerr << "Indexing the graph..." << std::flush; 72 | graph.init_pattern_matching_support(); 73 | std::cerr << " done." 
<< std::endl; 74 | 75 | #ifdef LOCATE_DEBUG 76 | std::cerr << "DEBUG graph is " << std::endl; 77 | graph.to_stream(&std::cerr); 78 | #endif 79 | 80 | // check and open patterns file 81 | std::filesystem::path patternspath {argsinfo.inputs[1]}; 82 | params.patternsfs = std::ifstream {patternspath}; 83 | if (!params.patternsfs) {std::cerr << "Error opening patterns file " << patternspath << "." << std::endl; exit(1);}; 84 | 85 | std::atomic input_done = false; 86 | std::thread inputworker; 87 | vector pattern_ids, patterns; 88 | if ((argsinfo.approximate_flag and params.threads > 0) or (!argsinfo.approximate_flag)) { 89 | std::cerr << "Locate" << std::endl; 90 | inputworker = std::thread(reader_worker, std::ref(params.patternsfs), std::ref(input_done)); 91 | } else { 92 | std::cerr << "Reading the patterns..." << std::flush; 93 | std::tie(pattern_ids, patterns) = read_patterns(params.patternsfs); 94 | std::cerr << " done." << std::endl; 95 | } 96 | 97 | #ifdef LOCATE_DEBUG 98 | std::cerr << std::endl; 99 | for (int i = 0; i < pattern_ids.size(); i++) { 100 | cerr << "DEBUG pattern:" << pattern_ids[i] << std::endl << patterns[i] << std::endl; 101 | } 102 | #endif 103 | 104 | int returnvalue = 0; 105 | // exact pattern matching 106 | if (!argsinfo.approximate_flag) { 107 | std::atomic workers_done = false; 108 | std::thread outputworker(writer_worker, std::ref(workers_done), std::ref(params)); 109 | vector workers; 110 | for (int i = 0; i < max(1,params.threads); i++) 111 | workers.push_back(std::thread(exact_worker, std::ref(graph), std::ref(pattern_ids), std::ref(patterns), std::ref(params), std::ref(input_done))); 112 | for (int i = 0; i < workers.size(); i++) 113 | workers[i].join(); 114 | workers_done = true; 115 | inputworker.join(); 116 | outputworker.join(); 117 | // sanity check? 
118 | outputworker = std::thread(writer_worker, std::ref(workers_done), std::ref(params)); 119 | outputworker.join(); 120 | return 0; 121 | } 122 | 123 | if (argsinfo.approximate_flag) { 124 | if (params.threads > 0) { 125 | std::atomic workers_done = false; 126 | std::thread outputworker(writer_worker, std::ref(workers_done), std::ref(params)); 127 | vector workers; 128 | for (int i = 0; i < params.threads; i++) 129 | workers.push_back(std::thread(approx_worker, std::ref(graph), std::ref(pattern_ids), std::ref(patterns), std::ref(params), std::ref(input_done))); 130 | for (int i = 0; i < workers.size(); i++) 131 | workers[i].join(); 132 | workers_done = true; 133 | inputworker.join(); 134 | outputworker.join(); 135 | // sanity check? 136 | outputworker = std::thread(writer_worker, std::ref(workers_done), std::ref(params)); 137 | outputworker.join(); 138 | } else { 139 | for (int p = 0; p < patterns.size(); p++) { 140 | vector matches; 141 | 142 | if (approx_efg_backward_search(graph, pattern_ids[p], patterns[p], params, matches) != 0) { 143 | if (params.splitoutputmatches) 144 | anchors_to_stream_split_single(¶ms.outputfs, graph, matches, params.splitkeepedgematches); 145 | else if (params.splitoutputmatchesgraphaligner) 146 | anchors_to_stream_split_single_graphaligner(¶ms.outputfs, graph, matches, params.splitkeepedgematches); 147 | else 148 | anchors_to_stream(¶ms.outputfs, graph, matches); 149 | } else { 150 | cerr << "Cannot find any semi-repeat-free match of " << pattern_ids[p] << std::endl; 151 | } 152 | } 153 | } 154 | 155 | return 0; 156 | } 157 | 158 | cerr << "Mode not implemented!" 
<< std::endl; 159 | return 1; 160 | } 161 | -------------------------------------------------------------------------------- /tools/efg-locate/efg-locate.hpp: -------------------------------------------------------------------------------- 1 | #ifndef EFG_LOCATE_HPP 2 | #define EFG_LOCATE_HPP 3 | 4 | #include 5 | #include 6 | 7 | using std::string, std::ifstream, std::ofstream; 8 | 9 | namespace efg_locate { 10 | struct Params { 11 | ifstream graphfs; 12 | ifstream patternsfs; 13 | ofstream outputfs; 14 | string ignorechars; 15 | bool reversecompl; 16 | int threads; 17 | int mincoverage; 18 | bool reportstats; 19 | bool renamereversecomplement; 20 | bool splitoutputmatches; 21 | bool splitoutputmatchesgraphaligner; 22 | bool splitkeepedgematches; 23 | int edgemincount; 24 | int edgelongestcount; 25 | int edgelongestcountmax; 26 | }; 27 | } 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /tools/efg-locate/test/inputs/indels.gfa: -------------------------------------------------------------------------------- 1 | M 2 36 2 | X 1 4 7 11 15 19 22 26 30 34 3 | B 1 1 2 1 1 1 2 1 1 2 4 | S 0 TAC 5 | S 1 AGT 6 | L 0 + 1 + 0M 7 | S 2 GAA 8 | S 3 GAAA 9 | L 1 + 2 + 0M 10 | L 1 + 3 + 0M 11 | S 4 CAAT 12 | L 2 + 4 + 0M 13 | L 3 + 4 + 0M 14 | S 5 GCTA 15 | L 4 + 5 + 0M 16 | S 6 GGG 17 | L 5 + 6 + 0M 18 | S 7 AGA 19 | S 8 AGAG 20 | L 6 + 7 + 0M 21 | L 6 + 8 + 0M 22 | S 9 GCTG 23 | L 7 + 9 + 0M 24 | L 8 + 9 + 0M 25 | S 10 CCTA 26 | L 9 + 10 + 0M 27 | S 11 TAT 28 | S 12 TT 29 | L 10 + 11 + 0M 30 | L 10 + 12 + 0M 31 | -------------------------------------------------------------------------------- /tools/efg-locate/test/inputs/indels_five_nodes.fasta: -------------------------------------------------------------------------------- 1 | >one 2 | ACAATGCTAGGGAGA 3 | >two 4 | TACAGTGAACAATGCTA 5 | >three 6 | GAGAGGCTGCCTATT 7 | -------------------------------------------------------------------------------- 
/tools/efg-locate/test/inputs/tcs_fig_5.gfa: -------------------------------------------------------------------------------- 1 | M 4 9 2 | X 1 4 7 9 3 | B 2 2 3 1 4 | S 0 AA 5 | S 1 ACC 6 | S 2 TTC 7 | S 3 TA 8 | L 0 + 2 + 0M 9 | L 0 + 3 + 0M 10 | L 1 + 3 + 0M 11 | S 4 CA 12 | S 5 G 13 | S 6 GC 14 | L 2 + 4 + 0M 15 | L 3 + 5 + 0M 16 | L 3 + 6 + 0M 17 | S 7 C 18 | L 4 + 7 + 0M 19 | L 5 + 7 + 0M 20 | L 6 + 7 + 0M 21 | P seq1 0+,2+,4+,7+ * 22 | P seq2 0+,3+,5+,7+ * 23 | P seq3 1+,3+,5+,7+ * 24 | P seq4 0+,3+,6+,7+ * 25 | -------------------------------------------------------------------------------- /tools/efg-locate/test/inputs/tcs_fig_5_approximate.fasta: -------------------------------------------------------------------------------- 1 | >seq 2 | AATTCCACGGGGGGGGGGGGGGGGGGGGGGGGGGGGAATAGC 3 | -------------------------------------------------------------------------------- /tools/efg-locate/test/inputs/tcs_fig_5_edge.fasta: -------------------------------------------------------------------------------- 1 | >edgeoccurrence1 2 | TTC 3 | >edgeoccurrence2 4 | ATT 5 | >edgeoccurrence3 6 | TAGC 7 | >edgeoccurrence4 8 | ACCTA 9 | -------------------------------------------------------------------------------- /tools/efg-locate/test/inputs/tcs_fig_5_four_nodes.fasta: -------------------------------------------------------------------------------- 1 | >fournode1 2 | AATAGCC 3 | >fournode2 4 | CTAGCC 5 | >fournode3 6 | ATTCCAC 7 | -------------------------------------------------------------------------------- /tools/efg-locate/test/inputs/tcs_fig_5_three_nodes.fasta: -------------------------------------------------------------------------------- 1 | >threenode1 2 | ATAGC 3 | >threenode2 4 | CCAC 5 | -------------------------------------------------------------------------------- /tools/efg-locate/test/outputs/indels_five_nodes.gfa: -------------------------------------------------------------------------------- 1 | one 15 0 15 + >3>4>5>6>7 18 3 18 0 0 255 2 | two 17 0 17 
+ >0>1>2>4>5 17 0 17 0 0 255 3 | three 15 0 15 + >6>8>9>10>12 17 2 17 0 0 255 4 | -------------------------------------------------------------------------------- /tools/efg-locate/test/outputs/tcs_fig_5_approximate.fasta: -------------------------------------------------------------------------------- 1 | seq 42 0 8 + >0>2>4>7 8 0 8 0 0 255 2 | seq 42 36 42 + >0>3>6 6 0 6 0 0 255 3 | -------------------------------------------------------------------------------- /tools/efg-locate/test/outputs/tcs_fig_5_edge.gfa: -------------------------------------------------------------------------------- 1 | edgeoccurrence1 3 0 3 + >2 3 0 3 0 0 255 2 | edgeoccurrence1 3 0 3 + >2 3 0 3 0 0 255 3 | edgeoccurrence2 3 0 3 + >0>2 5 1 4 0 0 255 4 | edgeoccurrence3 4 0 4 + >3>6 4 0 4 0 0 255 5 | edgeoccurrence4 5 0 5 + >1>3 5 0 5 0 0 255 6 | -------------------------------------------------------------------------------- /tools/efg-locate/test/outputs/tcs_fig_5_four_nodes.gfa: -------------------------------------------------------------------------------- 1 | fournode1 7 0 7 + >0>3>6>7 7 0 7 0 0 255 2 | fournode2 6 0 6 + >1>3>6>7 8 2 8 0 0 255 3 | fournode3 7 0 7 + >0>2>4>7 8 1 8 0 0 255 4 | -------------------------------------------------------------------------------- /tools/efg-locate/test/outputs/tcs_fig_5_three_nodes.gfa: -------------------------------------------------------------------------------- 1 | threenode1 5 0 5 + >0>3>6 6 1 6 0 0 255 2 | threenode2 4 0 4 + >2>4>7 6 2 6 0 0 255 3 | -------------------------------------------------------------------------------- /tools/efg-locate/test/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # barebones testing pipeline that matches the test files with the correct 3 | # outputs as specified by the following arrays 4 | locate=("tcs_fig_5.gfa tcs_fig_5_edge.fasta tcs_fig_5_edge.gfa" 5 | "tcs_fig_5.gfa tcs_fig_5_three_nodes.fasta tcs_fig_5_three_nodes.gfa" 6 | 
"tcs_fig_5.gfa tcs_fig_5_four_nodes.fasta tcs_fig_5_four_nodes.gfa" 7 | "indels.gfa indels_five_nodes.fasta indels_five_nodes.gfa") 8 | 9 | approximate=("tcs_fig_5.gfa tcs_fig_5_approximate.fasta tcs_fig_5_approximate.fasta") 10 | 11 | thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script 12 | outputfolder=$thisfolder/output-$(date -Iminutes) 13 | logfile=$outputfolder/log 14 | efglocate=$thisfolder/../efg-locate 15 | 16 | mkdir $outputfolder 17 | echo -n > $logfile 18 | 19 | for testfile in "${locate[@]}" 20 | do 21 | graph=$thisfolder/inputs/$(echo "$testfile" | tr -s " " | cut -d' ' -f1) 22 | patterns=$thisfolder/inputs/$(echo "$testfile" | tr -s " " | cut -d' ' -f2) 23 | correct=$thisfolder/outputs/$(echo "$testfile" | tr -s " " | cut -d' ' -f3) 24 | 25 | patternsbasename=$(basename $patterns) 26 | output=$outputfolder/${patternsbasename%.*}.gfa 27 | 28 | echo "$efglocate $graph $patterns $output" >> $logfile 29 | $efglocate $graph $patterns $output >> $logfile 2>> $logfile 30 | diff $output $correct > /dev/null 2>/dev/null 31 | 32 | exitcode=$? ; if [ $exitcode -ne 0 ] ; then 33 | echo "Test failed for files $graph $patterns $correct!" | tee -a $logfile 34 | exit 1 35 | fi 36 | done 37 | 38 | for testfile in "${approximate[@]}" 39 | do 40 | graph=$thisfolder/inputs/$(echo "$testfile" | tr -s " " | cut -d' ' -f1) 41 | patterns=$thisfolder/inputs/$(echo "$testfile" | tr -s " " | cut -d' ' -f2) 42 | correct=$thisfolder/outputs/$(echo "$testfile" | tr -s " " | cut -d' ' -f3) 43 | 44 | patternsbasename=$(basename $patterns) 45 | output=$outputfolder/${patternsbasename%.*}.gaf 46 | 47 | echo "$efglocate $graph $patterns $output" >> $logfile 48 | $efglocate --approximate $graph $patterns $output >> $logfile 2>> $logfile 49 | diff $output $correct > /dev/null 2>/dev/null 50 | 51 | exitcode=$? 
; if [ $exitcode -ne 0 ] ; then 52 | echo "Test failed for files $graph $patterns $correct!" | tee -a $logfile 53 | exit 1 54 | fi 55 | done 56 | -------------------------------------------------------------------------------- /tools/efg-simplify/Makefile: -------------------------------------------------------------------------------- 1 | CPPFLAGS=-Ofast -march=native --std=c++20 2 | #CPPFLAGS=-O0 -g --std=c++20 3 | 4 | all : efg-simplify 5 | 6 | efg-simplify : efg-simplify.cpp command-line-parsing/cmdline.h command-line-parsing/cmdline.c 7 | g++ $(CPPFLAGS) \ 8 | efg-simplify.cpp command-line-parsing/cmdline.c \ 9 | -o efg-simplify 10 | 11 | #command-line-parsing/cmdline%c command-line-parsing/cmdline%h : command-line-parsing/config.ggo 12 | # gengetopt \ 13 | # --input=./command-line-parsing/config.ggo \ 14 | # --output-dir=./command-line-parsing/ \ 15 | # --unnamed-opts 16 | 17 | .PHONY : clean all 18 | 19 | clean : 20 | rm -Rf efg-simplify 21 | # rm -Rf efg-simplify command-line-parsing/cmdline.{c,h} 22 | -------------------------------------------------------------------------------- /tools/efg-simplify/README.md: -------------------------------------------------------------------------------- 1 | # efg-simplify 2 | Program to transform and simplify an Elastic Founder Graph given in xGFA format by merging adjacent blocks that only contain parallel paths. 3 | 4 | ``` 5 | Usage: efg-simplify inputgraph.xgfa simplifiedgraph.xgfa 6 | Program to transform and simplify an Elastic Founder Graph given in xGFA 7 | format. 8 | 9 | The program takes an Elastic Founder Graph in xGFA format and merges adjacent 10 | blocks that only contain parallel paths. 11 | 12 | -h, --help Print help and exit 13 | -V, --version Print version and exit 14 | ``` 15 | 16 | ## GFA format (xGFA) 17 | See [here](https://github.com/algbio/founderblockgraphs/blob/master/xGFAspec.md).
18 | 19 | ## known issues 20 | 21 | - as we merge blocks, paths warp a little bit: the program extends the original paths up to the first non-simplified node at the beginning and end 22 | 23 | ## todo 24 | 25 | - tests 26 | - solve the paths extension 27 | -------------------------------------------------------------------------------- /tools/efg-simplify/command-line-parsing/cmdline.h: -------------------------------------------------------------------------------- 1 | /** @file cmdline.h 2 | * @brief The header file for the command line option parser 3 | * generated by GNU Gengetopt version 2.23 4 | * http://www.gnu.org/software/gengetopt. 5 | * DO NOT modify this file, since it can be overwritten 6 | * @author GNU Gengetopt */ 7 | 8 | #ifndef CMDLINE_H 9 | #define CMDLINE_H 10 | 11 | /* If we use autoconf. */ 12 | #ifdef HAVE_CONFIG_H 13 | #include "config.h" 14 | #endif 15 | 16 | #include /* for FILE */ 17 | 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif /* __cplusplus */ 21 | 22 | #ifndef CMDLINE_PARSER_PACKAGE 23 | /** @brief the program name (used for printing errors) */ 24 | #define CMDLINE_PARSER_PACKAGE "efg-simplify" 25 | #endif 26 | 27 | #ifndef CMDLINE_PARSER_PACKAGE_NAME 28 | /** @brief the complete program name (used for help and version) */ 29 | #define CMDLINE_PARSER_PACKAGE_NAME "efg-simplify" 30 | #endif 31 | 32 | #ifndef CMDLINE_PARSER_VERSION 33 | /** @brief the program version */ 34 | #define CMDLINE_PARSER_VERSION "devel" 35 | #endif 36 | 37 | /** @brief Where the command line options are stored */ 38 | struct gengetopt_args_info 39 | { 40 | const char *help_help; /**< @brief Print help and exit help description. */ 41 | const char *version_help; /**< @brief Print version and exit help description. */ 42 | int simplify_tunnels_flag; /**< @brief for each maximal range of blocks to simplify, do not simplify first and last (default=off). 
*/ 43 | const char *simplify_tunnels_help; /**< @brief for each maximal range of blocks to simplify, do not simplify first and last help description. */ 44 | int rename_nodes_flag; /**< @brief rename all node IDs to 0-indexed integers (default=off). */ 45 | const char *rename_nodes_help; /**< @brief rename all node IDs to 0-indexed integers help description. */ 46 | int ignore_only_flag; /**< @brief consider in the simplification only blocks containing ignore characters (default=off). */ 47 | const char *ignore_only_help; /**< @brief consider in the simplification only blocks containing ignore characters help description. */ 48 | char * ignore_chars_arg; /**< @brief Ignore characters. */ 49 | char * ignore_chars_orig; /**< @brief Ignore characters original value given at command line. */ 50 | const char *ignore_chars_help; /**< @brief Ignore characters help description. */ 51 | int overwrite_flag; /**< @brief overwrite the output file, if it exists (default=off). */ 52 | const char *overwrite_help; /**< @brief overwrite the output file, if it exists help description. */ 53 | 54 | unsigned int help_given ; /**< @brief Whether help was given. */ 55 | unsigned int version_given ; /**< @brief Whether version was given. */ 56 | unsigned int simplify_tunnels_given ; /**< @brief Whether simplify-tunnels was given. */ 57 | unsigned int rename_nodes_given ; /**< @brief Whether rename-nodes was given. */ 58 | unsigned int ignore_only_given ; /**< @brief Whether ignore-only was given. */ 59 | unsigned int ignore_chars_given ; /**< @brief Whether ignore-chars was given. */ 60 | unsigned int overwrite_given ; /**< @brief Whether overwrite was given. 
*/ 61 | 62 | char **inputs ; /**< @brief unnamed options (options without names) */ 63 | unsigned inputs_num ; /**< @brief unnamed options number */ 64 | } ; 65 | 66 | /** @brief The additional parameters to pass to parser functions */ 67 | struct cmdline_parser_params 68 | { 69 | int override; /**< @brief whether to override possibly already present options (default 0) */ 70 | int initialize; /**< @brief whether to initialize the option structure gengetopt_args_info (default 1) */ 71 | int check_required; /**< @brief whether to check that all required options were provided (default 1) */ 72 | int check_ambiguity; /**< @brief whether to check for options already specified in the option structure gengetopt_args_info (default 0) */ 73 | int print_errors; /**< @brief whether getopt_long should print an error message for a bad option (default 1) */ 74 | } ; 75 | 76 | /** @brief the purpose string of the program */ 77 | extern const char *gengetopt_args_info_purpose; 78 | /** @brief the usage string of the program */ 79 | extern const char *gengetopt_args_info_usage; 80 | /** @brief the description string of the program */ 81 | extern const char *gengetopt_args_info_description; 82 | /** @brief all the lines making the help output */ 83 | extern const char *gengetopt_args_info_help[]; 84 | 85 | /** 86 | * The command line parser 87 | * @param argc the number of command line options 88 | * @param argv the command line options 89 | * @param args_info the structure where option information will be stored 90 | * @return 0 if everything went fine, NON 0 if an error took place 91 | */ 92 | int cmdline_parser (int argc, char **argv, 93 | struct gengetopt_args_info *args_info); 94 | 95 | /** 96 | * The command line parser (version with additional parameters - deprecated) 97 | * @param argc the number of command line options 98 | * @param argv the command line options 99 | * @param args_info the structure where option information will be stored 100 | * @param override whether to 
override possibly already present options 101 | * @param initialize whether to initialize the option structure my_args_info 102 | * @param check_required whether to check that all required options were provided 103 | * @return 0 if everything went fine, NON 0 if an error took place 104 | * @deprecated use cmdline_parser_ext() instead 105 | */ 106 | int cmdline_parser2 (int argc, char **argv, 107 | struct gengetopt_args_info *args_info, 108 | int override, int initialize, int check_required); 109 | 110 | /** 111 | * The command line parser (version with additional parameters) 112 | * @param argc the number of command line options 113 | * @param argv the command line options 114 | * @param args_info the structure where option information will be stored 115 | * @param params additional parameters for the parser 116 | * @return 0 if everything went fine, NON 0 if an error took place 117 | */ 118 | int cmdline_parser_ext (int argc, char **argv, 119 | struct gengetopt_args_info *args_info, 120 | struct cmdline_parser_params *params); 121 | 122 | /** 123 | * Save the contents of the option struct into an already open FILE stream. 124 | * @param outfile the stream where to dump options 125 | * @param args_info the option struct to dump 126 | * @return 0 if everything went fine, NON 0 if an error took place 127 | */ 128 | int cmdline_parser_dump(FILE *outfile, 129 | struct gengetopt_args_info *args_info); 130 | 131 | /** 132 | * Save the contents of the option struct into a (text) file. 
133 | * This file can be read by the config file parser (if generated by gengetopt) 134 | * @param filename the file where to save 135 | * @param args_info the option struct to save 136 | * @return 0 if everything went fine, NON 0 if an error took place 137 | */ 138 | int cmdline_parser_file_save(const char *filename, 139 | struct gengetopt_args_info *args_info); 140 | 141 | /** 142 | * Print the help 143 | */ 144 | void cmdline_parser_print_help(void); 145 | /** 146 | * Print the version 147 | */ 148 | void cmdline_parser_print_version(void); 149 | 150 | /** 151 | * Initializes all the fields a cmdline_parser_params structure 152 | * to their default values 153 | * @param params the structure to initialize 154 | */ 155 | void cmdline_parser_params_init(struct cmdline_parser_params *params); 156 | 157 | /** 158 | * Allocates dynamically a cmdline_parser_params structure and initializes 159 | * all its fields to their default values 160 | * @return the created and initialized cmdline_parser_params structure 161 | */ 162 | struct cmdline_parser_params *cmdline_parser_params_create(void); 163 | 164 | /** 165 | * Initializes the passed gengetopt_args_info structure's fields 166 | * (also set default values for options that have a default) 167 | * @param args_info the structure to initialize 168 | */ 169 | void cmdline_parser_init (struct gengetopt_args_info *args_info); 170 | /** 171 | * Deallocates the string fields of the gengetopt_args_info structure 172 | * (but does not deallocate the structure itself) 173 | * @param args_info the structure to deallocate 174 | */ 175 | void cmdline_parser_free (struct gengetopt_args_info *args_info); 176 | 177 | /** 178 | * Checks that all the required options were specified 179 | * @param args_info the structure to check 180 | * @param prog_name the name of the program that will be used to print 181 | * possible errors 182 | * @return 183 | */ 184 | int cmdline_parser_required (struct gengetopt_args_info *args_info, 185 | const 
char *prog_name); 186 | 187 | 188 | #ifdef __cplusplus 189 | } 190 | #endif /* __cplusplus */ 191 | #endif /* CMDLINE_H */ 192 | -------------------------------------------------------------------------------- /tools/efg-simplify/command-line-parsing/config.ggo: -------------------------------------------------------------------------------- 1 | version "devel" 2 | package "efg-simplify" 3 | purpose "Program to transform and simplify an Elastic Founder Graph given in xGFA format." 4 | usage "efg-simplify inputgraph.xgfa simplifiedgraph.xgfa" 5 | 6 | description "The program takes an Elastic Founder Graph in xGFA format and merges adjacent blocks that only contain parallel paths." 7 | 8 | option "simplify-tunnels" t "for each maximal range of blocks to simplify, do not simplify first and last" flag off 9 | option "rename-nodes" r "rename all node IDs to 0-indexed integers" flag off 10 | option "ignore-only" n "consider in the simplification only blocks containing ignore characters" flag off 11 | option "ignore-chars" - "Ignore characters" string optional 12 | option "overwrite" - "overwrite the output file, if it exists" flag off 13 | --------------------------------------------------------------------------------