├── .github └── workflows │ └── ubuntu-distcheck.yml ├── .gitignore ├── .readthedocs.yaml ├── Configuration ├── adapter_list.txt ├── contaminant_list.txt └── limits.txt ├── LICENSE ├── Makefile ├── Makefile.am ├── README.md ├── autogen.sh ├── benchmark ├── README.md ├── download_files.sh ├── outs │ ├── falco │ │ ├── README.md │ │ ├── SRR10124060_rnaseq.fastq_qc_data.txt │ │ └── SRR10124060_rnaseq.fastq_report.html │ ├── fastp │ │ ├── SRR10124060_rnaseq.fastq.html │ │ └── SRR10124060_rnaseq.fastq.json │ └── fastqc │ │ ├── README.md │ │ └── SRR10124060_rnaseq_fastqc │ │ ├── fastqc_data.txt │ │ └── fastqc_report.html ├── run_all_falco_tests.sh ├── run_all_fastp_tests.sh ├── run_all_fastqc_tests.sh ├── run_all_htqc_tests.sh └── tests │ └── README.md ├── configure.ac ├── documentation ├── README.md ├── docs │ └── index.md └── mkdocs.yml ├── example.bam ├── example.fq ├── example.sam ├── m4 ├── ax_cxx_check_lib.m4 ├── ax_cxx_compile_stdcxx.m4 └── ax_cxx_compile_stdcxx_17.m4 ├── src ├── FalcoConfig.cpp ├── FalcoConfig.hpp ├── FastqStats.cpp ├── FastqStats.hpp ├── HtmlMaker.cpp ├── HtmlMaker.hpp ├── Makefile ├── Module.cpp ├── Module.hpp ├── OptionParser.cpp ├── OptionParser.hpp ├── StreamReader.cpp ├── StreamReader.hpp ├── aux.hpp ├── falco.cpp ├── falcodiff.cpp ├── smithlab_utils.cpp └── smithlab_utils.hpp └── test ├── falco.test ├── md5sum.txt └── test_data.tgz /.github/workflows/ubuntu-distcheck.yml: -------------------------------------------------------------------------------- 1 | name: Falco distribution check on Ubuntu 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | branches: [ "master" ] 7 | push: 8 | branches: [ "master" ] 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | with: 16 | submodules: recursive 17 | - name: Update packages 18 | run: sudo apt-get update 19 | - name: Install dependencies 20 | run: sudo apt-get install -y libhts-dev 21 | - name: Generate configure script 22 | run: 
./autogen.sh 23 | - name: configure 24 | run: ./configure --enable-hts 25 | - name: Build falco 26 | run: make 27 | - name: Check the distribution 28 | run: make distcheck 29 | - name: Cleanup after the build 30 | run: make distclean 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin 2 | *.o 3 | *.fastq 4 | *.fastq.gz 5 | *.bam 6 | *.sam 7 | *.txt 8 | *.html 9 | benchmark 10 | # Autotools generated files 11 | Makefile.in 12 | a.awk 13 | aclocal.m4 14 | autom4te.cache 15 | build.sh 16 | config.h 17 | config.h.in 18 | config.log 19 | config.status 20 | configure 21 | depcomp 22 | install-sh 23 | meta.yaml 24 | missing 25 | src/.deps 26 | src/.dirstamp 27 | stamp-h1 28 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # falco: quality control for sequencing read files 2 | # 3 | # Copyright (C) 2019-2024 Guilherme De Sena Brandine and 4 | # Andrew D. Smith 5 | # Authors: Guilherme De Sena Brandine, Andrew Smith 6 | # 7 | # This program is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 
16 | 17 | # Read the Docs configuration file for MkDocs projects 18 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 19 | 20 | # Required 21 | version: 2 22 | 23 | # Set the version of Python and other tools you might need 24 | build: 25 | os: ubuntu-22.04 26 | tools: 27 | python: "3.12" 28 | 29 | mkdocs: 30 | configuration: documentation/mkdocs.yml 31 | -------------------------------------------------------------------------------- /Configuration/adapter_list.txt: -------------------------------------------------------------------------------- 1 | # This file contains a set of sequence fragments which will be explicitly 2 | # searched against your library. The reporting will be similar to the 3 | # Kmer plot, except that every sequence in this list will be shown so 4 | # you can use this to judge the level of adapter read-through even if those 5 | # adapter sequences aren't picked out by the Kmer module. 6 | # 7 | # Since every sequence here will be analysed and the results plotted it 8 | # doesn't make any sense to include duplicate sequences, or to add too 9 | # many sequences since your plot will end up a mess. 10 | # 11 | # You can add more sequences to the file by putting one line per entry 12 | # and specifying a name[tab]sequence. If the contaminant you add is 13 | # likely to be of use to others please consider sending it to the FastQ 14 | # authors, either via a bug report at www.bioinformatics.babraham.ac.uk/bugzilla/ 15 | # or by directly emailing simon.andrews@babraham.ac.uk so other users of 16 | # the program can benefit. 17 | # 18 | # For the time being it's going to be easier to interpret this plot if all 19 | # of the sequences provided are the same length, so we've gone with 12bp 20 | # fragments for now. 
21 | 22 | Illumina Universal Adapter AGATCGGAAGAG 23 | Illumina Small RNA 3' Adapter TGGAATTCTCGG 24 | Illumina Small RNA 5' Adapter GATCGTCGGACT 25 | Nextera Transposase Sequence CTGTCTCTTATA 26 | PolyA AAAAAAAAAAAA 27 | PolyG GGGGGGGGGGGG -------------------------------------------------------------------------------- /Configuration/contaminant_list.txt: -------------------------------------------------------------------------------- 1 | # This file contains a list of potential contaminants which are 2 | # frequently found in high throughput sequencing reactions. These 3 | # are mostly sequences of adapters / primers used in the various 4 | # sequencing chemistries. 5 | # 6 | # Please DO NOT rely on these sequences to design your own oligos, some 7 | # of them are truncated at ambiguous positions, and none of them are 8 | # definitive sequences from the manufacturers so don't blame us if you 9 | # try to use them and they don't work. 10 | # 11 | # You can add more sequences to the file by putting one line per entry 12 | # and specifying a name[tab]sequence. If the contaminant you add is 13 | # likely to be of use to others please consider sending it to the FastQ 14 | # authors, either via a bug report at www.bioinformatics.babraham.ac.uk/bugzilla/ 15 | # or by directly emailing simon.andrews@babraham.ac.uk so other users of 16 | # the program can benefit. 
17 | 18 | Illumina Single End Adapter 1 GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG 19 | Illumina Single End Adapter 2 CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT 20 | Illumina Single End PCR Primer 1 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 21 | Illumina Single End PCR Primer 2 CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT 22 | Illumina Single End Sequencing Primer ACACTCTTTCCCTACACGACGCTCTTCCGATCT 23 | 24 | Illumina Paired End Adapter 1 ACACTCTTTCCCTACACGACGCTCTTCCGATCT 25 | Illumina Paired End Adapter 2 GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG 26 | Illumina Paried End PCR Primer 1 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 27 | Illumina Paired End PCR Primer 2 CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT 28 | Illumina Paried End Sequencing Primer 1 ACACTCTTTCCCTACACGACGCTCTTCCGATCT 29 | Illumina Paired End Sequencing Primer 2 CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT 30 | 31 | Illumina DpnII expression Adapter 1 ACAGGTTCAGAGTTCTACAGTCCGAC 32 | Illumina DpnII expression Adapter 2 CAAGCAGAAGACGGCATACGA 33 | Illumina DpnII expression PCR Primer 1 CAAGCAGAAGACGGCATACGA 34 | Illumina DpnII expression PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA 35 | Illumina DpnII expression Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC 36 | 37 | Illumina NlaIII expression Adapter 1 ACAGGTTCAGAGTTCTACAGTCCGACATG 38 | Illumina NlaIII expression Adapter 2 CAAGCAGAAGACGGCATACGA 39 | Illumina NlaIII expression PCR Primer 1 CAAGCAGAAGACGGCATACGA 40 | Illumina NlaIII expression PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA 41 | Illumina NlaIII expression Sequencing Primer CCGACAGGTTCAGAGTTCTACAGTCCGACATG 42 | 43 | Illumina Small RNA Adapter 1 GTTCAGAGTTCTACAGTCCGACGATC 44 | Illumina Small RNA Adapter 2 TGGAATTCTCGGGTGCCAAGG 45 | Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA 46 | Illumina Small RNA PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA 47 | Illumina Small RNA Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC 48 | 49 | Illumina 
Multiplexing Adapter 1 GATCGGAAGAGCACACGTCT 50 | Illumina Multiplexing Adapter 2 ACACTCTTTCCCTACACGACGCTCTTCCGATCT 51 | Illumina Multiplexing PCR Primer 1.01 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 52 | Illumina Multiplexing PCR Primer 2.01 GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 53 | Illumina Multiplexing Read1 Sequencing Primer ACACTCTTTCCCTACACGACGCTCTTCCGATCT 54 | Illumina Multiplexing Index Sequencing Primer GATCGGAAGAGCACACGTCTGAACTCCAGTCAC 55 | Illumina Multiplexing Read2 Sequencing Primer GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 56 | 57 | Illumina PCR Primer Index 1 CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC 58 | Illumina PCR Primer Index 2 CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC 59 | Illumina PCR Primer Index 3 CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC 60 | Illumina PCR Primer Index 4 CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC 61 | Illumina PCR Primer Index 5 CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC 62 | Illumina PCR Primer Index 6 CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC 63 | Illumina PCR Primer Index 7 CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC 64 | Illumina PCR Primer Index 8 CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC 65 | Illumina PCR Primer Index 9 CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC 66 | Illumina PCR Primer Index 10 CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC 67 | Illumina PCR Primer Index 11 CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC 68 | Illumina PCR Primer Index 12 CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC 69 | 70 | Illumina DpnII Gex Adapter 1 GATCGTCGGACTGTAGAACTCTGAAC 71 | Illumina DpnII Gex Adapter 1.01 ACAGGTTCAGAGTTCTACAGTCCGAC 72 | Illumina DpnII Gex Adapter 2 CAAGCAGAAGACGGCATACGA 73 | Illumina DpnII Gex Adapter 2.01 TCGTATGCCGTCTTCTGCTTG 74 | Illumina DpnII Gex PCR Primer 1 CAAGCAGAAGACGGCATACGA 75 | Illumina DpnII Gex PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA 76 | Illumina DpnII Gex Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC 77 | 78 | Illumina NlaIII Gex Adapter 1.01 
TCGGACTGTAGAACTCTGAAC 79 | Illumina NlaIII Gex Adapter 1.02 ACAGGTTCAGAGTTCTACAGTCCGACATG 80 | Illumina NlaIII Gex Adapter 2.01 CAAGCAGAAGACGGCATACGA 81 | Illumina NlaIII Gex Adapter 2.02 TCGTATGCCGTCTTCTGCTTG 82 | Illumina NlaIII Gex PCR Primer 1 CAAGCAGAAGACGGCATACGA 83 | Illumina NlaIII Gex PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA 84 | Illumina NlaIII Gex Sequencing Primer CCGACAGGTTCAGAGTTCTACAGTCCGACATG 85 | 86 | Illumina 5p RNA Adapter GTTCAGAGTTCTACAGTCCGACGATC 87 | Illumina RNA Adapter1 TGGAATTCTCGGGTGCCAAGG 88 | 89 | Illumina Small RNA 3p Adapter 1 ATCTCGTATGCCGTCTTCTGCTTG 90 | Illumina Small RNA PCR Primer 1 CAAGCAGAAGACGGCATACGA 91 | 92 | TruSeq Universal Adapter AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT 93 | TruSeq Adapter, Index 1 GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG 94 | TruSeq Adapter, Index 2 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG 95 | TruSeq Adapter, Index 3 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG 96 | TruSeq Adapter, Index 4 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG 97 | TruSeq Adapter, Index 5 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG 98 | TruSeq Adapter, Index 6 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG 99 | TruSeq Adapter, Index 7 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG 100 | TruSeq Adapter, Index 8 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG 101 | TruSeq Adapter, Index 9 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG 102 | TruSeq Adapter, Index 10 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG 103 | TruSeq Adapter, Index 11 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG 104 | TruSeq Adapter, Index 12 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG 105 | TruSeq Adapter, Index 13 GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACTCTCGTATGCCGTCTTCTGCTTG 106 | TruSeq Adapter, 
Index 14 GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTCTCGTATGCCGTCTTCTGCTTG 107 | TruSeq Adapter, Index 15 GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGTCTCGTATGCCGTCTTCTGCTTG 108 | TruSeq Adapter, Index 16 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCTCTCGTATGCCGTCTTCTGCTTG 109 | TruSeq Adapter, Index 18 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG 110 | TruSeq Adapter, Index 19 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACTCTCGTATGCCGTCTTCTGCTTG 111 | TruSeq Adapter, Index 20 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTCTCGTATGCCGTCTTCTGCTTG 112 | TruSeq Adapter, Index 21 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGTCTCGTATGCCGTCTTCTGCTTG 113 | TruSeq Adapter, Index 22 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTTCTCGTATGCCGTCTTCTGCTTG 114 | TruSeq Adapter, Index 23 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCACTCTTCTCGTATGCCGTCTTCTGCTTG 115 | TruSeq Adapter, Index 25 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATCTCGTATGCCGTCTTCTGCTTG 116 | TruSeq Adapter, Index 27 GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTCTCGTATGCCGTCTTCTGCTTG 117 | 118 | Illumina RNA RT Primer GCCTTGGCACCCGAGAATTCCA 119 | Illumina RNA PCR Primer AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA 120 | 121 | RNA PCR Primer, Index 1 CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 122 | RNA PCR Primer, Index 2 CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 123 | RNA PCR Primer, Index 3 CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 124 | RNA PCR Primer, Index 4 CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 125 | RNA PCR Primer, Index 5 CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 126 | RNA PCR Primer, Index 6 CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 127 | RNA PCR Primer, Index 7 CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 128 | RNA PCR Primer, Index 8 CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 129 | RNA PCR Primer, Index 9 
CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 130 | RNA PCR Primer, Index 10 CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 131 | RNA PCR Primer, Index 11 CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 132 | RNA PCR Primer, Index 12 CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 133 | RNA PCR Primer, Index 13 CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 134 | RNA PCR Primer, Index 14 CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 135 | RNA PCR Primer, Index 15 CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 136 | RNA PCR Primer, Index 16 CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 137 | RNA PCR Primer, Index 17 CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 138 | RNA PCR Primer, Index 18 CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 139 | RNA PCR Primer, Index 19 CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 140 | RNA PCR Primer, Index 20 CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 141 | RNA PCR Primer, Index 21 CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 142 | RNA PCR Primer, Index 22 CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 143 | RNA PCR Primer, Index 23 CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 144 | RNA PCR Primer, Index 24 CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 145 | RNA PCR Primer, Index 25 CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 146 | RNA PCR Primer, Index 26 CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 147 | RNA PCR Primer, Index 27 CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 148 | RNA PCR Primer, Index 28 CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 149 | RNA PCR Primer, Index 29 CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 150 | RNA PCR Primer, Index 30 
CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 151 | RNA PCR Primer, Index 31 CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 152 | RNA PCR Primer, Index 32 CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 153 | RNA PCR Primer, Index 33 CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 154 | RNA PCR Primer, Index 34 CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 155 | RNA PCR Primer, Index 35 CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 156 | RNA PCR Primer, Index 36 CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 157 | RNA PCR Primer, Index 37 CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 158 | RNA PCR Primer, Index 38 CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 159 | RNA PCR Primer, Index 39 CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 160 | RNA PCR Primer, Index 40 CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 161 | RNA PCR Primer, Index 41 CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 162 | RNA PCR Primer, Index 42 CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 163 | RNA PCR Primer, Index 43 CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 164 | RNA PCR Primer, Index 44 CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 165 | RNA PCR Primer, Index 45 CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 166 | RNA PCR Primer, Index 46 CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 167 | RNA PCR Primer, Index 47 CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 168 | RNA PCR Primer, Index 48 CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA 169 | 170 | ABI Dynabead EcoP Oligo CTGATCTAGAGGTACCGGATCCCAGCAGT 171 | ABI Solid3 Adapter A CTGCCCCGGGTTCCTCATTCTCTCAGCAGCATG 172 | ABI Solid3 Adapter B CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT 173 | ABI Solid3 5' AMP 
Primer CCACTACGCCTCCGCTTTCCTCTCTATG 174 | ABI Solid3 3' AMP Primer CTGCCCCGGGTTCCTCATTCT 175 | ABI Solid3 EF1 alpha Sense Primer CATGTGTGTTGAGAGCTTC 176 | ABI Solid3 EF1 alpha Antisense Primer GAAAACCAAAGTGGTCCAC 177 | ABI Solid3 GAPDH Forward Primer TTAGCACCCCTGGCCAAGG 178 | ABI Solid3 GAPDH Reverse Primer CTTACTCCTTGGAGGCCATG 179 | 180 | 181 | 182 | Clontech Universal Primer Mix Short CTAATACGACTCACTATAGGGC 183 | Clontech Universal Primer Mix Long CTAATACGACTCACTATAGGGCAAGCAGTGGTATCAACGCAGAGT 184 | Clontech SMARTer II A Oligonucleotide AAGCAGTGGTATCAACGCAGAGTAC 185 | Clontech SMART CDS Primer II A AAGCAGTGGTATCAACGCAGAGTACT 186 | 187 | -------------------------------------------------------------------------------- /Configuration/limits.txt: -------------------------------------------------------------------------------- 1 | # For each of the modules you can choose to not run that 2 | # module at all by setting the value below to 1 for the 3 | # modules you want to remove. 4 | duplication ignore 0 5 | kmer ignore 1 6 | n_content ignore 0 7 | overrepresented ignore 0 8 | quality_base ignore 0 9 | sequence ignore 0 10 | gc_sequence ignore 0 11 | quality_sequence ignore 0 12 | tile ignore 0 13 | sequence_length ignore 0 14 | adapter ignore 0 15 | 16 | # For the duplication module the value is the percentage 17 | # remaining after deduplication. Measured levels below 18 | # these limits trigger the warning / error. 
19 | duplication warn 70 20 | duplication error 50 21 | 22 | # For the kmer module the filter is on the -log10 binomial 23 | # pvalue for the most significant Kmer, so 5 would be 24 | # 10^-5 = p<0.00001 25 | kmer warn 2 26 | kmer error 5 27 | 28 | # For the N module the filter is on the percentage of Ns 29 | # at any position in the library 30 | n_content warn 5 31 | n_content error 20 32 | 33 | # For the overrepresented seqs the warn value sets the 34 | # threshold for the overrepresented sequences to be reported 35 | # at all as the proportion of the library which must be seen 36 | # as a single sequence 37 | overrepresented warn 0.1 38 | overrepresented error 1 39 | 40 | # The per base quality filter uses two values, one for the value 41 | # of the lower quartile, and the other for the value of the 42 | # median quality. Failing either of these will trigger the alert 43 | quality_base_lower warn 10 44 | quality_base_lower error 5 45 | quality_base_median warn 25 46 | quality_base_median error 20 47 | 48 | # The per base sequence content module tests the maximum deviation 49 | # between A and T or C and G 50 | sequence warn 10 51 | sequence error 20 52 | 53 | # The per sequence GC content tests the maximum deviation between 54 | # the theoretical distribution and the real distribution 55 | gc_sequence warn 15 56 | gc_sequence error 30 57 | 58 | # The per sequence quality module tests the phred score which is 59 | # most frequently observed 60 | quality_sequence warn 27 61 | quality_sequence error 20 62 | 63 | # The per tile module tests the maximum phred score loss between 64 | # and individual tile and the average for that base across all tiles 65 | tile warn 5 66 | tile error 10 67 | 68 | # The sequence length module tests are binary, so the values here 69 | # simply turn them on or off. The actual tests warn if you have 70 | # sequences of different length, and error if you have sequences 71 | # of zero length. 
72 | 73 | sequence_length warn 1 74 | sequence_length error 1 75 | 76 | # The adapter module's warnings and errors are based on the 77 | # percentage of reads in the library which have been observed 78 | # to contain an adapter associated Kmer at any point 79 | 80 | adapter warn 5 81 | adapter error 10 82 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2019-2020 Guilherme de Sena Brandine 2 | # 3 | # Authors: Andrew D. Smith 4 | # 5 | # This file is part of ABISMAL. 6 | # 7 | # ABISMAL is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # ABISMAL is distributed in the hope that it will be useful, but WITHOUT 13 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 14 | # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 15 | # License for more details. 16 | # 17 | 18 | SRC_ROOT=$(shell pwd) 19 | all: 20 | @make -C src SRC_ROOT=$(SRC_ROOT) all 21 | 22 | install: 23 | @make -C src SRC_ROOT=$(SRC_ROOT) install 24 | 25 | clean: 26 | @make -C src clean 27 | .PHONY: clean 28 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | # falco: quality control for sequencing read files 2 | # 3 | # Copyright (C) 2019 Guilherme De Sena Brandine and 4 | # Andrew D. 
Smith 5 | # Authors: Guilherme De Sena Brandine, Andrew Smith 6 | # 7 | # This program is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as 9 | # published by the Free Software Foundation, either version 3 of the 10 | # License, or (at your option) any later version. 11 | # 12 | # This program is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | EXTRA_DIST = README.md LICENSE Configuration example.fq test 18 | 19 | ACLOCAL_AMFLAGS = -I m4 20 | AM_CPPFLAGS = -I $(top_srcdir)/src 21 | 22 | CXXFLAGS = -O3 -DNDEBUG # default optimization on; override on cli 23 | 24 | TESTS = test/falco.test 25 | TEST_EXTENSIONS = .test 26 | 27 | bin_PROGRAMS = falco 28 | 29 | falco_CXXFLAGS = $(OPENMP_CXXFLAGS) $(AM_CXXFLAGS) 30 | falco_CPPFLAGS = -DPROGRAM_PATH=\"$(abspath $(top_srcdir))\" 31 | if ENABLE_HTS 32 | falco_CPPFLAGS += -DUSE_HTS 33 | endif 34 | 35 | falco_SOURCES = \ 36 | src/falco.cpp \ 37 | src/FastqStats.cpp \ 38 | src/HtmlMaker.cpp \ 39 | src/Module.cpp \ 40 | src/StreamReader.cpp \ 41 | src/FalcoConfig.cpp \ 42 | src/OptionParser.cpp \ 43 | src/smithlab_utils.cpp \ 44 | src/Module.hpp \ 45 | src/FastqStats.hpp \ 46 | src/HtmlMaker.hpp \ 47 | src/StreamReader.hpp \ 48 | src/FalcoConfig.hpp \ 49 | src/OptionParser.hpp \ 50 | src/smithlab_utils.hpp \ 51 | src/aux.hpp 52 | 53 | CLEANFILES = tests_build/test_data.tgz 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Falco: FastQC Alternative Code 2 | 3 | [![GitHub 
Downloads](https://img.shields.io/github/downloads/smithlabcode/falco/total.svg?style=social&logo=github&label=Download)](https://github.com/smithlabcode/falco/releases/latest) 4 | [![DOI](https://zenodo.org/badge/214499063.svg)](https://zenodo.org/badge/latestdoi/214499063) 5 | [![Install on conda](https://anaconda.org/bioconda/falco/badges/platforms.svg)](https://anaconda.org/bioconda/falco) 6 | [![Install on conda](https://anaconda.org/bioconda/falco/badges/license.svg)](https://anaconda.org/bioconda/falco) 7 | [![Install on conda](https://img.shields.io/conda/dn/bioconda/falco?color=red&label=conda%20downloads&style=flat-square)](https://anaconda.org/bioconda/falco) 8 | 9 | This program is an emulation of the popular 10 | [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc) 11 | software to check large sequencing reads for common problems. 12 | 13 | Installing falco 14 | ================ 15 | 16 | ## Installing through conda 17 | If you use [anaconda](https://anaconda.org) to manage your packages, 18 | and the `conda` binary is in your path, you can install the most 19 | recent release of `falco` by running 20 | ``` 21 | $ conda install -c bioconda falco 22 | ``` 23 | 24 | `falco` can be found inside the `bin` directory of your anaconda 25 | installer. 26 | 27 | ## Installing from source (code release) 28 | 29 | Compilation from source can be done by downloading a `falco` release 30 | from the [releases](https://github.com/smithlabcode/falco/releases) 31 | section above. Upon downloading the release (in `.tar.gz` or `.zip` 32 | format), and inflating the downloaded file to a directory 33 | (e.g. `falco`), move to the target directory the file was inflated 34 | (e.g. `cd falco`). You should see a `configure` file in it. 
In this 35 | directory, run 36 | 37 | ``` 38 | $ ./configure CXXFLAGS="-O3 -Wall" 39 | $ make all 40 | $ make install 41 | ``` 42 | If you wish to install the falco binaries in a specific directory, you can use 43 | the `--prefix` argument when running `./configure`, for instance: 44 | 45 | ``` 46 | $ ./configure CXXFLAGS="-O3 -Wall" --prefix=/path/to/installation/directory 47 | ``` 48 | 49 | The `falco` binary will be found in the `bin` directory inside the 50 | specified prefix. 51 | 52 | ## Installing from a cloned repository 53 | 54 | We strongly recommend using `falco` through stable releases as 55 | described above, as the latest commits might contain undocumented 56 | bugs. For the more advanced users who wish to test the most recent 57 | code, `falco` can be installed by first cloning the repository 58 | 59 | ``` 60 | $ git clone https://github.com/smithlabcode/falco.git 61 | $ cd falco 62 | ``` 63 | 64 | Once inside the generated repository directory, run 65 | ``` 66 | $ make all 67 | $ make install 68 | ``` 69 | 70 | This should create a `bin` directory inside the cloned repository 71 | containing `falco`. 72 | 73 | ### Required C++ dependencies 74 | 75 | [zlib](https://zlib.net) is required to read gzip compressed FASTQ 76 | files. It is usually installed by default on most UNIX computers and 77 | is part of the htslib setup, but it can also be installed with 78 | standard package managers like apt, brew or conda. 79 | 80 | On Ubuntu, zlib C++ libraries can be installed with `apt`: 81 | ``` 82 | $ sudo apt install zlib1g zlib1g-dev 83 | ``` 84 | 85 | ### Optional C++ dependencies 86 | 87 | [htslib](https://github.com/samtools/htslib) is required to process 88 | bam files. If not provided, bam files will be treated as unrecognized 89 | file formats.
90 | 91 | If htslib is installed, falco can be compiled with it by simply adding the 92 | `--enable-hts` flag to the configure command above: 93 | 94 | ``` 95 | $ ./configure CXXFLAGS="-O3 -Wall" --enable-hts 96 | ``` 97 | 98 | If `falco` was cloned from the repository, run the following commands 99 | to allow BAM file reading: 100 | 101 | ``` 102 | $ make HAVE_HTSLIB=1 all 103 | $ make HAVE_HTSLIB=1 install 104 | ``` 105 | 106 | If successfully compiled, `falco` can be used on BAM files the same way as it is 107 | used with fastq and sam files. 108 | 109 | Running falco 110 | ============= 111 | 112 | Run falco with the following command, where the `example.fq` file 113 | provided can be replaced with the path to any FASTQ file you want to run 114 | `falco` on 115 | ``` 116 | $ falco example.fq 117 | ``` 118 | 119 | This will generate three files in the same directory as the input fastq file: 120 | 121 | * `fastqc_data.txt` is a text file with a summary of the QC metrics 122 | 123 | * `fastqc_report.html` is the visual HTML report showing plots of the 124 | QC metrics summarized in the text summary. 125 | 126 | * `summary.txt`: A tab-separated file describing the 127 | pass/warn/fail result for each module. If multiple files are 128 | provided, only one summary file is generated, with one of the 129 | columns being the file name associated with each module result. 130 | 131 | The full list of arguments and options can be seen by running `falco` 132 | without any arguments, as well as `falco -?` or `falco --help`. This 133 | will print the following list: 134 | 135 | ``` 136 | Usage: falco [OPTIONS] ... 137 | Options: 138 | -h, --help Print this help file and exit 139 | -v, --version Print the version of the program and exit 140 | -o, --outdir Create all output files in the specified 141 | output directory. FALCO-SPECIFIC: If the 142 | directory does not exists, the program will 143 | create it.
If this option is not set then 144 | the output file for each sequence file is 145 | created in the same directory as the 146 | sequence file which was processed. 147 | --casava [IGNORED BY FALCO] Files come from raw 148 | casava output. Files in the same sample 149 | group (differing only by the group number) 150 | will be analysed as a set rather than 151 | individually. Sequences with the filter flag 152 | set in the header will be excluded from the 153 | analysis. Files must have the same names 154 | given to them by casava (including being 155 | gzipped and ending with .gz) otherwise they 156 | won't be grouped together correctly. 157 | --nano [IGNORED BY FALCO] Files come from nanopore 158 | sequences and are in fast5 format. In this 159 | mode you can pass in directories to process 160 | and the program will take in all fast5 files 161 | within those directories and produce a 162 | single output file from the sequences found 163 | in all files. 164 | --nofilter [IGNORED BY FALCO] If running with --casava 165 | then don't remove read flagged by casava as 166 | poor quality when performing the QC 167 | analysis. 168 | --extract [ALWAYS ON IN FALCO] If set then the zipped 169 | output file will be uncompressed in the same 170 | directory after it has been created. By 171 | default this option will be set if fastqc is 172 | run in non-interactive mode. 173 | -j, --java [IGNORED BY FALCO] Provides the full path to 174 | the java binary you want to use to launch 175 | fastqc. If not supplied then java is assumed 176 | to be in your path. 177 | --noextract [IGNORED BY FALCO] Do not uncompress the 178 | output file after creating it. You should 179 | set this option if you do not wish to 180 | uncompress the output when running in 181 | non-interactive mode. 182 | --nogroup Disable grouping of bases for reads >50bp. 183 | All reports will show data for every base in 184 | the read. WARNING: When using this option, 185 | your plots may end up a ridiculous size. 
You 186 | have been warned! 187 | --min_length [NOT YET IMPLEMENTED IN FALCO] Sets an 188 | artificial lower limit on the length of the 189 | sequence to be shown in the report. As long 190 | as you set this to a value greater or equal 191 | to your longest read length then this will 192 | be the sequence length used to create your 193 | read groups. This can be useful for making 194 | directly comaparable statistics from 195 | datasets with somewhat variable read 196 | lengths. 197 | -f, --format Bypasses the normal sequence file format 198 | detection and forces the program to use the 199 | specified format. Valid formats are bam, sam, 200 | bam_mapped, sam_mapped, fastq, fq, fastq.gz 201 | or fq.gz. 202 | -t, --threads [NOT YET IMPLEMENTED IN FALCO] Specifies the 203 | number of files which can be processed 204 | simultaneously. Each thread will be 205 | allocated 250MB of memory so you shouldn't 206 | run more threads than your available memory 207 | will cope with, and not more than 6 threads 208 | on a 32 bit machine [1] 209 | -c, --contaminants Specifies a non-default file which contains 210 | the list of contaminants to screen 211 | overrepresented sequences against. The file 212 | must contain sets of named contaminants in 213 | the form name[tab]sequence. Lines prefixed 214 | with a hash will be ignored. Default: 215 | Configuration/contaminant_list.txt 216 | -a, --adapters Specifies a non-default file which contains 217 | the list of adapter sequences which will be 218 | explicity searched against the library. The 219 | file must contain sets of named adapters in 220 | the form name[tab]sequence. Lines prefixed 221 | with a hash will be ignored. Default: 222 | Configuration/adapter_list.txt 223 | -l, --limits Specifies a non-default file which contains 224 | a set of criteria which will be used to 225 | determine the warn/error limits for the 226 | various modules. 
This file can also be used 227 | to selectively remove some modules from the 228 | output all together. The format needs to 229 | mirror the default limits.txt file found in 230 | the Configuration folder. Default: 231 | Configuration/limits.txt 232 | -k, --kmers [IGNORED BY FALCO AND ALWAYS SET TO 7] 233 | Specifies the length of Kmer to look for in 234 | the Kmer content module. Specified Kmer 235 | length must be between 2 and 10. Default 236 | length is 7 if not specified. 237 | -q, --quiet Supress all progress messages on stdout and 238 | only report errors. 239 | -d, --dir [IGNORED: FALCO DOES NOT CREATE TMP FILES] 240 | Selects a directory to be used for temporary 241 | files written when generating report images. 242 | Defaults to system temp directory if not 243 | specified. 244 | -s, -subsample [Falco only] makes falco faster (but 245 | possibly less accurate) by only processing 246 | reads that are multiple of this value (using 247 | 0-based indexing to number reads). [1] 248 | -b, -bisulfite [Falco only] reads are whole genome 249 | bisulfite sequencing, and more Ts and fewer 250 | Cs are therefore expected and will be 251 | accounted for in base content. 252 | -r, -reverse-complement [Falco only] The input is a 253 | reverse-complement. All modules will be 254 | tested by swapping A/T and C/G 255 | -skip-data [Falco only] Do not create FastQC data text 256 | file. 257 | -skip-report [Falco only] Do not create FastQC report 258 | HTML file. 259 | -skip-summary [Falco only] Do not create FastQC summary 260 | file 261 | -D, -data-filename [Falco only] Specify filename for FastQC 262 | data output (TXT). If not specified, it will 263 | be called fastq_data.txt in either the input 264 | file's directory or the one specified in the 265 | --output flag. Only available when running 266 | falco with a single input. 267 | -R, -report-filename [Falco only] Specify filename for FastQC 268 | report output (HTML). 
If not specified, it 269 | will be called fastq_report.html in either 270 | the input file's directory or the one 271 | specified in the --output flag. Only 272 | available when running falco with a single 273 | input. 274 | -S, -summary-filename [Falco only] Specify filename for the short 275 | summary output (TXT). If not specified, it 276 | will be called fastq_report.html in either 277 | the input file's directory or the one 278 | specified in the --output flag. Only 279 | available when running falco with a single 280 | input. 281 | -K, -add-call [Falco only] add the command call call to 282 | FastQC data output and FastQC report HTML 283 | (this may break the parse of fastqc_data.txt 284 | in programs that are very strict about the 285 | FastQC output format). 286 | 287 | Help options: 288 | -?, -help print this help message 289 | -about print about message 290 | 291 | PROGRAM: falco 292 | A high throughput sequence QC analysis tool 293 | ``` 294 | 295 | Citing falco 296 | ============ 297 | 298 | If `falco` was helpful for your research, you can cite us as follows: 299 | 300 | *de Sena Brandine G and Smith AD. Falco: high-speed FastQC emulation for 301 | quality control of sequencing data. F1000Research 2021, 8:1874 302 | (https://doi.org/10.12688/f1000research.21142.2)* 303 | 304 | **Please do not cite this manuscript if you used FastQC directly and not falco!** 305 | 306 | Copyright and License Information 307 | ================================= 308 | 309 | Copyright (C) 2019-2022 Guilherme de Sena Brandine and 310 | Andrew D. Smith 311 | 312 | Authors: Guilherme de Sena Brandine and Andrew D. Smith 313 | 314 | This is free software: you can redistribute it and/or modify it under 315 | the terms of the GNU General Public License as published by the Free 316 | Software Foundation, either version 3 of the License, or (at your 317 | option) any later version. 
318 | 319 | This software is distributed in the hope that it will be useful, but 320 | WITHOUT ANY WARRANTY; without even the implied warranty of 321 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 322 | General Public License for more details. 323 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Run 'autoreconf -i' to generate 'configure', 'Makefile.in', etc., 4 | # including in the subdirectories of falco src (for the git 5 | # submodules). 6 | # 7 | # The first time this is run on a new cloned git repo the configure 8 | # script will not be present, only the configure.ac and 9 | # Makefile.am. The rest must be generated by `autoreconf -i` and this 10 | # must happen in the `src/smithlab_cpp`, 11 | # `src/abismal/src/smithlab_cpp` and `src/abismal` subdirs. Running 12 | # `autoreconf -i` in some of these directories will move recursively 13 | # into others, but this is not guaranteed. As written, this script runs 14 | # `autoreconf -i` once, from the current directory only. 15 | # 16 | # If you are working with a distribution (file ending with ".tar.gz" 17 | # or similar) then this script should not be needed, and should not be 18 | # present, as all the files should already exist. You should only run 19 | # this script if you know what you are doing with autoreconf. 20 | # 21 | # This script takes no arguments. It only runs autoreconf when invoked 22 | # from the top of a git clone whose directory is named "falco". 23 | 24 | runautoreconf() { 25 | autoreconf -i; 26 | } 27 | 28 | if test -d .git && test "$(basename "${PWD}")" = "falco" 29 | then 30 | runautoreconf 31 | exit 0 32 | else 33 | echo " It seems you are either attempting to run this script " 34 | echo " from the wrong directory, or in a source tree that was " 35 | echo " not obtained by cloning the falco git repo. 
" 36 | echo " " 37 | echo " ./autogen.sh generates the configure script in the " 38 | echo " relevant subdirectories. Only run this if you know " 39 | echo " what you are doing with autoreconf and are simply " 40 | echo " avoiding doing that. If you just want to use the " 41 | echo " software, download a release and this script will " 42 | echo " not be needed. " 43 | exit 1 44 | fi 45 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | Benchmarking tools 2 | =================== 3 | 4 | This directory shows the script and one example output of how benchmarking was 5 | performed. the shell files in this directory can be run given an input 6 | directory, and will run the three compared tools, redirecting the `time` output 7 | to the `outs` directory. 8 | 9 | ### Downloading benchmark fastqs 10 | 11 | FASTQ files can be downloaded using the `fastq-dump` program in the [SRA 12 | Toolkit](https://www.ncbi.nlm.nih.gov/sra/docs/toolkitsoft) (alternatively 13 | available on [conda](https://anaconda.org/bioconda/sra-tools)). 
14 | 15 | With an active internet connection, the `fastq-dump` command in your `PATH` 16 | variable and at least 300GB of disk space, run the following to download the 17 | fastq files into the `tests/fastq` directory: 18 | ``` 19 | $ bash download_files.sh 20 | ``` 21 | ### QC software download links 22 | The following tools need to be installed to run the benchmarking: 23 | * [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc) 24 | * [fastp](https://github.com/OpenGene/fastp/releases) 25 | * [HTQC](https://sourceforge.net/projects/htqc) 26 | 27 | ### Command to run benchmarking 28 | Once files are downloaded and programs are installed and added to your local 29 | `PATH` variable, you can reproduce the benchmarking in the paper by running the 30 | following four commands: 31 | ``` 32 | $ ./run_all_falco_tests.sh 33 | $ ./run_all_fastp_tests.sh 34 | $ ./run_all_fastqc_tests.sh 35 | $ ./run_all_htqc_tests.sh 36 | ``` 37 | 38 | This will output the real, user and sys runtimes for each tool in each dataset. 39 | 40 | #### List of SRR accessions tested 41 | The URLs below link to the `.sra` files that can then be converted to 42 | FASTQ using the `fastq-dump` command. Details about each dataset can 43 | be found at https://trace.ncbi.nlm.nih.gov/Traces/sra/?run=[srr], 44 | where [srr] can be replaced by each accession number (e.g. 
45 | `?run=SRR1853178`) 46 | * [SRR10124060](https://sra-download.ncbi.nlm.nih.gov/traces/sra4/SRR/009886/SRR10124060) 47 | * [SRR10143153](https://sra-download.ncbi.nlm.nih.gov/traces/sra68/SRR/009905/SRR10143153) 48 | * [SRR3897196](https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-9/SRR3897196/SRR3897196.1) 49 | * [SRR9624732](https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-15/SRR9624732/SRR9624732.1) 50 | * [SRR1853178](https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos1/sra-pub-run-5/SRR1853178/SRR1853178.1) 51 | * [SRR6387347](https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-11/SRR6387347/SRR6387347.1) 52 | * [SRR891268](https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos1/sra-pub-run-5/SRR891268/SRR891268.1) 53 | * [SRR1772703](https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos1/sra-pub-run-2/SRR1772703/SRR1772703.1) 54 | * [SRR9878537](https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-15/SRR9878537/SRR9878537.1) 55 | * [SRR6059706](https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-11/SRR6059706/SRR6059706.1) 56 | 57 | The human genome nanopore file can be downloaded from the [Human Whole 58 | Genome Sequencing 59 | Project](https://github.com/nanopore-wgs-consortium/NA12878/blob/master/nanopore-human-genome/rel_3_4.md) 60 | (file 61 | [FAB49164](http://s3.amazonaws.com/nanopore-human-wgs/rel3-nanopore-wgs-4045668814-FAB49164.fastq.gz)) 62 | and extracted into the tests/fastq directory 63 | -------------------------------------------------------------------------------- /benchmark/download_files.sh: -------------------------------------------------------------------------------- 1 | files="SRR10124060 SRR10143153 SRR3897196 SRR9624732 SRR1853178 SRR6387347 2 | SRR891268 SRR1772703 SRR9878537 SRR6059706" 3 | mkdir -p tests/fastq 4 | for i in $files 5 | do 6 | echo "Downloading ${i}..." 
7 | fastq-dump --outdir tests/fastq --skip-technical --readids \ 8 | --read-filter pass --dumpbase --split-3 --clip \ 9 | ${i} 10 | done 11 | 12 | -------------------------------------------------------------------------------- /benchmark/outs/falco/README.md: -------------------------------------------------------------------------------- 1 | Falco results 2 | ============== 3 | Placeholder for falco results 4 | -------------------------------------------------------------------------------- /benchmark/outs/fastqc/README.md: -------------------------------------------------------------------------------- 1 | FastQC results 2 | =============== 3 | Placeholder for FastQC results 4 | -------------------------------------------------------------------------------- /benchmark/run_all_falco_tests.sh: -------------------------------------------------------------------------------- 1 | mkdir -p outs/falco 2 | for i in `ls tests/fastq` 3 | do 4 | a=`echo $(basename $i) | sed 's/.fastq//g'` 5 | echo "[$(date) - falco] $a" 6 | mkdir -p outs/falco/${a} 7 | time falco -o outs/falco/${a} tests/fastq/${a}.fastq \ 8 | 1>outs/falco/${a}/${a}.output \ 9 | 2>outs/falco/${a}/${a}.error 10 | done 11 | -------------------------------------------------------------------------------- /benchmark/run_all_fastp_tests.sh: -------------------------------------------------------------------------------- 1 | # change this directory to any directory with a set of fastq/fastq.gz/sam/bam 2 | # files to run the benchmarking in all of them 3 | mkdir -p outs/fastp 4 | for i in `ls tests/fastq` 5 | do 6 | # Remove the "-p" flag to run without overrepresentation 7 | a=`echo $(basename $i) | sed 's/.fastq//g'` 8 | mkdir -p outs/fastp/${a} 9 | echo "[$(date) - fastp] $a" 10 | time fastp -V -A -G -Q -L -w 1 -i tests/fastq/${a}.fastq \ 11 | -h outs/fastp/${a}/fastp_report.html \ 12 | -j outs/fastp/${a}/fastp_data.json \ 13 | 1>outs/fastp/${a}output_${a}.txt \ 14 | 2>outs/fastp/${a}/error_${a}.txt 15 | done 
16 | -------------------------------------------------------------------------------- /benchmark/run_all_fastqc_tests.sh: -------------------------------------------------------------------------------- 1 | mkdir -p outs/fastqc 2 | for i in `ls tests/fastq` 3 | do 4 | a=`echo $(basename $i) | sed 's/.fastq//g'` 5 | echo "[$(date) - fastqc] $a" 6 | mkdir -p outs/fastqc/${a} 7 | time fastqc -o outs/fastqc/${a} tests/fastq/${a}.fastq \ 8 | 1>outs/fastqc/${a}/${a}.output \ 9 | 2>outs/fastqc/${a}/${a}.error 10 | done 11 | -------------------------------------------------------------------------------- /benchmark/run_all_htqc_tests.sh: -------------------------------------------------------------------------------- 1 | mkdir -p outs/htqc 2 | for i in tests/fastq/test*.fastq 3 | do 4 | a=$(basename $i) 5 | echo $a 6 | time ht-stat -S -i $i -o outs/htqc \ 7 | 1>outs/falco/output_${a} \ 8 | 2>outs/falco/error_${a} 9 | done 10 | -------------------------------------------------------------------------------- /benchmark/tests/README.md: -------------------------------------------------------------------------------- 1 | Placeholder directory for test datasets 2 | ======================================== 3 | This directory can be used to download the SRR files. 4 | 5 | 6 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | dnl falco: quality control for sequencing read files 2 | dnl 3 | dnl Copyright (C) 2019-2024 Guilherme De Sena Brandine and 4 | dnl Andrew D. Smith 5 | dnl Authors: Guilherme De Sena Brandine, Andrew Smith 6 | dnl 7 | dnl This program is free software: you can redistribute it and/or 8 | dnl modify it under the terms of the GNU General Public License as 9 | dnl published by the Free Software Foundation, either version 3 of the 10 | dnl License, or (at your option) any later version. 
11 | dnl 12 | dnl This program is distributed in the hope that it will be useful, but 13 | dnl WITHOUT ANY WARRANTY; without even the implied warranty of 14 | dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | dnl General Public License for more details. 16 | 17 | AC_INIT([falco], [1.2.5], [andrewds@usc.edu], 18 | [falco], [https://github.com/smithlabcode/falco]) 19 | dnl the config.h is not currently #included in the source, and only 20 | dnl used to keep command lines short. 21 | AC_CONFIG_HEADERS([config.h]) 22 | AM_INIT_AUTOMAKE([subdir-objects foreign]) 23 | 24 | AC_CONFIG_MACRO_DIR([m4]) 25 | AC_LANG(C++) 26 | AC_PROG_CXX 27 | AX_CXX_COMPILE_STDCXX_17([noext], [mandatory]) 28 | AC_OPENMP([C++]) dnl make sure we have openmp for multi-core in falco 29 | 30 | dnl check for the Zlib library 31 | AC_CHECK_LIB([z], [zlibVersion], [], 32 | [AC_MSG_FAILURE([Zlib must be installed to build falco])], []) 33 | 34 | dnl check for HTSLib if requested 35 | hts_fail_msg=" 36 | 37 | Failed to locate HTSLib on your system. Please use the LDFLAGS and 38 | CPPFLAGS variables to specify the directories where the HTSLib library 39 | and headers can be found. 
40 | " 41 | AC_ARG_ENABLE([hts], 42 | [AS_HELP_STRING([--enable-hts], [Enable HTSLib @<:@yes@:>@])], 43 | [enable_hts=$enableval], [enable_hts=no]) dnl respect --disable-hts and --enable-hts=no 44 | AS_IF([test "x$enable_hts" = "xyes"], 45 | [AC_CHECK_LIB([hts], [hts_version], [], 46 | [AC_MSG_FAILURE([$hts_fail_msg])])] 47 | ) 48 | AM_CONDITIONAL([ENABLE_HTS], [test "x$enable_hts" = "xyes"]) 49 | 50 | AC_CONFIG_FILES([Makefile]) 51 | dnl make the test data files available in the build tree 52 | AC_CONFIG_LINKS([ 53 | test_build/md5sum.txt:test/md5sum.txt 54 | test_build/test_data.tgz:test/test_data.tgz 55 | ]) 56 | 57 | AC_OUTPUT 58 | -------------------------------------------------------------------------------- /documentation/README.md: -------------------------------------------------------------------------------- 1 | # falco documentation 2 | 3 | This is the documentation for falco that uses 4 | [mkdocs](https://mkdocs.readthedocs.io) to generate readthedocs pages. 5 | The public web version of this documentation is available at 6 | [falco.readthedocs.io](https://falco.readthedocs.io), but for 7 | users who wish to see the documentation on a web browser offline, you 8 | can build the documentation locally as described below. 9 | 10 | ### Dependencies 11 | 12 | To build the documentation locally, install mkdocs 13 | 14 | ``` 15 | pip install -U mkdocs 16 | ``` 17 | 18 | ### Local compilation 19 | 20 | Build the HTML documentation by running 21 | ``` 22 | mkdocs build 23 | ``` 24 | which will create a `site` directory where markdown files are 25 | converted to HTML 26 | 27 | Create a local host for the HTML documentation by running 28 | 29 | ``` 30 | mkdocs serve 31 | ``` 32 | 33 | This will create the documentation, usually at http://localhost:8000 . 
34 | -------------------------------------------------------------------------------- /documentation/docs/index.md: -------------------------------------------------------------------------------- 1 | # Falco: FastQC Alternative Code 2 | 3 | [![GitHub Downloads](https://img.shields.io/github/downloads/smithlabcode/falco/total.svg?style=social&logo=github&label=Download)](https://github.com/smithlabcode/falco/releases/latest) 4 | [![DOI](https://zenodo.org/badge/214499063.svg)](https://zenodo.org/badge/latestdoi/214499063) 5 | [![Install on conda](https://anaconda.org/bioconda/falco/badges/platforms.svg)](https://anaconda.org/bioconda/falco) 6 | [![Install on conda](https://anaconda.org/bioconda/falco/badges/license.svg)](https://anaconda.org/bioconda/falco) 7 | [![Install on conda](https://img.shields.io/conda/dn/bioconda/falco?color=red&label=conda%20downloads&style=flat-square)](https://anaconda.org/bioconda/falco) 8 | 9 | This program is an emulation of the popular 10 | [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc) 11 | software to check large sequencing reads for common problems. 12 | 13 | Installing falco 14 | ================ 15 | 16 | ## Installing through conda 17 | If you use [anaconda](https://anaconda.org) to manage your packages, 18 | and the `conda` binary is in your path, you can install the most 19 | recent release of `falco` by running 20 | ``` 21 | $ conda install -c bioconda falco 22 | ``` 23 | 24 | `falco` can be found inside the `bin` directory of your anaconda 25 | installer. 26 | 27 | ## Installing from source (code release) 28 | 29 | Compilation from source can be done by downloading a `falco` release 30 | from the [releases](https://github.com/smithlabcode/falco/releases) 31 | section above. Upon downloading the release (in `.tar.gz` or `.zip` 32 | format), and inflating the downloaded file to a directory 33 | (e.g. `falco`), move to the target directory the file was inflated 34 | (e.g. `cd falco`). 
You should see a `configure` file in it. In this 35 | directory, run 36 | 37 | ``` 38 | $ ./configure CXXFLAGS="-O3 -Wall" 39 | $ make all 40 | $ make install 41 | ``` 42 | If you wish to install the falco binaries in a specific directory, you can use 43 | the `--prefix` argument when running `./configure`, for instance: 44 | 45 | ``` 46 | $ ./configure CXXFLAGS="-O3 -Wall" --prefix=/path/to/installation/directory 47 | ``` 48 | 49 | The `falco` binary will be found in the `bin` directory inside the 50 | specified prefix. 51 | 52 | ## Installing from a cloned repository 53 | 54 | We strongly recommend using `falco` through stable releases as 55 | described above, as the latest commits might contain undocumented 56 | bugs. For the more advanced users who wish to test the most recent 57 | code, `falco` can be installed by first cloning the repository 58 | 59 | ``` 60 | $ git clone https://github.com/smithlabcode/falco.git 61 | $ cd falco 62 | ``` 63 | 64 | Once inside the generated repository directory, run 65 | ``` 66 | $ make all 67 | $ make install 68 | ``` 69 | 70 | This should create a `bin` directory inside the cloned repository 71 | containing `falco`. 72 | 73 | ### Required C++ dependencies 74 | 75 | [zlib](https://zlib.net) is required to read gzip compressed FASTQ 76 | files. It is usually installed by default on most UNIX computers and 77 | is part of the htslib setup, but it can also be installed with 78 | standard package managers like apt, brew or conda. 79 | 80 | On Ubuntu, zlib C++ libraries can be installed with `apt`: 81 | ``` 82 | $ sudo apt install zlib1g zlib1g-dev 83 | ``` 84 | 85 | ### Optional C++ dependencies 86 | 87 | [htslib](https://github.com/samtools/htslib) is required to process 88 | bam files. If not provided, bam files will be treated as unrecognized 89 | file formats. 
90 | 91 | If htslib is installed, falco can be compiled with it by simply replacing the 92 | configure command above with the `--enable-hts` flag: 93 | 94 | ``` 95 | $ ./configure CXXFLAGS="-O3 -Wall" --enable-hts 96 | ``` 97 | 98 | If `falco` was cloned from the repository, run the following commands 99 | to allow BAM file reading: 100 | 101 | ``` 102 | $ make HAVE_HTSLIB=1 all 103 | $ make HAVE_HTSLIB=1 install 104 | ``` 105 | 106 | If successfully compiled, `falco` can be used on BAM files the same way as it is 107 | used with fastq and sam files. 108 | 109 | Running falco 110 | ============= 111 | 112 | Run falco with the following command, where the `example.fq` file 113 | provided can be replaced with the path to any FASTQ file you want to run 114 | `falco` on: 115 | ``` 116 | $ falco example.fq 117 | ``` 118 | 119 | This will generate three files in the same directory as the input fastq file: 120 | 121 | * `fastqc_data.txt` is a text file with a summary of the QC metrics 122 | 123 | * `fastqc_report.html` is the visual HTML report showing plots of the 124 | QC metrics summarized in the text summary. 125 | 126 | * `summary.txt`: A tab-separated file describing the 127 | pass/warn/fail result for each module. If multiple files are 128 | provided, only one summary file is generated, with one of the 129 | columns being the file name associated with each module result. 130 | 131 | The full list of arguments and options can be seen by running `falco` 132 | without any arguments, as well as `falco -?` or `falco --help`. This 133 | will print the following list: 134 | 135 | ``` 136 | Usage: falco [OPTIONS] ... 137 | Options: 138 | -h, --help Print this help file and exit 139 | -v, --version Print the version of the program and exit 140 | -o, --outdir Create all output files in the specified 141 | output directory. FALCO-SPECIFIC: If the 142 | directory does not exists, the program will 143 | create it. 
If this option is not set then 144 | the output file for each sequence file is 145 | created in the same directory as the 146 | sequence file which was processed. 147 | --casava [IGNORED BY FALCO] Files come from raw 148 | casava output. Files in the same sample 149 | group (differing only by the group number) 150 | will be analysed as a set rather than 151 | individually. Sequences with the filter flag 152 | set in the header will be excluded from the 153 | analysis. Files must have the same names 154 | given to them by casava (including being 155 | gzipped and ending with .gz) otherwise they 156 | won't be grouped together correctly. 157 | --nano [IGNORED BY FALCO] Files come from nanopore 158 | sequences and are in fast5 format. In this 159 | mode you can pass in directories to process 160 | and the program will take in all fast5 files 161 | within those directories and produce a 162 | single output file from the sequences found 163 | in all files. 164 | --nofilter [IGNORED BY FALCO] If running with --casava 165 | then don't remove read flagged by casava as 166 | poor quality when performing the QC 167 | analysis. 168 | --extract [ALWAYS ON IN FALCO] If set then the zipped 169 | output file will be uncompressed in the same 170 | directory after it has been created. By 171 | default this option will be set if fastqc is 172 | run in non-interactive mode. 173 | -j, --java [IGNORED BY FALCO] Provides the full path to 174 | the java binary you want to use to launch 175 | fastqc. If not supplied then java is assumed 176 | to be in your path. 177 | --noextract [IGNORED BY FALCO] Do not uncompress the 178 | output file after creating it. You should 179 | set this option if you do not wish to 180 | uncompress the output when running in 181 | non-interactive mode. 182 | --nogroup Disable grouping of bases for reads >50bp. 183 | All reports will show data for every base in 184 | the read. WARNING: When using this option, 185 | your plots may end up a ridiculous size. 
You 186 | have been warned! 187 | --min_length [NOT YET IMPLEMENTED IN FALCO] Sets an 188 | artificial lower limit on the length of the 189 | sequence to be shown in the report. As long 190 | as you set this to a value greater or equal 191 | to your longest read length then this will 192 | be the sequence length used to create your 193 | read groups. This can be useful for making 194 | directly comaparable statistics from 195 | datasets with somewhat variable read 196 | lengths. 197 | -f, --format Bypasses the normal sequence file format 198 | detection and forces the program to use the 199 | specified format. Valid formats are bam, sam, 200 | bam_mapped, sam_mapped, fastq, fq, fastq.gz 201 | or fq.gz. 202 | -t, --threads [NOT YET IMPLEMENTED IN FALCO] Specifies the 203 | number of files which can be processed 204 | simultaneously. Each thread will be 205 | allocated 250MB of memory so you shouldn't 206 | run more threads than your available memory 207 | will cope with, and not more than 6 threads 208 | on a 32 bit machine [1] 209 | -c, --contaminants Specifies a non-default file which contains 210 | the list of contaminants to screen 211 | overrepresented sequences against. The file 212 | must contain sets of named contaminants in 213 | the form name[tab]sequence. Lines prefixed 214 | with a hash will be ignored. Default: 215 | Configuration/contaminant_list.txt 216 | -a, --adapters Specifies a non-default file which contains 217 | the list of adapter sequences which will be 218 | explicity searched against the library. The 219 | file must contain sets of named adapters in 220 | the form name[tab]sequence. Lines prefixed 221 | with a hash will be ignored. Default: 222 | Configuration/adapter_list.txt 223 | -l, --limits Specifies a non-default file which contains 224 | a set of criteria which will be used to 225 | determine the warn/error limits for the 226 | various modules. 
This file can also be used 227 | to selectively remove some modules from the 228 | output all together. The format needs to 229 | mirror the default limits.txt file found in 230 | the Configuration folder. Default: 231 | Configuration/limits.txt 232 | -k, --kmers [IGNORED BY FALCO AND ALWAYS SET TO 7] 233 | Specifies the length of Kmer to look for in 234 | the Kmer content module. Specified Kmer 235 | length must be between 2 and 10. Default 236 | length is 7 if not specified. 237 | -q, --quiet Supress all progress messages on stdout and 238 | only report errors. 239 | -d, --dir [IGNORED: FALCO DOES NOT CREATE TMP FILES] 240 | Selects a directory to be used for temporary 241 | files written when generating report images. 242 | Defaults to system temp directory if not 243 | specified. 244 | -s, -subsample [Falco only] makes falco faster (but 245 | possibly less accurate) by only processing 246 | reads that are multiple of this value (using 247 | 0-based indexing to number reads). [1] 248 | -b, -bisulfite [Falco only] reads are whole genome 249 | bisulfite sequencing, and more Ts and fewer 250 | Cs are therefore expected and will be 251 | accounted for in base content. 252 | -r, -reverse-complement [Falco only] The input is a 253 | reverse-complement. All modules will be 254 | tested by swapping A/T and C/G 255 | -skip-data [Falco only] Do not create FastQC data text 256 | file. 257 | -skip-report [Falco only] Do not create FastQC report 258 | HTML file. 259 | -skip-summary [Falco only] Do not create FastQC summary 260 | file 261 | -D, -data-filename [Falco only] Specify filename for FastQC 262 | data output (TXT). If not specified, it will 263 | be called fastq_data.txt in either the input 264 | file's directory or the one specified in the 265 | --output flag. Only available when running 266 | falco with a single input. 267 | -R, -report-filename [Falco only] Specify filename for FastQC 268 | report output (HTML). 
If not specified, it 269 | will be called fastq_report.html in either 270 | the input file's directory or the one 271 | specified in the --output flag. Only 272 | available when running falco with a single 273 | input. 274 | -S, -summary-filename [Falco only] Specify filename for the short 275 | summary output (TXT). If not specified, it 276 | will be called fastq_report.html in either 277 | the input file's directory or the one 278 | specified in the --output flag. Only 279 | available when running falco with a single 280 | input. 281 | -K, -add-call [Falco only] add the command call call to 282 | FastQC data output and FastQC report HTML 283 | (this may break the parse of fastqc_data.txt 284 | in programs that are very strict about the 285 | FastQC output format). 286 | 287 | Help options: 288 | -?, -help print this help message 289 | -about print about message 290 | 291 | PROGRAM: falco 292 | A high throughput sequence QC analysis tool 293 | ``` 294 | 295 | Citing falco 296 | ============ 297 | 298 | If `falco` was helpful for your research, you can cite us as follows: 299 | 300 | *de Sena Brandine G and Smith AD. Falco: high-speed FastQC emulation for 301 | quality control of sequencing data. F1000Research 2021, 8:1874 302 | (https://doi.org/10.12688/f1000research.21142.2)* 303 | 304 | **Please do not cite this manuscript if you used FastQC directly and not falco!** 305 | 306 | Copyright and License Information 307 | ================================= 308 | 309 | Copyright (C) 2019-2024 Guilherme de Sena Brandine and 310 | Andrew D. Smith 311 | 312 | Authors: Guilherme de Sena Brandine and Andrew D. Smith 313 | 314 | This is free software: you can redistribute it and/or modify it under 315 | the terms of the GNU General Public License as published by the Free 316 | Software Foundation, either version 3 of the License, or (at your 317 | option) any later version. 
318 | 319 | This software is distributed in the hope that it will be useful, but 320 | WITHOUT ANY WARRANTY; without even the implied warranty of 321 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 322 | General Public License for more details. 323 | -------------------------------------------------------------------------------- /documentation/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: falco 2 | strict: true 3 | 4 | theme: readthedocs 5 | nav: 6 | - Home: 'index.md' 7 | -------------------------------------------------------------------------------- /example.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/falco/f4f0e6ca35e262cbeffc81fdfc620b3413ecfe2c/example.bam -------------------------------------------------------------------------------- /m4/ax_cxx_check_lib.m4: -------------------------------------------------------------------------------- 1 | dnl @synopsis AX_CXX_CHECK_LIB(libname, functioname, action-if, action-if-not) 2 | dnl 3 | dnl The standard AC_CHECK_LIB can not test functions in namespaces. 4 | dnl Therefore AC_CHECK_LIB(cgicc, cgicc::Cgicc::getVersion) will always 5 | dnl fail. We need to decompose the functionname into a series of namespaces 6 | dnl where it gets declared so that it can be used for a link test. 7 | dnl 8 | dnl In the first version I did allow namespace::functionname to be a 9 | dnl reference to a void-argument global functionname (just wrapped in a 10 | dnl namespace) like its C counterparts would be - but in reality such 11 | dnl thing does not exist. The only global / static functions are always 12 | dnl made const-functions which is an attribute mangled along into the 13 | dnl library function export name. 
14 | dnl 15 | dnl The normal usage will ask for a test of a class-member function which 16 | dnl should be presented with a full function spec with arguments given in 17 | dnl parentheses following the function name - if the function to test for 18 | dnl does expect arguments then you should add default initial values in the 19 | dnl prototype (even if they do not exist originally, these are used only 20 | dnl locally to build a correct function call in the configure test script). 21 | dnl 22 | dnl In the current version if you do omit the parenthesis from the macro 23 | dnl argument then the macro will assume that you want to check for the 24 | dnl class name - which is really to check for default constructor being 25 | dnl exported from the given library name. 26 | dnl 27 | dnl EXAMPLE: 28 | dnl AX_CXX_CHECK_LIB(cgicc, [cgicc::HTTPCookie]) 29 | dnl AX_CXX_CHECK_LIB(cgicc, [cgicc::Cgicc::getVersion () const], 30 | dnl AX_CXX_CHECK_LIB(boost_regex, [boost::RegEx::Position (int i = 0) const]) 31 | dnl 32 | dnl Result: 33 | dnl Just as the usual AX_CXX_CHECK_LIB - defines HAVE_LIBCGICC 34 | dnl and adds the libraries to the default library path (and 35 | dnl uses internally the normal ac_check_lib cache symbol 36 | dnl like ac_cv_lib_cgicc_cgicc__Cgicc) 37 | dnl 38 | dnl Footnote: The C++ language is not good at creating stable library 39 | dnl interfaces at the binary level - a lot of functionality is usually being 40 | dnl given as inline functions plus there is hardly a chance to create opaque 41 | dnl types. Therefore most C++ library tests will only do compile tests using 42 | dnl the header files. Doing a check_lib is however good to check the link 43 | dnl dependency before hitting it as an error in the build later. 44 | dnl 45 | dnl @category C++ 46 | dnl @author Guido U. 
Draheim 47 | dnl @version 2006-12-18 48 | 49 | AC_DEFUN([AX_CXX_CHECK_LIB], 50 | [m4_ifval([$3], , [AH_CHECK_LIB([$1])])dnl 51 | AS_LITERAL_IF([$1], 52 | [AS_VAR_PUSHDEF([ac_Lib], [ac_cv_lib_$1_$2])], 53 | [AS_VAR_PUSHDEF([ac_Lib], [ac_cv_lib_$1''_$2])])dnl 54 | AC_CACHE_CHECK([for $2 in -l$1], ac_Lib, 55 | [ac_check_lib_save_LIBS=$LIBS 56 | LIBS="-l$1 $5 $LIBS" 57 | case "$2" 58 | in *::*::*\(*) 59 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 60 | namespace `echo "$2" | sed -e "s/::.*//"` 61 | { class `echo "$2" | sed -e "s/.*::\\(.*\\)::.*/\\1/" -e "s/(.*//"` 62 | { public: int `echo "$2" | sed -e "s/.*:://" -e "/(/!s/..*/&()/"`; 63 | }; 64 | } 65 | ],[`echo "$2" | sed -e "s/(.*//" -e "s/\\(.*\\)::\\(.*\\)/((\\1*)(0))->\\2/g"`()])], 66 | [AS_VAR_SET(ac_Lib, yes)], 67 | [AS_VAR_SET(ac_Lib, no)]) 68 | ;; *::*::*) 69 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 70 | namespace `echo "$2" | sed -e "s/::.*//"` 71 | { namespace `echo "$2" | sed -e "s/.*::\\(.*\\)::.*/\\1/"` 72 | { class `echo "$2" | sed -e "s/.*:://"` 73 | { public: `echo "$2" | sed -e "s/.*:://"` (); 74 | }; 75 | } 76 | } 77 | ],[new $2()])], 78 | [AS_VAR_SET(ac_Lib, yes)], 79 | [AS_VAR_SET(ac_Lib, no)]) 80 | ;; *::*\(*) 81 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 82 | class `echo "$2" | sed -e "s/\\(.*\\)::.*/\\1/" -e "s/(.*//"` 83 | { public: int `echo "$2" | sed -e "s/.*:://" -e "/(/!s/..*/&()/"`; 84 | }; 85 | ],[`echo "$2" | sed -e "s/(.*//" -e "s/\\(.*\\)::\\(.*\\)/((\\1*)(0))->\\2/g"`()])], 86 | [AS_VAR_SET(ac_Lib, yes)], 87 | [AS_VAR_SET(ac_Lib, no)]) 88 | ;; *::*) 89 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 90 | namespace `echo "$2" | sed -e "s/::.*//"` 91 | { class `echo "$2" | sed -e "s/.*:://"` 92 | { public: `echo "$2" | sed -e "s/.*:://"` (); 93 | }; 94 | } 95 | ],[new $2()])], 96 | [AS_VAR_SET(ac_Lib, yes)], 97 | [AS_VAR_SET(ac_Lib, no)]) 98 | ;; *) 99 | AC_LINK_IFELSE([AC_LANG_CALL([], [$2])], 100 | [AS_VAR_SET(ac_Lib, yes)], 101 | [AS_VAR_SET(ac_Lib, no)]) 102 | ;; esac 103 | LIBS=$ac_check_lib_save_LIBS]) 104 | 
AS_IF([test AS_VAR_GET(ac_Lib) = yes], 105 | [m4_default([$3], [AC_DEFINE_UNQUOTED(AS_TR_CPP(HAVE_LIB$1)) 106 | LIBS="-l$1 $LIBS" 107 | ])], 108 | [$4])dnl 109 | AS_VAR_POPDEF([ac_Lib])dnl 110 | ])# AX_CXX_CHECK_LIB 111 | -------------------------------------------------------------------------------- /m4/ax_cxx_compile_stdcxx.m4: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx.html 3 | # =========================================================================== 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_CXX_COMPILE_STDCXX(VERSION, [ext|noext], [mandatory|optional]) 8 | # 9 | # DESCRIPTION 10 | # 11 | # Check for baseline language coverage in the compiler for the specified 12 | # version of the C++ standard. If necessary, add switches to CXX and 13 | # CXXCPP to enable support. VERSION may be '11' (for the C++11 standard), 14 | # '14' (for the C++14 standard) or '17' (for the C++17 standard). 15 | # 16 | # The second argument, if specified, indicates whether you insist on an 17 | # extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g. 18 | # -std=c++11). If neither is specified, you get whatever works, with 19 | # preference for an extended mode. 20 | # 21 | # The third argument, if specified 'mandatory' or if left unspecified, 22 | # indicates that baseline support for the specified C++ standard is 23 | # required and that the macro should error out if no mode with that 24 | # support is found. If specified 'optional', then configuration proceeds 25 | # regardless, after defining HAVE_CXX${VERSION} if and only if a 26 | # supporting mode is found. 
27 | # 28 | # LICENSE 29 | # 30 | # Copyright (c) 2008 Benjamin Kosnik 31 | # Copyright (c) 2012 Zack Weinberg 32 | # Copyright (c) 2013 Roy Stogner 33 | # Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov 34 | # Copyright (c) 2015 Paul Norman 35 | # Copyright (c) 2015 Moritz Klammler 36 | # Copyright (c) 2016, 2018 Krzesimir Nowak 37 | # Copyright (c) 2019 Enji Cooper 38 | # 39 | # Copying and distribution of this file, with or without modification, are 40 | # permitted in any medium without royalty provided the copyright notice 41 | # and this notice are preserved. This file is offered as-is, without any 42 | # warranty. 43 | 44 | #serial 11 45 | 46 | dnl This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro 47 | dnl (serial version number 13). 48 | 49 | AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl 50 | m4_if([$1], [11], [ax_cxx_compile_alternatives="11 0x"], 51 | [$1], [14], [ax_cxx_compile_alternatives="14 1y"], 52 | [$1], [17], [ax_cxx_compile_alternatives="17 1z"], 53 | [m4_fatal([invalid first argument `$1' to AX_CXX_COMPILE_STDCXX])])dnl 54 | m4_if([$2], [], [], 55 | [$2], [ext], [], 56 | [$2], [noext], [], 57 | [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX])])dnl 58 | m4_if([$3], [], [ax_cxx_compile_cxx$1_required=true], 59 | [$3], [mandatory], [ax_cxx_compile_cxx$1_required=true], 60 | [$3], [optional], [ax_cxx_compile_cxx$1_required=false], 61 | [m4_fatal([invalid third argument `$3' to AX_CXX_COMPILE_STDCXX])]) 62 | AC_LANG_PUSH([C++])dnl 63 | ac_success=no 64 | 65 | m4_if([$2], [noext], [], [dnl 66 | if test x$ac_success = xno; then 67 | for alternative in ${ax_cxx_compile_alternatives}; do 68 | switch="-std=gnu++${alternative}" 69 | cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch]) 70 | AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch, 71 | $cachevar, 72 | [ac_save_CXX="$CXX" 73 | CXX="$CXX $switch" 74 | AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])], 75 | 
[eval $cachevar=yes], 76 | [eval $cachevar=no]) 77 | CXX="$ac_save_CXX"]) 78 | if eval test x\$$cachevar = xyes; then 79 | CXX="$CXX $switch" 80 | if test -n "$CXXCPP" ; then 81 | CXXCPP="$CXXCPP $switch" 82 | fi 83 | ac_success=yes 84 | break 85 | fi 86 | done 87 | fi]) 88 | 89 | m4_if([$2], [ext], [], [dnl 90 | if test x$ac_success = xno; then 91 | dnl HP's aCC needs +std=c++11 according to: 92 | dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf 93 | dnl Cray's crayCC needs "-h std=c++11" 94 | for alternative in ${ax_cxx_compile_alternatives}; do 95 | for switch in -std=c++${alternative} +std=c++${alternative} "-h std=c++${alternative}"; do 96 | cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch]) 97 | AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch, 98 | $cachevar, 99 | [ac_save_CXX="$CXX" 100 | CXX="$CXX $switch" 101 | AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])], 102 | [eval $cachevar=yes], 103 | [eval $cachevar=no]) 104 | CXX="$ac_save_CXX"]) 105 | if eval test x\$$cachevar = xyes; then 106 | CXX="$CXX $switch" 107 | if test -n "$CXXCPP" ; then 108 | CXXCPP="$CXXCPP $switch" 109 | fi 110 | ac_success=yes 111 | break 112 | fi 113 | done 114 | if test x$ac_success = xyes; then 115 | break 116 | fi 117 | done 118 | fi]) 119 | AC_LANG_POP([C++]) 120 | if test x$ax_cxx_compile_cxx$1_required = xtrue; then 121 | if test x$ac_success = xno; then 122 | AC_MSG_ERROR([*** A compiler with support for C++$1 language features is required.]) 123 | fi 124 | fi 125 | if test x$ac_success = xno; then 126 | HAVE_CXX$1=0 127 | AC_MSG_NOTICE([No compiler with C++$1 support was found]) 128 | else 129 | HAVE_CXX$1=1 130 | AC_DEFINE(HAVE_CXX$1,1, 131 | [define if the compiler supports basic C++$1 syntax]) 132 | fi 133 | AC_SUBST(HAVE_CXX$1) 134 | ]) 135 | 136 | 137 | dnl Test body for checking C++11 support 138 | 139 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_11], 140 | 
_AX_CXX_COMPILE_STDCXX_testbody_new_in_11 141 | ) 142 | 143 | 144 | dnl Test body for checking C++14 support 145 | 146 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14], 147 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 148 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 149 | ) 150 | 151 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_17], 152 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 153 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 154 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_17 155 | ) 156 | 157 | dnl Tests for new features in C++11 158 | 159 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[ 160 | 161 | // If the compiler admits that it is not ready for C++11, why torture it? 162 | // Hopefully, this will speed up the test. 163 | 164 | #ifndef __cplusplus 165 | 166 | #error "This is not a C++ compiler" 167 | 168 | #elif __cplusplus < 201103L 169 | 170 | #error "This is not a C++11 compiler" 171 | 172 | #else 173 | 174 | namespace cxx11 175 | { 176 | 177 | namespace test_static_assert 178 | { 179 | 180 | template 181 | struct check 182 | { 183 | static_assert(sizeof(int) <= sizeof(T), "not big enough"); 184 | }; 185 | 186 | } 187 | 188 | namespace test_final_override 189 | { 190 | 191 | struct Base 192 | { 193 | virtual ~Base() {} 194 | virtual void f() {} 195 | }; 196 | 197 | struct Derived : public Base 198 | { 199 | virtual ~Derived() override {} 200 | virtual void f() override {} 201 | }; 202 | 203 | } 204 | 205 | namespace test_double_right_angle_brackets 206 | { 207 | 208 | template < typename T > 209 | struct check {}; 210 | 211 | typedef check single_type; 212 | typedef check> double_type; 213 | typedef check>> triple_type; 214 | typedef check>>> quadruple_type; 215 | 216 | } 217 | 218 | namespace test_decltype 219 | { 220 | 221 | int 222 | f() 223 | { 224 | int a = 1; 225 | decltype(a) b = 2; 226 | return a + b; 227 | } 228 | 229 | } 230 | 231 | namespace test_type_deduction 232 | { 233 | 234 | template < typename T1, typename T2 > 235 | struct is_same 
236 | { 237 | static const bool value = false; 238 | }; 239 | 240 | template < typename T > 241 | struct is_same 242 | { 243 | static const bool value = true; 244 | }; 245 | 246 | template < typename T1, typename T2 > 247 | auto 248 | add(T1 a1, T2 a2) -> decltype(a1 + a2) 249 | { 250 | return a1 + a2; 251 | } 252 | 253 | int 254 | test(const int c, volatile int v) 255 | { 256 | static_assert(is_same::value == true, ""); 257 | static_assert(is_same::value == false, ""); 258 | static_assert(is_same::value == false, ""); 259 | auto ac = c; 260 | auto av = v; 261 | auto sumi = ac + av + 'x'; 262 | auto sumf = ac + av + 1.0; 263 | static_assert(is_same::value == true, ""); 264 | static_assert(is_same::value == true, ""); 265 | static_assert(is_same::value == true, ""); 266 | static_assert(is_same::value == false, ""); 267 | static_assert(is_same::value == true, ""); 268 | return (sumf > 0.0) ? sumi : add(c, v); 269 | } 270 | 271 | } 272 | 273 | namespace test_noexcept 274 | { 275 | 276 | int f() { return 0; } 277 | int g() noexcept { return 0; } 278 | 279 | static_assert(noexcept(f()) == false, ""); 280 | static_assert(noexcept(g()) == true, ""); 281 | 282 | } 283 | 284 | namespace test_constexpr 285 | { 286 | 287 | template < typename CharT > 288 | unsigned long constexpr 289 | strlen_c_r(const CharT *const s, const unsigned long acc) noexcept 290 | { 291 | return *s ? 
strlen_c_r(s + 1, acc + 1) : acc; 292 | } 293 | 294 | template < typename CharT > 295 | unsigned long constexpr 296 | strlen_c(const CharT *const s) noexcept 297 | { 298 | return strlen_c_r(s, 0UL); 299 | } 300 | 301 | static_assert(strlen_c("") == 0UL, ""); 302 | static_assert(strlen_c("1") == 1UL, ""); 303 | static_assert(strlen_c("example") == 7UL, ""); 304 | static_assert(strlen_c("another\0example") == 7UL, ""); 305 | 306 | } 307 | 308 | namespace test_rvalue_references 309 | { 310 | 311 | template < int N > 312 | struct answer 313 | { 314 | static constexpr int value = N; 315 | }; 316 | 317 | answer<1> f(int&) { return answer<1>(); } 318 | answer<2> f(const int&) { return answer<2>(); } 319 | answer<3> f(int&&) { return answer<3>(); } 320 | 321 | void 322 | test() 323 | { 324 | int i = 0; 325 | const int c = 0; 326 | static_assert(decltype(f(i))::value == 1, ""); 327 | static_assert(decltype(f(c))::value == 2, ""); 328 | static_assert(decltype(f(0))::value == 3, ""); 329 | } 330 | 331 | } 332 | 333 | namespace test_uniform_initialization 334 | { 335 | 336 | struct test 337 | { 338 | static const int zero {}; 339 | static const int one {1}; 340 | }; 341 | 342 | static_assert(test::zero == 0, ""); 343 | static_assert(test::one == 1, ""); 344 | 345 | } 346 | 347 | namespace test_lambdas 348 | { 349 | 350 | void 351 | test1() 352 | { 353 | auto lambda1 = [](){}; 354 | auto lambda2 = lambda1; 355 | lambda1(); 356 | lambda2(); 357 | } 358 | 359 | int 360 | test2() 361 | { 362 | auto a = [](int i, int j){ return i + j; }(1, 2); 363 | auto b = []() -> int { return '0'; }(); 364 | auto c = [=](){ return a + b; }(); 365 | auto d = [&](){ return c; }(); 366 | auto e = [a, &b](int x) mutable { 367 | const auto identity = [](int y){ return y; }; 368 | for (auto i = 0; i < a; ++i) 369 | a += b--; 370 | return x + identity(a + b); 371 | }(0); 372 | return a + b + c + d + e; 373 | } 374 | 375 | int 376 | test3() 377 | { 378 | const auto nullary = [](){ return 0; }; 379 | 
const auto unary = [](int x){ return x; }; 380 | using nullary_t = decltype(nullary); 381 | using unary_t = decltype(unary); 382 | const auto higher1st = [](nullary_t f){ return f(); }; 383 | const auto higher2nd = [unary](nullary_t f1){ 384 | return [unary, f1](unary_t f2){ return f2(unary(f1())); }; 385 | }; 386 | return higher1st(nullary) + higher2nd(nullary)(unary); 387 | } 388 | 389 | } 390 | 391 | namespace test_variadic_templates 392 | { 393 | 394 | template 395 | struct sum; 396 | 397 | template 398 | struct sum 399 | { 400 | static constexpr auto value = N0 + sum::value; 401 | }; 402 | 403 | template <> 404 | struct sum<> 405 | { 406 | static constexpr auto value = 0; 407 | }; 408 | 409 | static_assert(sum<>::value == 0, ""); 410 | static_assert(sum<1>::value == 1, ""); 411 | static_assert(sum<23>::value == 23, ""); 412 | static_assert(sum<1, 2>::value == 3, ""); 413 | static_assert(sum<5, 5, 11>::value == 21, ""); 414 | static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, ""); 415 | 416 | } 417 | 418 | // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae 419 | // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function 420 | // because of this. 421 | namespace test_template_alias_sfinae 422 | { 423 | 424 | struct foo {}; 425 | 426 | template 427 | using member = typename T::member_type; 428 | 429 | template 430 | void func(...) {} 431 | 432 | template 433 | void func(member*) {} 434 | 435 | void test(); 436 | 437 | void test() { func(0); } 438 | 439 | } 440 | 441 | } // namespace cxx11 442 | 443 | #endif // __cplusplus >= 201103L 444 | 445 | ]]) 446 | 447 | 448 | dnl Tests for new features in C++14 449 | 450 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[ 451 | 452 | // If the compiler admits that it is not ready for C++14, why torture it? 453 | // Hopefully, this will speed up the test. 
454 | 455 | #ifndef __cplusplus 456 | 457 | #error "This is not a C++ compiler" 458 | 459 | #elif __cplusplus < 201402L 460 | 461 | #error "This is not a C++14 compiler" 462 | 463 | #else 464 | 465 | namespace cxx14 466 | { 467 | 468 | namespace test_polymorphic_lambdas 469 | { 470 | 471 | int 472 | test() 473 | { 474 | const auto lambda = [](auto&&... args){ 475 | const auto istiny = [](auto x){ 476 | return (sizeof(x) == 1UL) ? 1 : 0; 477 | }; 478 | const int aretiny[] = { istiny(args)... }; 479 | return aretiny[0]; 480 | }; 481 | return lambda(1, 1L, 1.0f, '1'); 482 | } 483 | 484 | } 485 | 486 | namespace test_binary_literals 487 | { 488 | 489 | constexpr auto ivii = 0b0000000000101010; 490 | static_assert(ivii == 42, "wrong value"); 491 | 492 | } 493 | 494 | namespace test_generalized_constexpr 495 | { 496 | 497 | template < typename CharT > 498 | constexpr unsigned long 499 | strlen_c(const CharT *const s) noexcept 500 | { 501 | auto length = 0UL; 502 | for (auto p = s; *p; ++p) 503 | ++length; 504 | return length; 505 | } 506 | 507 | static_assert(strlen_c("") == 0UL, ""); 508 | static_assert(strlen_c("x") == 1UL, ""); 509 | static_assert(strlen_c("test") == 4UL, ""); 510 | static_assert(strlen_c("another\0test") == 7UL, ""); 511 | 512 | } 513 | 514 | namespace test_lambda_init_capture 515 | { 516 | 517 | int 518 | test() 519 | { 520 | auto x = 0; 521 | const auto lambda1 = [a = x](int b){ return a + b; }; 522 | const auto lambda2 = [a = lambda1(x)](){ return a; }; 523 | return lambda2(); 524 | } 525 | 526 | } 527 | 528 | namespace test_digit_separators 529 | { 530 | 531 | constexpr auto ten_million = 100'000'000; 532 | static_assert(ten_million == 100000000, ""); 533 | 534 | } 535 | 536 | namespace test_return_type_deduction 537 | { 538 | 539 | auto f(int& x) { return x; } 540 | decltype(auto) g(int& x) { return x; } 541 | 542 | template < typename T1, typename T2 > 543 | struct is_same 544 | { 545 | static constexpr auto value = false; 546 | }; 547 | 548 | 
template < typename T > 549 | struct is_same 550 | { 551 | static constexpr auto value = true; 552 | }; 553 | 554 | int 555 | test() 556 | { 557 | auto x = 0; 558 | static_assert(is_same::value, ""); 559 | static_assert(is_same::value, ""); 560 | return x; 561 | } 562 | 563 | } 564 | 565 | } // namespace cxx14 566 | 567 | #endif // __cplusplus >= 201402L 568 | 569 | ]]) 570 | 571 | 572 | dnl Tests for new features in C++17 573 | 574 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_17], [[ 575 | 576 | // If the compiler admits that it is not ready for C++17, why torture it? 577 | // Hopefully, this will speed up the test. 578 | 579 | #ifndef __cplusplus 580 | 581 | #error "This is not a C++ compiler" 582 | 583 | #elif __cplusplus < 201703L 584 | 585 | #error "This is not a C++17 compiler" 586 | 587 | #else 588 | 589 | #include 590 | #include 591 | #include 592 | 593 | namespace cxx17 594 | { 595 | 596 | namespace test_constexpr_lambdas 597 | { 598 | 599 | constexpr int foo = [](){return 42;}(); 600 | 601 | } 602 | 603 | namespace test::nested_namespace::definitions 604 | { 605 | 606 | } 607 | 608 | namespace test_fold_expression 609 | { 610 | 611 | template 612 | int multiply(Args... args) 613 | { 614 | return (args * ... * 1); 615 | } 616 | 617 | template 618 | bool all(Args... 
args) 619 | { 620 | return (args && ...); 621 | } 622 | 623 | } 624 | 625 | namespace test_extended_static_assert 626 | { 627 | 628 | static_assert (true); 629 | 630 | } 631 | 632 | namespace test_auto_brace_init_list 633 | { 634 | 635 | auto foo = {5}; 636 | auto bar {5}; 637 | 638 | static_assert(std::is_same, decltype(foo)>::value); 639 | static_assert(std::is_same::value); 640 | } 641 | 642 | namespace test_typename_in_template_template_parameter 643 | { 644 | 645 | template typename X> struct D; 646 | 647 | } 648 | 649 | namespace test_fallthrough_nodiscard_maybe_unused_attributes 650 | { 651 | 652 | int f1() 653 | { 654 | return 42; 655 | } 656 | 657 | [[nodiscard]] int f2() 658 | { 659 | [[maybe_unused]] auto unused = f1(); 660 | 661 | switch (f1()) 662 | { 663 | case 17: 664 | f1(); 665 | [[fallthrough]]; 666 | case 42: 667 | f1(); 668 | } 669 | return f1(); 670 | } 671 | 672 | } 673 | 674 | namespace test_extended_aggregate_initialization 675 | { 676 | 677 | struct base1 678 | { 679 | int b1, b2 = 42; 680 | }; 681 | 682 | struct base2 683 | { 684 | base2() { 685 | b3 = 42; 686 | } 687 | int b3; 688 | }; 689 | 690 | struct derived : base1, base2 691 | { 692 | int d; 693 | }; 694 | 695 | derived d1 {{1, 2}, {}, 4}; // full initialization 696 | derived d2 {{}, {}, 4}; // value-initialized bases 697 | 698 | } 699 | 700 | namespace test_general_range_based_for_loop 701 | { 702 | 703 | struct iter 704 | { 705 | int i; 706 | 707 | int& operator* () 708 | { 709 | return i; 710 | } 711 | 712 | const int& operator* () const 713 | { 714 | return i; 715 | } 716 | 717 | iter& operator++() 718 | { 719 | ++i; 720 | return *this; 721 | } 722 | }; 723 | 724 | struct sentinel 725 | { 726 | int i; 727 | }; 728 | 729 | bool operator== (const iter& i, const sentinel& s) 730 | { 731 | return i.i == s.i; 732 | } 733 | 734 | bool operator!= (const iter& i, const sentinel& s) 735 | { 736 | return !(i == s); 737 | } 738 | 739 | struct range 740 | { 741 | iter begin() const 742 | { 
743 | return {0}; 744 | } 745 | 746 | sentinel end() const 747 | { 748 | return {5}; 749 | } 750 | }; 751 | 752 | void f() 753 | { 754 | range r {}; 755 | 756 | for (auto i : r) 757 | { 758 | [[maybe_unused]] auto v = i; 759 | } 760 | } 761 | 762 | } 763 | 764 | namespace test_lambda_capture_asterisk_this_by_value 765 | { 766 | 767 | struct t 768 | { 769 | int i; 770 | int foo() 771 | { 772 | return [*this]() 773 | { 774 | return i; 775 | }(); 776 | } 777 | }; 778 | 779 | } 780 | 781 | namespace test_enum_class_construction 782 | { 783 | 784 | enum class byte : unsigned char 785 | {}; 786 | 787 | byte foo {42}; 788 | 789 | } 790 | 791 | namespace test_constexpr_if 792 | { 793 | 794 | template 795 | int f () 796 | { 797 | if constexpr(cond) 798 | { 799 | return 13; 800 | } 801 | else 802 | { 803 | return 42; 804 | } 805 | } 806 | 807 | } 808 | 809 | namespace test_selection_statement_with_initializer 810 | { 811 | 812 | int f() 813 | { 814 | return 13; 815 | } 816 | 817 | int f2() 818 | { 819 | if (auto i = f(); i > 0) 820 | { 821 | return 3; 822 | } 823 | 824 | switch (auto i = f(); i + 4) 825 | { 826 | case 17: 827 | return 2; 828 | 829 | default: 830 | return 1; 831 | } 832 | } 833 | 834 | } 835 | 836 | namespace test_template_argument_deduction_for_class_templates 837 | { 838 | 839 | template 840 | struct pair 841 | { 842 | pair (T1 p1, T2 p2) 843 | : m1 {p1}, 844 | m2 {p2} 845 | {} 846 | 847 | T1 m1; 848 | T2 m2; 849 | }; 850 | 851 | void f() 852 | { 853 | [[maybe_unused]] auto p = pair{13, 42u}; 854 | } 855 | 856 | } 857 | 858 | namespace test_non_type_auto_template_parameters 859 | { 860 | 861 | template 862 | struct B 863 | {}; 864 | 865 | B<5> b1; 866 | B<'a'> b2; 867 | 868 | } 869 | 870 | namespace test_structured_bindings 871 | { 872 | 873 | int arr[2] = { 1, 2 }; 874 | std::pair pr = { 1, 2 }; 875 | 876 | auto f1() -> int(&)[2] 877 | { 878 | return arr; 879 | } 880 | 881 | auto f2() -> std::pair& 882 | { 883 | return pr; 884 | } 885 | 886 | struct S 887 
| { 888 | int x1 : 2; 889 | volatile double y1; 890 | }; 891 | 892 | S f3() 893 | { 894 | return {}; 895 | } 896 | 897 | auto [ x1, y1 ] = f1(); 898 | auto& [ xr1, yr1 ] = f1(); 899 | auto [ x2, y2 ] = f2(); 900 | auto& [ xr2, yr2 ] = f2(); 901 | const auto [ x3, y3 ] = f3(); 902 | 903 | } 904 | 905 | namespace test_exception_spec_type_system 906 | { 907 | 908 | struct Good {}; 909 | struct Bad {}; 910 | 911 | void g1() noexcept; 912 | void g2(); 913 | 914 | template 915 | Bad 916 | f(T*, T*); 917 | 918 | template 919 | Good 920 | f(T1*, T2*); 921 | 922 | static_assert (std::is_same_v); 923 | 924 | } 925 | 926 | namespace test_inline_variables 927 | { 928 | 929 | template void f(T) 930 | {} 931 | 932 | template inline T g(T) 933 | { 934 | return T{}; 935 | } 936 | 937 | template<> inline void f<>(int) 938 | {} 939 | 940 | template<> int g<>(int) 941 | { 942 | return 5; 943 | } 944 | 945 | } 946 | 947 | } // namespace cxx17 948 | 949 | #endif // __cplusplus < 201703L 950 | 951 | ]]) 952 | -------------------------------------------------------------------------------- /m4/ax_cxx_compile_stdcxx_17.m4: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_17.html 3 | # ============================================================================= 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_CXX_COMPILE_STDCXX_17([ext|noext], [mandatory|optional]) 8 | # 9 | # DESCRIPTION 10 | # 11 | # Check for baseline language coverage in the compiler for the C++17 12 | # standard; if necessary, add switches to CXX and CXXCPP to enable 13 | # support. 14 | # 15 | # This macro is a convenience alias for calling the AX_CXX_COMPILE_STDCXX 16 | # macro with the version set to C++17. The two optional arguments are 17 | # forwarded literally as the second and third argument respectively. 
18 | # Please see the documentation for the AX_CXX_COMPILE_STDCXX macro for 19 | # more information. If you want to use this macro, you also need to 20 | # download the ax_cxx_compile_stdcxx.m4 file. 21 | # 22 | # LICENSE 23 | # 24 | # Copyright (c) 2015 Moritz Klammler 25 | # Copyright (c) 2016 Krzesimir Nowak 26 | # 27 | # Copying and distribution of this file, with or without modification, are 28 | # permitted in any medium without royalty provided the copyright notice 29 | # and this notice are preserved. This file is offered as-is, without any 30 | # warranty. 31 | 32 | #serial 2 33 | 34 | AX_REQUIRE_DEFINED([AX_CXX_COMPILE_STDCXX]) 35 | AC_DEFUN([AX_CXX_COMPILE_STDCXX_17], [AX_CXX_COMPILE_STDCXX([17], [$1], [$2])]) 36 | -------------------------------------------------------------------------------- /src/FalcoConfig.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2019-2022 Guilherme De Sena Brandine and 2 | * Andrew D. Smith 3 | * Authors: Guilherme De Sena Brandine, Andrew Smith 4 | * 5 | * This program is free software: you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation, either version 3 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License for more details. 
14 | */ 15 | 16 | #ifndef FALCO_CONFIG_HPP 17 | #define FALCO_CONFIG_HPP 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "aux.hpp" 25 | 26 | 27 | /************************************************************* 28 | ******************** CUSTOM CONFIGURATION ******************* 29 | *************************************************************/ 30 | 31 | // config from options, constants, magic numbers, etc 32 | struct FalcoConfig { 33 | static const std::string FalcoVersion; 34 | FalcoConfig(const int argc, const char **argv); 35 | 36 | /************************************************************ 37 | *************** FASTQC OPTION PARSER************************ 38 | ************************************************************/ 39 | bool casava; // files from raw casava output 40 | bool nanopore; // fast5 format 41 | bool nofilter; // if running with --casava flag 42 | bool extract; // if set the zipped file will be uncompressed 43 | bool nogroup; // disable grouping of bases for reads >50bp 44 | bool compressed; // whether or not to inflate file 45 | bool quiet; 46 | size_t read_step; // only process reads that are multiple of read_step 47 | size_t threads; // number of threads to read multiple files in parallel 48 | std::string call; // the function call 49 | std::string format; // force file format 50 | std::string contaminants_file; // custom contaminants file 51 | std::string adapters_file; // adapters file 52 | std::string limits_file; // file with limits and options and custom analyses 53 | static const std::string html_template; // the html for the template 54 | std::string tmpdir; // dir for temp files when generating report images 55 | 56 | // config on how to handle reads 57 | bool do_duplication, 58 | do_kmer, 59 | do_n_content, 60 | do_overrepresented, 61 | do_quality_base, 62 | do_sequence, 63 | do_gc_sequence, 64 | do_quality_sequence, 65 | do_tile, 66 | do_adapter, 67 | do_adapter_optimized, 68 | do_sequence_length; 69 
| 70 | /************************************************************ 71 | *************** FASTQC LIMITS ******************************* 72 | ************************************************************/ 73 | // These will become const bools in the stream reader 74 | std::unordered_map > limits; 76 | static const std::vector values_to_check; 77 | 78 | /*************** CONTAMINANTS *****************/ 79 | // below: first = name, second = seq 80 | std::vector > contaminants; 81 | 82 | /*************** ADAPTERS *********************/ 83 | // Name (eg: Illumina Small RNA adapter) 84 | std::vector adapter_names; 85 | 86 | // Actual string sequence (eg: ATTGCCACA) 87 | std::vector adapter_seqs; 88 | 89 | // two-bit hash of the sequence above 90 | std::vector adapter_hashes; 91 | 92 | size_t adapter_size; 93 | size_t shortest_adapter_size; 94 | /************************************************************ 95 | ******* ADDITIONAL INFORMATION ABOUT THE SAMPLE ************ 96 | ************************************************************/ 97 | bool is_bisulfite; 98 | bool is_reverse_complement; 99 | 100 | /*************** DEFINE FILE TYPE ************/ 101 | 102 | // IO 103 | bool is_sam, is_bam, is_fastq, is_fastq_gz; 104 | std::string filename; 105 | std::string filename_stripped; 106 | 107 | /*********** FUNCTIONS TO READ FILES *************/ 108 | void define_file_format(); 109 | void read_limits(); // populate limits hash map 110 | void read_adapters(); 111 | void read_contaminants_file(); 112 | 113 | void setup(); 114 | }; 115 | 116 | /************************************************************* 117 | ******************** ALL MAGIC NUMBERS ********************** 118 | *************************************************************/ 119 | namespace Constants { 120 | // log of a power of two, to use in bit shifting for fast index access 121 | // returns the log2 of a number if it is a power of two; any other input gives 122 | // a result that is not a logarithm (e.g. 0 yields 63, 12 yields 2), so pass powers of two only 123 | constexpr size_t 124 | log2exact(size_t v) { 
125 | return (63 - 126 | ((v & 0x00000000FFFFFFFF) ? 32 : 0) - 127 | ((v & 0x0000FFFF0000FFFF) ? 16 : 0) - 128 | ((v & 0x00FF00FF00FF00FF) ? 8 : 0) - 129 | ((v & 0x0F0F0F0F0F0F0F0F) ? 4 : 0) - 130 | ((v & 0x3333333333333333) ? 2 : 0) - 131 | ((v & 0x5555555555555555) ? 1 : 0)); 132 | } 133 | 134 | static const size_t kmer_size = 7; 135 | static const size_t max_adapters = 128; 136 | 137 | // number of bases for static allocation. 138 | static const size_t num_static_bases = 500; 139 | 140 | // Value to subtract quality characters to get the actual quality value 141 | static const size_t quality_zero = 33; // The ascii for the lowest quality 142 | 143 | // Power of two that comprises all possible quality values. Illumina gives 144 | // qualities from 0 to 40, which 64 would cover, but the value set below is 128 (NOTE(review): reason for the extra headroom is not stated here - confirm). Power of 145 | // two is to avoid double pointer jumps and to get indices with bit shifts. 146 | static const size_t num_quality_values = 128; 147 | 148 | // How many possible nucleotides (must be power of 2!) 
149 | static const size_t num_nucleotides = 4; // A = 00,C = 01,T = 10,G = 11 150 | 151 | /************* DUPLICATION ESTIMATES *************/ 152 | // Number of unique sequences to see before stopping counting sequences 153 | static const size_t unique_reads_stop_counting = 1e5; 154 | 155 | // Maximum read length to store the entire read in memory 156 | static const size_t unique_reads_max_length = 75; 157 | 158 | // Prefix size to cut if read length exceeds the value above 159 | static const size_t unique_reads_truncate = 50; 160 | 161 | /****Bit shifts as instructions for the std::arrays***/ 162 | // for matrices that count stats per nucleotide 163 | static const size_t bit_shift_base = log2exact(num_nucleotides); 164 | 165 | // for matrices that count stats for quality value 166 | static const size_t bit_shift_quality = log2exact(num_quality_values); 167 | 168 | // bit shift for adapters, log(128) = 7 169 | static const size_t bit_shift_adapter = log2exact(max_adapters); 170 | 171 | // we shift 14 bits when reading a kmer, two bits per base 172 | static const size_t bit_shift_kmer = bit_shift_base*kmer_size; 173 | 174 | // mask to get only the first 2*k bits of the sliding window 175 | static const size_t kmer_mask = (1ull << (bit_shift_kmer)) - 1; 176 | }; 177 | 178 | #endif 179 | -------------------------------------------------------------------------------- /src/FastqStats.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2019 Guilherme De Sena Brandine and 2 | * Andrew D. Smith 3 | * Authors: Guilherme De Sena Brandine, Andrew Smith 4 | * 5 | * This program is free software: you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation, either version 3 of the 8 | * License, or (at your option) any later version.
9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License for more details. 14 | */ 15 | 16 | #include "FastqStats.hpp" 17 | 18 | #include 19 | #include 20 | #include 21 | using std::string; 22 | using std::vector; 23 | using std::array; 24 | using std::unordered_map; 25 | using std::sort; 26 | using std::min; 27 | using std::max; 28 | using std::ostream; 29 | using std::pair; 30 | using std::transform; 31 | using std::toupper; 32 | using std::setprecision; 33 | 34 | // ADS: Defining const static integer class variables here for 35 | // correctness. Optimizer has been ignoring the issue. Hopefully it 36 | // still will when turned on, and allow non-optimized code for 37 | // debugging to compile. 38 | const size_t FastqStats::SHORT_READ_THRESHOLD; 39 | const size_t FastqStats::kBaseQuality; 40 | const size_t FastqStats::kNumQualityValues; 41 | const size_t FastqStats::kNumNucleotides; 42 | const size_t FastqStats::kDupUniqueCutoff; 43 | const size_t FastqStats::kDupReadMaxSize; 44 | const size_t FastqStats::kDupReadTruncateSize; 45 | const size_t FastqStats::kBitShiftNucleotide; 46 | const size_t FastqStats::kBitShiftQuality; 47 | const size_t FastqStats::kBitShiftAdapter; 48 | 49 | // To make the gc models static const 50 | static array 51 | make_gc_models () { 52 | array ans; 53 | for (size_t i = 0; i < FastqStats::SHORT_READ_THRESHOLD; ++i) { 54 | ans[i] = GCModel(i); 55 | } 56 | return ans; 57 | } 58 | 59 | /****************************************************************/ 60 | /******************** FASTQ STATS FUNCTIONS *********************/ 61 | /****************************************************************/ 62 | // Default constructor 63 | FastqStats::FastqStats() { 64 | lowest_char = std::numeric_limits::max(); 65 | encoding_offset = 0; 66 | 67 | total_bases = 0; 68 
| num_extra_bases = 0; 69 | total_gc = 0; 70 | num_reads = 0; 71 | empty_reads = 0; 72 | min_read_length = 0; 73 | max_read_length = 0; 74 | num_poor = 0; 75 | 76 | num_unique_seen = 0; 77 | count_at_limit = 0; 78 | 79 | // Initialize IO arrays 80 | base_count.fill(0); 81 | n_base_count.fill(0); 82 | read_length_freq.fill(0); 83 | quality_count.fill(0); 84 | gc_count.fill(0); 85 | position_quality_count.fill(0); 86 | pos_kmer_count.fill(0); 87 | pos_adapter_count.fill(0); 88 | kmer_count = vector(SHORT_READ_THRESHOLD*(Constants::kmer_mask + 1), 0); 89 | } 90 | 91 | // Initialize as many gc models as fast bases 92 | const array 93 | FastqStats::gc_models = make_gc_models(); 94 | 95 | // When we read new bases, dynamically allocate new space for their statistics 96 | void 97 | FastqStats::allocate_new_base(const bool ignore_tile) { 98 | for (size_t i = 0; i < kNumNucleotides; ++i) { 99 | long_base_count.push_back(0); 100 | } 101 | 102 | long_n_base_count.push_back(0); 103 | 104 | // space for quality boxplot 105 | for (size_t i = 0; i < kNumQualityValues; ++i) 106 | long_position_quality_count.push_back(0); 107 | 108 | long_read_length_freq.push_back(0); 109 | 110 | // space for tile quality in each position. 
111 | // 112 | if (!ignore_tile) { 113 | for (auto &v : tile_position_quality) { 114 | v.second.push_back(0); 115 | } 116 | for (auto &v : tile_position_count) { 117 | v.second.push_back(0); 118 | } 119 | } 120 | 121 | // Successfully allocated space for a new base 122 | ++num_extra_bases; 123 | } 124 | 125 | // Calculates all summary statistics and pass warn fails 126 | void 127 | FastqStats::summarize() { 128 | // Cumulative read length frequency 129 | size_t cumulative_sum = 0; 130 | for (size_t i = 0; i < max_read_length; ++i) { 131 | if (i < SHORT_READ_THRESHOLD) { 132 | cumulative_sum += read_length_freq[i]; 133 | if (read_length_freq[i] > 0) 134 | if (min_read_length == 0) 135 | min_read_length = i + 1; 136 | } 137 | else { 138 | cumulative_sum += long_read_length_freq[i - SHORT_READ_THRESHOLD]; 139 | if (long_read_length_freq[i - SHORT_READ_THRESHOLD] > 0) 140 | if (min_read_length == 0) 141 | min_read_length = i + 1; 142 | } 143 | } 144 | 145 | for (size_t i = 0; i < max_read_length; ++i) { 146 | if (i < SHORT_READ_THRESHOLD) { 147 | cumulative_read_length_freq[i] = cumulative_sum; 148 | cumulative_sum -= read_length_freq[i]; 149 | } 150 | else { 151 | long_cumulative_read_length_freq.push_back(cumulative_sum); 152 | cumulative_sum -= long_read_length_freq[i - SHORT_READ_THRESHOLD]; 153 | } 154 | } 155 | } 156 | 157 | 158 | void 159 | FastqStats::adjust_tile_maps_len() { 160 | for (auto it = begin(tile_position_quality); 161 | it != end(tile_position_quality); it++) { 162 | it->second.resize(max_read_length); // Always increase space 163 | } 164 | for (auto it = begin(tile_position_count); 165 | it != end(tile_position_count); it++) { 166 | it->second.resize(max_read_length); // Always increase space 167 | } 168 | } 169 | 170 | -------------------------------------------------------------------------------- /src/FastqStats.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2019 Guilherme De Sena 
Brandine and 2 | * Andrew D. Smith 3 | * Authors: Guilherme De Sena Brandine, Andrew Smith 4 | * 5 | * This program is free software: you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation, either version 3 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License for more details. 14 | */ 15 | 16 | #ifndef FASTQSTATS_HPP 17 | #define FASTQSTATS_HPP 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "FalcoConfig.hpp" 25 | // log of a power of two, to use in bit shifting for fast index access 26 | // returns the log2 of a number if it is a power of two; other inputs are 27 | // not handled (e.g. 0 yields 63), so only pass powers of two 28 | constexpr size_t 29 | log2exact(size_t v) { 30 | return (63 - 31 | ((v & 0x00000000FFFFFFFF) ? 32 : 0) - 32 | ((v & 0x0000FFFF0000FFFF) ? 16 : 0) - 33 | ((v & 0x00FF00FF00FF00FF) ? 8 : 0) - 34 | ((v & 0x0F0F0F0F0F0F0F0F) ? 4 : 0) - 35 | ((v & 0x3333333333333333) ? 2 : 0) - 36 | ((v & 0x5555555555555555) ? 
1 : 0)); 37 | } 38 | 39 | /******************* BEGIN COPY FROM FASTQC *****************/ 40 | struct GCModelValue { 41 | int percent; 42 | double increment; 43 | GCModelValue() { 44 | percent = 0; 45 | increment = 0.0; 46 | } 47 | }; 48 | struct GCModel { 49 | std::vector> models; 50 | GCModel() {} 51 | GCModel(const int read_length) { 52 | if (read_length == 0) 53 | return; 54 | 55 | // Number of GC counts that claim each percentage bin 56 | std::array claiming_counts; 57 | claiming_counts.fill(0); 58 | 59 | 60 | // Iterate over all possible gc counts (pos) 61 | for (int pos = 0; pos <= read_length; pos++) { 62 | double low_count = static_cast(pos) - 0.5; 63 | double high_count = static_cast(pos) + 0.5; 64 | 65 | if (low_count < 0) low_count = 0; 66 | if (high_count < 0) high_count = 0; 67 | if (high_count > read_length) high_count = read_length; 68 | if (low_count > read_length) low_count = read_length; 69 | 70 | int low_pct = (int)round(100 * low_count / 71 | static_cast(read_length)); 72 | int high_pct = (int)round(100 * high_count / 73 | static_cast(read_length)); 74 | 75 | for(int p = low_pct; p <= high_pct; p++) { 76 | claiming_counts[p]++; 77 | } 78 | } 79 | 80 | // We now do a second pass to make up the model using the weightings 81 | // we calculated previously. 
82 | for (int pos = 0; pos <= read_length; pos++) { 83 | double low_count = static_cast(pos) - 0.5; 84 | double high_count = static_cast(pos) + 0.5; 85 | 86 | if (low_count < 0) low_count = 0; 87 | if (high_count < 0) high_count = 0; 88 | if (high_count > read_length) high_count = read_length; 89 | if (low_count > read_length) low_count = read_length; 90 | 91 | // Percentage bins covered by this gc count 92 | int low_pct = (int)round((100 * low_count) / 93 | static_cast(read_length)); 94 | 95 | int high_pct = (int)round((100 * high_count) / 96 | static_cast(read_length)); 97 | 98 | // Add one model value per covered percentage bin 99 | models.push_back( 100 | std::vector(high_pct - low_pct + 1, GCModelValue()) 101 | ); 102 | 103 | // populates the increment in each bin 104 | for (int p = low_pct; p <= high_pct; ++p) { 105 | models[pos][p - low_pct].percent = p; 106 | models[pos][p - low_pct].increment = 107 | 1.0 / static_cast(claiming_counts[p]); 108 | } 109 | } 110 | } 111 | }; 112 | 113 | /********************** END COPY FROM FASTQC *************/ 114 | 115 | /************************************************************* 116 | ******************** FASTQ STATS **************************** 117 | *************************************************************/ 118 | 119 | struct FastqStats { 120 | // number of bases for static allocation. 121 | static const size_t SHORT_READ_THRESHOLD = 500; 122 | 123 | // Value to subtract quality characters to get the actual quality value 124 | static const size_t kBaseQuality = 33; // The ascii for the lowest quality 125 | 126 | // Power of two large enough to index every quality value; it is set to 128. 127 | // A power of two is used to avoid double pointer jumps and to get indices 128 | // with bit shifts. 129 | static const size_t kNumQualityValues = 128; 130 | 131 | // How many possible nucleotides (must be power of 2!) 
132 | static const size_t kNumNucleotides = 4; // A = 00,C = 01,T = 10,G = 11 133 | 134 | /************* DUPLICATION ESTIMATES *************/ 135 | // Number of unique sequences to see before stopping counting sequences 136 | static const size_t kDupUniqueCutoff = 1e5; 137 | 138 | // Maximum read length to store the entire read in memory 139 | static const size_t kDupReadMaxSize = 75; 140 | 141 | // Prefix size to cut if read length exceeds the value above 142 | static const size_t kDupReadTruncateSize = 50; 143 | 144 | // Bit shifts as instructions for the std::arrays 145 | static const size_t kBitShiftNucleotide = log2exact(kNumNucleotides); 146 | static const size_t kBitShiftQuality = log2exact(kNumQualityValues); 147 | 148 | 149 | /************ ADAPTER CONSTANTS **********/ 150 | // bit shift for adapters, log2(Constants::max_adapters) = log2(128) = 7 151 | static const size_t kBitShiftAdapter = log2exact(Constants::max_adapters); 152 | 153 | 154 | public: 155 | /*********** SINGLE NUMBERS FROM THE ENTIRE FASTQ ****************/ 156 | 157 | // lowest quality char to infer encoding 158 | char lowest_char; 159 | char encoding_offset; 160 | 161 | // Number of unique sequences seen thus far 162 | size_t num_unique_seen; 163 | 164 | // How many reads were processed before num_unique_seen = kDupUniqueCutoff 165 | size_t count_at_limit; 166 | 167 | size_t total_bases; // sum of all bases in all reads 168 | size_t num_reads; // total number of lines read 169 | size_t empty_reads; 170 | size_t min_read_length; // minimum read length seen 171 | size_t max_read_length; // maximum read length seen 172 | size_t num_poor; // reads whose average quality was <= poor 173 | size_t num_extra_bases; // number of bases outside of buffer 174 | size_t total_gc; // sum of all G+C bases in all reads 175 | 176 | // Pre-calculated GC model increments 177 | static const std::array gc_models; 178 | 179 | /********************************************************* 180 | *********** METRICS COLLECTED DURING IO 
***************** 181 | *********************************************************/ 182 | /*********** PER BASE METRICS ****************/ 183 | 184 | // counts the number of bases in every read position 185 | std::array base_count; // ATGC 186 | std::array n_base_count; // N 187 | 188 | /*********** PER QUALITY VALUE METRICS ****************/ 189 | // Counts of quality in each base position 190 | std::array position_quality_count; 191 | 192 | // Counts of average quality (truncated) per sequence 193 | std::array quality_count; 194 | 195 | /*********** PER GC VALUE METRICS ****************/ 196 | // histogram of GC fraction in each read from 0 to 100% 197 | std::array gc_count; 198 | 199 | /*********** PER READ METRICS ***************/ 200 | // Distribution of read lengths 201 | std::array read_length_freq; 202 | std::array cumulative_read_length_freq; 203 | 204 | /*********** PER TILE SEQUENCE QUALITY OVER SEQUENCES ********/ 205 | std::unordered_map > tile_position_quality; 206 | std::unordered_map > tile_position_count; 207 | 208 | /*********** SLOW DATA STRUCTURES FOR LONGER READS ************/ 209 | // Leftover memory using dynamic allocation 210 | std::vector long_base_count; 211 | std::vector long_n_base_count; 212 | std::vector long_position_quality_count; 213 | std::vector long_read_length_freq; 214 | std::vector long_cumulative_read_length_freq; 215 | 216 | /********** KMER FREQUENCY ****************/ 217 | // A 4^K * SHORT_READ_THRESHOLD std::vector to count all possible kmers 218 | std::vector kmer_count; 219 | 220 | // How many kmers were counted in each position 221 | std::array pos_kmer_count; 222 | 223 | // How many adapters were counted in each position 224 | std::array pos_adapter_count; 225 | 226 | /*********** DUPLICATION ******************/ 227 | // First 100k unique sequences and how often they were seen 228 | std::unordered_map sequence_count; 229 | 230 | /**************** FUNCTIONS ****************************/ 231 | // Default constructor 
that zeros everything 232 | FastqStats(); 233 | 234 | // Allocation of more read positions 235 | void allocate_new_base(const bool ignore_tile); 236 | 237 | void summarize(); 238 | 239 | void adjust_tile_maps_len(); 240 | 241 | // Given an input fastqc_data.txt file, populate the statistics with it 242 | void read(std::istream &is); 243 | }; 244 | #endif 245 | -------------------------------------------------------------------------------- /src/HtmlMaker.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2019 Guilherme De Sena Brandine and 2 | * Andrew D. Smith 3 | * Authors: Guilherme De Sena Brandine, Andrew Smith 4 | * 5 | * This program is free software: you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation, either version 3 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License for more details. 
14 | */ 15 | 16 | #include "HtmlMaker.hpp" 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | using std::ostringstream; 24 | using std::string; 25 | using std::vector; 26 | using std::sort; 27 | using std::ifstream; 28 | using std::runtime_error; 29 | using std::chrono::system_clock; 30 | using std::min; 31 | 32 | 33 | void 34 | HtmlMaker::put_data(const string &placeholder, 35 | const string &data) { 36 | auto pos = html_boilerplate.find(placeholder); 37 | // Placeholder not found 38 | if (pos == string::npos) { 39 | throw runtime_error("placeholder not found: " + placeholder); 40 | } 41 | 42 | // at least one placeholder found 43 | while (pos != string::npos) { 44 | html_boilerplate.replace(pos, placeholder.size(), data); 45 | pos = html_boilerplate.find(placeholder, pos + 1); 46 | } 47 | } 48 | 49 | // Comments out html pieces if analyses were skipped 50 | void 51 | HtmlMaker::put_comment(string &comment_begin, 52 | string &comment_end, 53 | bool done) { 54 | // put html comments if analysis was skipped 55 | if (!done) { 56 | put_data(comment_begin, ""); 58 | } 59 | 60 | // otherwise delete placeholder 61 | else { 62 | put_data(comment_begin, ""); 63 | put_data(comment_end, ""); 64 | } 65 | } 66 | 67 | void 68 | HtmlMaker::put_file_details(const FalcoConfig &falco_config) { 69 | // Put file name in filename placeholder 70 | put_data("{{filename}}", 71 | falco_config.filename_stripped); 72 | 73 | // Put date on date placeholder 74 | auto tmp = system_clock::to_time_t(system_clock::now()); 75 | string time_fmt = string(ctime(&tmp)); 76 | put_data("{{date}}", time_fmt); 77 | } 78 | 79 | HtmlMaker::HtmlMaker() { 80 | html_boilerplate = FalcoConfig::html_template; 81 | } 82 | 83 | -------------------------------------------------------------------------------- /src/HtmlMaker.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2019 Guilherme De Sena Brandine and 2 | * Andrew D. 
Smith 3 | * Authors: Guilherme De Sena Brandine, Andrew Smith 4 | * 5 | * This program is free software: you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation, either version 3 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License for more details. 14 | */ 15 | 16 | #ifndef HTMLMAKER_HPP 17 | #define HTMLMAKER_HPP 18 | 19 | #include 20 | #include 21 | 22 | #include "FalcoConfig.hpp" 23 | #include "FastqStats.hpp" 24 | 25 | /*******************************************************/ 26 | /*************** HTML MAKER ****************************/ 27 | /*******************************************************/ 28 | class HtmlMaker { 29 | public: 30 | std::string html_boilerplate; 31 | HtmlMaker(); 32 | // Replace every occurrence of placeholder with data (throws if it is absent) 33 | void put_data(const std::string &placeholder, const std::string &data); 34 | 35 | // Comment out the section (done == false) or just remove the placeholders 36 | void put_comment(std::string &comment_begin, 37 | std::string &comment_end, 38 | bool done); 39 | 40 | // Fill in the {{filename}} and {{date}} placeholders 41 | void put_file_details(const FalcoConfig &falco_config); 42 | }; 43 | #endif 44 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2018-2019 Andrew D. Smith 2 | # 3 | # Authors: Andrew D. Smith 4 | # 5 | # This file is part of ABISMAL. 6 | # 7 | # ABISMAL is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 
#
# ABISMAL is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
# License for more details.

# Builds the falco binary from the sources in this directory.
#
# Variables read from the environment / command line:
#   SRC_ROOT     used for PROGRAM_PATH and as install prefix
#                ($(SRC_ROOT)/bin); it is not defined in this file
#   HAVE_HTSLIB  if defined, compile with -DUSE_HTS and link -lhts
#   DEBUG        if defined, build with -g instead of -O3

PROGS = falco

CXX = g++
CXXFLAGS = -Wall -std=c++11
CPPFLAGS = -DPROGRAM_PATH=\"$(SRC_ROOT)\"
LDLIBS = -lz
OPTFLAGS = -O3
DEBUGFLAGS = -g

ifdef HAVE_HTSLIB
CPPFLAGS += -DUSE_HTS
LDLIBS += -lhts
endif

ifdef DEBUG
CXXFLAGS += $(DEBUGFLAGS)
else
CXXFLAGS += $(OPTFLAGS)
endif

# None of these targets produce a file of the same name; declaring them
# phony keeps a stray file called "all", "install" or "clean" from
# silently disabling the target.
.PHONY: all install clean

all: $(PROGS)

install: $(PROGS)
	@mkdir -p $(SRC_ROOT)/bin
	@install -m 755 $(PROGS) $(SRC_ROOT)/bin

# every object depends on its source and its matching header
%.o: %.cpp %.hpp
	$(CXX) $(CXXFLAGS) -c -o $@ $< $(CPPFLAGS)

$(PROGS): FalcoConfig.o FastqStats.o HtmlMaker.o Module.o OptionParser.o \
	smithlab_utils.o StreamReader.o

# link a program from its own .cpp plus the objects listed above
%: %.cpp
	$(CXX) $(CXXFLAGS) -o $@ $^ $(CPPFLAGS) $(LDLIBS)

clean:
	@-rm -f $(PROGS) *.o *.so *.a *~
15 | */ 16 | 17 | 18 | #ifndef _MODULE_CPP 19 | #define _MODULE_CPP 20 | #include "FastqStats.hpp" 21 | #include "FalcoConfig.hpp" 22 | #include "HtmlMaker.hpp" 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | /* base groups for longer reads, copied from FastQC*/ 32 | struct BaseGroup { 33 | size_t start,end; 34 | BaseGroup (size_t _start, size_t _end) : start(_start), end(_end) {} 35 | }; 36 | 37 | 38 | class Module { 39 | private: 40 | const std::string module_name; 41 | public: 42 | // avoid writing things prior to summarizing 43 | bool summarized; 44 | 45 | // The module name displayed in outputs and html 46 | // GS TODO: automate placing it in html too 47 | 48 | // the module name lowercased without spaces 49 | std::string placeholder; 50 | 51 | // the placeholder in the html for the data the module generates 52 | std::string placeholder_data; 53 | 54 | // the placeholders in the html that open and close the comment 55 | // (placeholder_cs: comment start, placeholder_ce: comment end) 56 | std::string placeholder_cs; 57 | std::string placeholder_ce; 58 | 59 | // placeholder for grade 60 | std::string placeholder_grade; 61 | 62 | // placeholder for module name 63 | std::string placeholder_name; 64 | 65 | // pass warn fail 66 | std::string grade; 67 | 68 | std::string html_data; 69 | Module(const std::string &_module_name); 70 | virtual ~Module() = 0; 71 | 72 | /*********************************************/ 73 | /*****Abstract functions to be implemented****/ 74 | /*********************************************/ 75 | 76 | // Summarize the module 77 | virtual void summarize_module(FastqStats &stats) = 0; 78 | 79 | // Decide if it's a pass/warn/fail 80 | virtual void make_grade() = 0; 81 | 82 | // write long summary 83 | virtual void write_module(std::ostream &os) = 0; 84 | virtual std::string make_html_data() = 0; 85 | 86 | /*********************************************/ 87 | /**************Visible functions**************/ 88 | 
/*********************************************/ 89 | // Summarizes and registers that it summarized 90 | void summarize(FastqStats &stats); 91 | 92 | // Write the module in the FastQC standard, starting with title, 93 | // pass/warn/fail and then ending with >>END_MODULE 94 | void write(std::ostream &os); 95 | 96 | // write short summary 97 | void write_short_summary(std::ostream &os, const std::string &filename); 98 | 99 | // Put html data 100 | void put_data_on_html(HtmlMaker &html_maker); 101 | }; 102 | 103 | class ModuleBasicStatistics : public Module { 104 | public: 105 | bool is_nanopore; 106 | std::string file_type; 107 | std::string file_encoding; 108 | std::string filename_stripped; 109 | size_t avg_read_length; 110 | size_t avg_gc; 111 | size_t num_poor; 112 | size_t min_read_length; 113 | size_t max_read_length; 114 | size_t total_sequences; 115 | static const std::string module_name; 116 | ModuleBasicStatistics(const FalcoConfig &config); 117 | ~ModuleBasicStatistics() {} 118 | void summarize_module(FastqStats &stats); 119 | void make_grade(); 120 | void write_module(std::ostream &os); 121 | std::string make_html_data(); 122 | 123 | void read_data_line(const std::string &line); 124 | }; 125 | 126 | class ModulePerBaseSequenceQuality : public Module { 127 | private: 128 | // from FastQC: whether to group bases 129 | bool do_group; 130 | size_t num_bases; 131 | size_t num_groups; 132 | // grade criteria 133 | size_t base_lower_warn, 134 | base_lower_error, 135 | base_median_warn, 136 | base_median_error; 137 | size_t num_warn, num_error; 138 | std::vector group_mean; 139 | std::vector group_ldecile, 140 | group_lquartile, 141 | group_median, 142 | group_uquartile, 143 | group_udecile; 144 | std::vector base_groups; 145 | 146 | public: 147 | static const std::string module_name; 148 | ModulePerBaseSequenceQuality(const FalcoConfig &config); 149 | ~ModulePerBaseSequenceQuality() {} 150 | void summarize_module(FastqStats &stats); 151 | void make_grade(); 152 
| void write_module(std::ostream &os); 153 | void read_data_line(const std::string &line); 154 | std::string make_html_data(); 155 | }; 156 | 157 | class ModulePerTileSequenceQuality : public Module { 158 | private: 159 | double grade_warn, grade_error; 160 | size_t max_read_length; 161 | std::unordered_map> tile_position_quality; 162 | std::vector tiles_sorted; 163 | public: 164 | static const std::string module_name; 165 | ModulePerTileSequenceQuality(const FalcoConfig &config); 166 | ~ModulePerTileSequenceQuality() {} 167 | void summarize_module(FastqStats &stats); 168 | void make_grade(); 169 | void write_module(std::ostream &os); 170 | std::string make_html_data(); 171 | }; 172 | 173 | class ModulePerSequenceQualityScores : public Module { 174 | private: 175 | size_t mode_val; 176 | size_t mode_ind; 177 | size_t offset; 178 | std::array quality_count; 179 | // grade criteria 180 | size_t mode_warn; 181 | size_t mode_error; 182 | public: 183 | static const std::string module_name; 184 | ModulePerSequenceQualityScores(const FalcoConfig &config); 185 | ~ModulePerSequenceQualityScores() {} 186 | void summarize_module(FastqStats &stats); 187 | void make_grade(); 188 | void write_module(std::ostream &os); 189 | std::string make_html_data(); 190 | }; 191 | 192 | class ModulePerBaseSequenceContent : public Module { 193 | private: 194 | bool do_group; 195 | std::vector a_pct, c_pct, t_pct, g_pct; 196 | double max_diff; 197 | size_t num_bases; 198 | 199 | // flag as to whether or not dataset is WGBS (whole-genome bisulfite sequencing) 200 | bool is_bisulfite; 201 | 202 | // if so we have to test T vs C instead of A vs G 203 | bool is_reverse_complement; 204 | 205 | // for grade 206 | double sequence_error, sequence_warn; 207 | 208 | size_t num_groups; 209 | std::vector base_groups; 210 | public: 211 | static const std::string module_name; 212 | ModulePerBaseSequenceContent(const FalcoConfig &config); 213 | ~ModulePerBaseSequenceContent() {} 214 | void summarize_module(FastqStats &stats); 215 | void 
make_grade(); 216 | void write_module(std::ostream &os); 217 | std::string make_html_data(); 218 | }; 219 | 220 | class ModulePerSequenceGCContent : public Module { 221 | private: 222 | double gc_warn, gc_error; 223 | double gc_deviation; 224 | std::array gc_count; 225 | std::array theoretical_gc_count; 226 | 227 | public: 228 | static const std::string module_name; 229 | ModulePerSequenceGCContent(const FalcoConfig &config); 230 | ~ModulePerSequenceGCContent() {} 231 | void summarize_module(FastqStats &stats); 232 | void make_grade(); 233 | void write_module(std::ostream &os); 234 | std::string make_html_data(); 235 | }; 236 | 237 | class ModulePerBaseNContent : public Module { 238 | private: 239 | size_t num_bases; 240 | // for grade 241 | size_t grade_n_warn; 242 | size_t grade_n_error; 243 | 244 | double max_n_pct; 245 | std::array gc_count; 246 | std::array theoretical_gc_count; 247 | std::vector n_pct; 248 | // grade vars 249 | size_t gc_warn, gc_error; 250 | 251 | bool do_group; 252 | size_t num_groups; 253 | std::vector base_groups; 254 | public: 255 | static const std::string module_name; 256 | ModulePerBaseNContent(const FalcoConfig &config); 257 | ~ModulePerBaseNContent() {} 258 | void summarize_module(FastqStats &stats); 259 | void make_grade(); 260 | void write_module(std::ostream &os); 261 | std::string make_html_data(); 262 | }; 263 | 264 | class ModuleSequenceLengthDistribution : public Module { 265 | private: 266 | bool do_grade_error; 267 | bool do_grade_warn; 268 | size_t max_read_length; 269 | std::vector sequence_lengths; 270 | 271 | // warn and fail criteria 272 | bool is_all_same_length; 273 | size_t empty_reads; 274 | 275 | bool do_group; 276 | size_t num_groups; 277 | std::vector base_groups; 278 | public: 279 | static const std::string module_name; 280 | ModuleSequenceLengthDistribution(const FalcoConfig &config); 281 | ~ModuleSequenceLengthDistribution() {} 282 | void summarize_module(FastqStats &stats); 283 | void make_grade(); 284 | 
void write_module(std::ostream &os); 285 | std::string make_html_data(); 286 | }; 287 | 288 | class ModuleSequenceDuplicationLevels : public Module { 289 | private: 290 | double seq_total, seq_dedup; 291 | 292 | double grade_dup_warn; 293 | double grade_dup_error; 294 | double total_deduplicated_pct; 295 | std::array percentage_deduplicated; 296 | std::array percentage_total; 297 | std::unordered_map counts_by_freq; 298 | public: 299 | static const std::string module_name; 300 | ModuleSequenceDuplicationLevels(const FalcoConfig &config); 301 | ~ModuleSequenceDuplicationLevels() {} 302 | void summarize_module(FastqStats &stats); 303 | void make_grade(); 304 | void write_module(std::ostream &os); 305 | std::string make_html_data(); 306 | }; 307 | 308 | class ModuleOverrepresentedSequences : public Module { 309 | private: 310 | size_t num_reads; 311 | std::vector> overrep_sequences; 312 | double grade_warn, grade_error; 313 | const double min_fraction_to_overrepresented = 0.001; 314 | std::vector > contaminants; 315 | 316 | // Function to find the matching contaminant within the list 317 | std::string get_matching_contaminant(const std::string &seq); 318 | public: 319 | static const std::string module_name; 320 | ModuleOverrepresentedSequences(const FalcoConfig &config); 321 | ~ModuleOverrepresentedSequences() {} 322 | void summarize_module(FastqStats &stats); 323 | void make_grade(); 324 | void write_module(std::ostream &os); 325 | std::string make_html_data(); 326 | }; 327 | 328 | class ModuleAdapterContent : public Module { 329 | private: 330 | // Number of adapters to test 331 | size_t num_adapters; 332 | 333 | // number of bases to report 334 | size_t num_bases; 335 | 336 | // adapter size to know how many bases to report 337 | size_t adapter_size; 338 | 339 | // Information from config 340 | std::vector adapter_names; 341 | std::vector adapter_seqs; 342 | std::vector adapter_hashes; 343 | size_t shortest_adapter_size; 344 | 345 | // vector to be reported 346 | 
std::vector> adapter_pos_pct; 347 | // minimum percentages for warn/fail 348 | double grade_warn, grade_error; 349 | 350 | // Aux function to count adapter in a position 351 | double count_adapter (const std::vector &kmer_count, 352 | const size_t pos, 353 | const size_t adapter_hash, 354 | const size_t adapter_size, 355 | const size_t kmer_size); 356 | public: 357 | static const std::string module_name; 358 | ModuleAdapterContent(const FalcoConfig &config); 359 | ~ModuleAdapterContent() {} 360 | void summarize_module(FastqStats &stats); 361 | void make_grade(); 362 | void write_module(std::ostream &os); 363 | std::string make_html_data(); 364 | }; 365 | 366 | class ModuleKmerContent : public Module { 367 | private: 368 | size_t num_kmer_bases; 369 | size_t kmer_size; 370 | size_t num_kmers; 371 | size_t num_seen_kmers; 372 | 373 | double grade_warn, grade_error; 374 | std::array pos_kmer_count; 375 | std::vector total_kmer_counts; 376 | std::vector obs_exp_max; 377 | std::vector where_obs_exp_is_max; 378 | std::vector> kmers_to_report; 379 | public: 380 | static const std::string module_name; 381 | static const size_t MIN_OBS_EXP_TO_REPORT = 5; 382 | static const size_t MAX_KMERS_TO_REPORT = 20; 383 | static const size_t MAX_KMERS_TO_PLOT = 10; 384 | ModuleKmerContent(const FalcoConfig &config); 385 | ~ModuleKmerContent(){} 386 | void summarize_module(FastqStats &stats); 387 | void make_grade(); 388 | void write_module(std::ostream &os); 389 | std::string make_html_data(); 390 | }; 391 | #endif 392 | 393 | -------------------------------------------------------------------------------- /src/OptionParser.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Part of SMITHLAB software 3 | * 4 | * Copyright (C) 2008 Cold Spring Harbor Laboratory, 5 | * University of Southern California and 6 | * Andrew D. Smith 7 | * 8 | * Authors: Andrew D. 
Smith 9 | * 10 | * This program is free software: you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation, either version 3 of the License, or 13 | * (at your option) any later version. 14 | * 15 | * This program is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with this program. If not, see . 22 | */ 23 | 24 | #include "OptionParser.hpp" 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include "smithlab_utils.hpp" 38 | 39 | using std::vector; 40 | using std::string; 41 | using std::endl; 42 | using std::runtime_error; 43 | using std::begin; 44 | using std::end; 45 | 46 | static const size_t MAX_LINE_LENGTH = 72; 47 | 48 | enum { 49 | SMITHLAB_ARG_INT, SMITHLAB_ARG_UINT, SMITHLAB_ARG_LONG, 50 | SMITHLAB_ARG_ULONG, SMITHLAB_ARG_FLOAT, SMITHLAB_ARG_DOUBLE, 51 | SMITHLAB_ARG_STRING, SMITHLAB_ARG_BOOL, SMITHLAB_ARG_CHAR 52 | }; 53 | 54 | void 55 | Option::format_option(const string &argument) { 56 | std::istringstream ss(argument); 57 | if ((arg_type == SMITHLAB_ARG_INT && !(ss >> *int_value)) || 58 | (arg_type == SMITHLAB_ARG_UINT && !(ss >> *uint_value)) || 59 | (arg_type == SMITHLAB_ARG_LONG && !(ss >> *long_value)) || 60 | (arg_type == SMITHLAB_ARG_ULONG && !(ss >> *ulong_value)) || 61 | (arg_type == SMITHLAB_ARG_FLOAT && !(ss >> *float_value)) || 62 | (arg_type == SMITHLAB_ARG_DOUBLE && !(ss >> *double_value)) || 63 | (arg_type == SMITHLAB_ARG_CHAR && !(ss >> *char_value))) 64 | throw runtime_error("Invalid argument [" + argument + 65 | "] to option [" + format_option_name() + "]"); 66 | 
else if (arg_type == SMITHLAB_ARG_STRING) 67 | *string_value = argument; 68 | else if (arg_type == SMITHLAB_ARG_BOOL) { 69 | *bool_value = !(*bool_value); 70 | if (argument == "true" || argument == "on") 71 | *bool_value = true; 72 | if (argument == "false" || argument == "off") 73 | *bool_value = false; 74 | } 75 | } 76 | 77 | using std::numeric_limits; 78 | using std::to_string; 79 | 80 | template string 81 | format_int_like(T &val) { 82 | return "[" + 83 | ((val == numeric_limits::max()) ? "infty" : 84 | ((val == -numeric_limits::max()) ? "-infty" : to_string(val))) + "]"; 85 | } 86 | 87 | template string 88 | format_unsigned_like(T &val) { 89 | return "[" + 90 | ((val == numeric_limits::max()) ? "infty" : to_string(val)) + "]"; 91 | } 92 | 93 | template string 94 | format_float_like(T &val) { 95 | return "[" + 96 | ((val == numeric_limits::max()) ? "infty" : 97 | ((val == -numeric_limits::max()) ? "-infty" : 98 | ((val == numeric_limits::min()) ? "eps" : 99 | ((val == -numeric_limits::min()) ? "-eps" : 100 | ((std::abs(val) < numeric_limits::min()) ? 
"0.0" : 101 | to_string(val)))))) + "]"; 102 | } 103 | 104 | string 105 | Option::format_default_value() const { 106 | std::istringstream ss; 107 | if (arg_type == SMITHLAB_ARG_INT) 108 | return format_int_like(*int_value); 109 | else if (arg_type == SMITHLAB_ARG_LONG) 110 | return format_int_like(*long_value); 111 | else if (arg_type == SMITHLAB_ARG_UINT) 112 | return format_unsigned_like(*uint_value); 113 | else if (arg_type == SMITHLAB_ARG_ULONG) 114 | return format_unsigned_like(*ulong_value); 115 | else if (arg_type == SMITHLAB_ARG_FLOAT) 116 | return format_float_like(*float_value); 117 | else if (arg_type == SMITHLAB_ARG_DOUBLE) 118 | return format_float_like(*double_value); 119 | else if (arg_type == SMITHLAB_ARG_STRING) 120 | return *string_value; 121 | else if (arg_type == SMITHLAB_ARG_CHAR) 122 | return "[" + string(1, *char_value) + "]"; 123 | else // if (arg_type == SMITHLAB_ARG_BOOL) 124 | return ""; //*bool_value ? "true" : "false"; 125 | } 126 | 127 | 128 | //////////////////////////////////////////////////////////////////////// 129 | //////////////////////////////////////////////////////////////////////// 130 | //////////////////////////////////////////////////////////////////////// 131 | 132 | Option::Option(const string l_name, const char s_name, const string descr, 133 | const bool reqd, int &val) : 134 | arg_type(SMITHLAB_ARG_INT), long_name(l_name), short_name(s_name), 135 | description(descr), required(reqd), specified(false), int_value(&val) {} 136 | 137 | Option::Option(const string l_name, const char s_name, const string descr, 138 | const bool reqd, unsigned int &val) : 139 | arg_type(SMITHLAB_ARG_UINT), long_name(l_name), short_name(s_name), 140 | description(descr), required(reqd), specified(false), uint_value(&val) {} 141 | 142 | Option::Option(const string l_name, const char s_name, const string descr, 143 | const bool reqd, long &val) : 144 | arg_type(SMITHLAB_ARG_LONG), long_name(l_name), short_name(s_name), 145 | 
description(descr), required(reqd), specified(false), long_value(&val) {} 146 | 147 | Option::Option(const string l_name, const char s_name, const string descr, 148 | const bool reqd, unsigned long &val) : 149 | arg_type(SMITHLAB_ARG_ULONG), long_name(l_name), short_name(s_name), 150 | description(descr), required(reqd), specified(false), ulong_value(&val) {} 151 | 152 | Option::Option(const string l_name, const char s_name, const string descr, 153 | const bool reqd, float &val) : 154 | arg_type(SMITHLAB_ARG_FLOAT), long_name(l_name), short_name(s_name), 155 | description(descr), required(reqd), specified(false), float_value(&val) {} 156 | 157 | Option::Option(const string l_name, const char s_name, const string descr, 158 | const bool reqd, double &val) : 159 | arg_type(SMITHLAB_ARG_DOUBLE), long_name(l_name), short_name(s_name), 160 | description(descr), required(reqd), specified(false), double_value(&val) {} 161 | 162 | Option::Option(const string l_name, const char s_name, const string descr, 163 | const bool reqd, string &val) : 164 | arg_type(SMITHLAB_ARG_STRING), long_name(l_name), short_name(s_name), 165 | description(descr), required(reqd), specified(false), string_value(&val) {} 166 | 167 | Option::Option(const string l_name, const char s_name, const string descr, 168 | const bool reqd, bool &val) : 169 | arg_type(SMITHLAB_ARG_BOOL), long_name(l_name), short_name(s_name), 170 | description(descr), required(reqd), specified(false), bool_value(&val) {} 171 | 172 | Option::Option(const string l_name, const char s_name, const string descr, 173 | const bool reqd, char &val) : 174 | arg_type(SMITHLAB_ARG_CHAR), long_name(l_name), short_name(s_name), 175 | description(descr), required(reqd), specified(false), char_value(&val) {} 176 | 177 | //////////////////////////////////////////////////////////////////////// 178 | //////////////////////////////////////////////////////////////////////// 179 | 
//////////////////////////////////////////////////////////////////////// 180 | 181 | string 182 | Option::format_option_name() const { 183 | std::ostringstream ss; 184 | if (short_name != '\0') 185 | ss << '-' << short_name << ", -" << long_name; 186 | else ss << " -" << long_name; 187 | return ss.str(); 188 | } 189 | 190 | string 191 | Option::format_option_description(const size_t offset, 192 | const bool show_default) const { 193 | std::ostringstream ss; 194 | if (!description.empty()) { 195 | vector parts; 196 | smithlab::split_whitespace(description, parts); 197 | if (required) 198 | parts.push_back("[required]"); 199 | if (!required && show_default) 200 | parts.push_back(format_default_value()); 201 | 202 | size_t line_len = 0; 203 | for (size_t i = 0; i < parts.size(); ++i) { 204 | if (offset + line_len + parts[i].size() >= MAX_LINE_LENGTH && i > 0) { 205 | line_len = 0; 206 | ss << endl; 207 | } 208 | if (i > 0 && line_len == 0) 209 | ss << string(offset, ' '); 210 | ss << parts[i] << " "; 211 | line_len += parts[i].size()+1; //+1 for the added space 212 | } 213 | } 214 | return ss.str(); 215 | } 216 | 217 | bool 218 | Option::option_match(const string &other) { 219 | return (long_name == other || 220 | (other.length() > 1 && other[0] == '-' && 221 | (other.substr(1) == long_name || 222 | (other[1] == short_name && other.length() == 2)))); 223 | } 224 | 225 | bool 226 | Option::parse(vector &command_line) { 227 | static const string dummy; 228 | if (!command_line.empty()) { 229 | for (size_t i = 0; i < command_line.size();) 230 | if (option_match(command_line[i])) { 231 | if (specified) 232 | throw runtime_error("duplicate use of option: " + long_name); 233 | 234 | if (i < command_line.size() - 1) { 235 | format_option(command_line[i + 1]); 236 | } 237 | else { 238 | // this will only work if it's a bool, because the 239 | // format_option function will ignore the argument 240 | format_option(dummy); 241 | } 242 | 243 | specified = true; 244 | // remove 
this option from the set of options 245 | command_line.erase(command_line.begin() + i); 246 | // if there was an argument (i.e. non bool) then remove that 247 | // argument also 248 | if (arg_type != SMITHLAB_ARG_BOOL) { 249 | command_line.erase(command_line.begin() + i); 250 | } 251 | } 252 | else { 253 | ++i; 254 | } 255 | } 256 | return (specified || !required); 257 | } 258 | 259 | void 260 | Option::parse_config_file(vector &options) { 261 | size_t i = 0; 262 | size_t op_num = options.size(); 263 | while (i < op_num) { 264 | vector opt_val = smithlab::split(options[i], "="); 265 | opt_val.front() = smithlab::strip(opt_val.front()); 266 | opt_val.back() = smithlab::strip(opt_val.back()); 267 | if (option_match(opt_val.front())) { 268 | format_option(opt_val.back()); 269 | options.erase(options.begin() + i); 270 | specified = true; 271 | --op_num; 272 | } 273 | else { 274 | ++i; 275 | } 276 | } 277 | } 278 | 279 | //////////////////////////////////////////////////////////////////////// 280 | //////////////////////////////////////////////////////////////////////// 281 | //////////////////////////////////////////////////////////////////////// 282 | //////////////////////////////////////////////////////////////////////// 283 | 284 | void 285 | OptionParser::add_opt(const string l_name, const char s_name, const string descr, 286 | const bool reqd, int &val) { 287 | options.push_back(Option(l_name, s_name, descr, reqd, val)); 288 | } 289 | 290 | void 291 | OptionParser::add_opt(const string l_name, const char s_name, const string descr, 292 | const bool reqd, unsigned &val) { 293 | options.push_back(Option(l_name, s_name, descr, reqd, val)); 294 | } 295 | 296 | void 297 | OptionParser::add_opt(const string l_name, const char s_name, const string descr, 298 | const bool reqd, long &val) { 299 | options.push_back(Option(l_name, s_name, descr, reqd, val)); 300 | } 301 | 302 | void 303 | OptionParser::add_opt(const string l_name, const char s_name, const string descr, 304 
| const bool reqd, unsigned long &val) { 305 | options.push_back(Option(l_name, s_name, descr, reqd, val)); 306 | } 307 | 308 | void 309 | OptionParser::add_opt(const string l_name, const char s_name, const string descr, 310 | const bool reqd, float &val) { 311 | options.push_back(Option(l_name, s_name, descr, reqd, val)); 312 | } 313 | 314 | void 315 | OptionParser::add_opt(const string l_name, const char s_name, const string descr, 316 | const bool reqd, double &val) { 317 | options.push_back(Option(l_name, s_name, descr, reqd, val)); 318 | } 319 | 320 | void 321 | OptionParser::add_opt(const string l_name, const char s_name, const string descr, 322 | const bool reqd, string &val) { 323 | options.push_back(Option(l_name, s_name, descr, reqd, val)); 324 | } 325 | 326 | void 327 | OptionParser::add_opt(const string l_name, const char s_name, const string descr, 328 | const bool reqd, bool &val) { 329 | options.push_back(Option(l_name, s_name, descr, reqd, val)); 330 | } 331 | 332 | void 333 | OptionParser::add_opt(const string l_name, const char s_name, const string descr, 334 | const bool reqd, char &val) { 335 | options.push_back(Option(l_name, s_name, descr, reqd, val)); 336 | } 337 | 338 | //////////////////////////////////////////////////////////////////////// 339 | //////////////////////////////////////////////////////////////////////// 340 | //////////////////////////////////////////////////////////////////////// 341 | //////////////////////////////////////////////////////////////////////// 342 | 343 | bool valid_option_char(char ch) { 344 | return std::isalnum(ch) || ch == '_'; 345 | } 346 | 347 | static void 348 | fix_whitespace(string &s) { 349 | std::istringstream iss(s); 350 | string token; 351 | s.clear(); 352 | while (iss >> token) { 353 | if (!s.empty()) 354 | s += ' '; 355 | s += token; 356 | } 357 | } 358 | 359 | static void 360 | read_config_file(const string &config_filename, 361 | vector &config_file_options) { 362 | static const char 
comment_character = '#'; 363 | static const char separator_character = ':'; 364 | static const string outer_space = "^[:space:]+|[:space:]+$"; 365 | static const string inner_space = "([:space:])[:space:]+"; 366 | 367 | config_file_options.clear(); 368 | 369 | std::ifstream in(config_filename); 370 | if (!in) 371 | throw runtime_error("cannot open config file: " + config_filename); 372 | 373 | string line; 374 | size_t line_number = 0; 375 | while (in) { 376 | 377 | if (!getline(in, line)) 378 | throw runtime_error("failed to config line from " + config_filename); 379 | 380 | // remove leading and trailing space 381 | fix_whitespace(line); 382 | 383 | if (!line.empty() && line.front() != comment_character) { 384 | 385 | const size_t sep_pos = line.find_first_of(separator_character); 386 | 387 | if (sep_pos == 0 || // catches ": " 388 | sep_pos >= line.length() - 1) // catches no sep or final char sep 389 | throw runtime_error("bad config file line: " + line); 390 | 391 | string option_label(line.substr(0, sep_pos)); 392 | 393 | if (!all_of(begin(option_label), end(option_label), valid_option_char)) 394 | throw runtime_error("bad option label: " + line); 395 | 396 | string option_value(line.substr(sep_pos + 1)); 397 | // remove leading space 398 | fix_whitespace(option_value); 399 | 400 | if (!all_of(begin(option_value), end(option_value), valid_option_char)) 401 | throw runtime_error("bad option label: " + line); 402 | 403 | // cerr << option_label << '\t' << option_value << endl; 404 | 405 | config_file_options.push_back(line); 406 | } 407 | in.peek(); 408 | ++line_number; 409 | } 410 | } 411 | 412 | void 413 | OptionParser::parse(const int argc, const char **argv, 414 | vector &arguments) { 415 | // The "2" below corresponds to the "about" and "help" options 416 | assert(options.size() >= 2); 417 | 418 | // The '1' and '+ 1' below is to skip over the program name 419 | arguments.clear(); 420 | assert(argc >= 1); 421 | copy(argv + 1, argv + argc, 
back_inserter(arguments)); 422 | 423 | // search for configuration file given in commnadline 424 | int i = 0; 425 | int arg_num = argc - 1; 426 | while (i < arg_num) 427 | if (arguments[i] == "--config") { 428 | vector config_file_options; 429 | string config_filename; 430 | if (i + 1 < argc - 1) 431 | config_filename = arguments[i+1]; 432 | else 433 | // ads: need to check that this is really a filename 434 | throw runtime_error("--config requires config filename"); 435 | read_config_file(config_filename, config_file_options); 436 | for (size_t j = 0; j < options.size(); ++j) 437 | options[j].parse_config_file(config_file_options); 438 | 439 | // ads: do we need to remove this arg? what if we need to know 440 | // that a config file was used? 441 | arguments.erase(arguments.begin() + i); 442 | arguments.erase(arguments.begin() + i); 443 | arg_num -= 2; 444 | } 445 | else 446 | ++i; 447 | 448 | // parse options given in commmand line 449 | for (size_t i = 0; i < options.size(); ++i) 450 | if (!options[i].parse(arguments) && first_missing_option_name.empty()) 451 | first_missing_option_name = options[i].format_option_name(); 452 | 453 | leftover_args = arguments; 454 | } 455 | 456 | void 457 | OptionParser::parse(const int argc, const char **argv, 458 | vector &arguments, string config_filename) { 459 | // The "2" below corresponds to the "about" and "help" options 460 | assert(options.size() >= 2); 461 | 462 | if (!config_filename.empty()) { 463 | vector config_file_options; 464 | read_config_file(config_filename, config_file_options); 465 | for (size_t i = 0; i < options.size(); ++i) 466 | options[i].parse_config_file(config_file_options); 467 | } 468 | arguments.clear(); 469 | 470 | // The '1' and '+ 1' below is to skip over the program name 471 | assert(argc >= 1); 472 | copy(argv + 1, argv + argc, back_inserter(arguments)); 473 | 474 | for (size_t i = 0; i < options.size(); ++i) 475 | if (!options[i].parse(arguments) && first_missing_option_name.empty()) 476 | 
first_missing_option_name = options[i].format_option_name(); 477 | 478 | leftover_args = arguments; 479 | } 480 | 481 | OptionParser::OptionParser(const string nm, const string descr, 482 | string noflag_msg, const size_t n_left) : 483 | prog_name(nm), prog_descr(descr), noflag_message(noflag_msg), 484 | help_request(false), about_request(false), 485 | show_defaults(false), n_leftover(n_left) { 486 | add_opt("help", '?', "print this help message", false, help_request); 487 | add_opt("about", '\0', "print about message", false, about_request); 488 | } 489 | 490 | //////////////////////////////////////////////////////////////////////// 491 | //////////////////////////////////////////////////////////////////////// 492 | //////////////////////////////////////////////////////////////////////// 493 | ////// 494 | ////// FOR PRINTING MESSAGES 495 | ////// 496 | 497 | string 498 | OptionParser::help_message() const { 499 | // corresponds to the two spaces before and 500 | static const string SPACE_BEFORE_SHORT = " "; 501 | static const string SPACE_BTWN_SHRT_LNG = " "; 502 | static const size_t TOTAL_ADDED_SPACE = 4; 503 | 504 | vector option_names; 505 | size_t max_name_len = 0; 506 | for(size_t i = 0; i < options.size(); ++i) { 507 | option_names.push_back(options[i].format_option_name()); 508 | max_name_len = std::max(max_name_len, option_names.back().length()); 509 | } 510 | 511 | std::ostringstream ss; 512 | ss << "Usage: " << prog_name << " [OPTIONS]"; 513 | if (!noflag_message.empty()) 514 | ss << " " << noflag_message; 515 | ss << endl << endl; 516 | 517 | if (options.size() > 2) { 518 | ss << "Options:" << endl; 519 | // the loop below begins at 2 because the help and usage messages 520 | // are always the first two and treated separately 521 | for (size_t i = 2; i < options.size(); ++i) 522 | ss << SPACE_BEFORE_SHORT << std::left << std::setw(max_name_len) 523 | << option_names[i] << SPACE_BTWN_SHRT_LNG 524 | << options[i].format_option_description(max_name_len + 
525 | TOTAL_ADDED_SPACE, 526 | show_defaults) << endl; 527 | } 528 | 529 | ss << endl << "Help options:" << endl; 530 | for (size_t i = 0; i < std::min(2ul, options.size()); ++i) 531 | ss << SPACE_BEFORE_SHORT << std::left << std::setw(max_name_len) 532 | << option_names[i] << SPACE_BTWN_SHRT_LNG 533 | << options[i].format_option_description(max_name_len + 534 | TOTAL_ADDED_SPACE, 535 | show_defaults) << endl; 536 | return ss.str(); 537 | } 538 | 539 | string 540 | OptionParser::about_message() const { 541 | static const char *PROGRAM_NAME_TAG = "PROGRAM: "; 542 | 543 | vector parts; 544 | smithlab::split_whitespace(prog_descr, parts); 545 | 546 | std::ostringstream ss; 547 | ss << PROGRAM_NAME_TAG << prog_name << endl; 548 | ss << parts.front(); 549 | size_t line_len = parts.front().length(); 550 | for (size_t i = 1; i < parts.size(); ++i) { 551 | if (line_len + parts[i].size() >= MAX_LINE_LENGTH) { 552 | line_len = 0; 553 | ss << endl; 554 | } 555 | else ss << ' '; 556 | ss << parts[i]; 557 | line_len += parts[i].length() + 1; // the "+1" is for the space 558 | } 559 | return ss.str(); 560 | } 561 | 562 | 563 | string 564 | OptionParser::invalid_leftover() const { 565 | static const string left_tag("invalid leftover args [should be "); 566 | static const string right_tag("]"); 567 | 568 | std::ostringstream ss; 569 | if (n_leftover != std::numeric_limits::max()) { 570 | ss << left_tag << n_leftover << right_tag << endl; 571 | } 572 | for (size_t i = 0; i < leftover_args.size(); ++i) { 573 | ss << "leftover arg #" << (i + 1) << "=\"" 574 | << leftover_args[i] << "\""; 575 | } 576 | return ss.str(); 577 | } 578 | 579 | 580 | string 581 | OptionParser::option_missing_message() const { 582 | std::ostringstream ss; 583 | ss << "required argument missing: [" << first_missing_option_name << "]"; 584 | return ss.str(); 585 | } 586 | -------------------------------------------------------------------------------- /src/OptionParser.hpp: 
-------------------------------------------------------------------------------- 1 | /* Part of SMITHLAB software 2 | * 3 | * Copyright (C) 2018 Cold Spring Harbor Laboratory, 4 | * University of Southern California and 5 | * Andrew D. Smith 6 | * Authors: Andrew D. Smith 7 | * 8 | * This program is free software: you can redistribute it and/or 9 | * modify it under the terms of the GNU General Public License as 10 | * published by the Free Software Foundation, either version 3 of the 11 | * License, or (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, but 14 | * WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 | * General Public License for more details. 17 | */ 18 | 19 | #ifndef OPTION_PARSER_HPP 20 | #define OPTION_PARSER_HPP 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | class Option { 27 | public: 28 | Option(const std::string l_name, const char s_name, 29 | const std::string descr, const bool reqd, int &val); 30 | Option(const std::string l_name, const char s_name, 31 | const std::string descr, const bool reqd, unsigned &val); 32 | Option(const std::string l_name, const char s_name, 33 | const std::string descr, const bool reqd, long &val); 34 | Option(const std::string l_name, const char s_name, 35 | const std::string descr, const bool reqd, unsigned long &val); 36 | Option(const std::string l_name, const char s_name, 37 | const std::string descr, const bool reqd, float &val); 38 | Option(const std::string l_name, const char s_name, 39 | const std::string descr, const bool reqd, double &val); 40 | Option(const std::string l_name, const char s_name, 41 | const std::string descr, const bool reqd, std::string &val); 42 | Option(const std::string l_name, const char s_name, 43 | const std::string descr, const bool reqd, bool &val); 44 | Option(const std::string l_name, const char s_name, 45 | const 
std::string descr, const bool reqd, char &val); 46 | 47 | bool parse(std::vector &command_line); 48 | 49 | void parse_config_file(std::vector &options); 50 | 51 | std::string format_option_name() const; 52 | std::string format_option_description(const size_t offset, 53 | const bool show_default = false) const; 54 | std::string format_default_value() const; 55 | 56 | private: 57 | 58 | unsigned arg_type; 59 | std::string long_name; 60 | char short_name; 61 | std::string description; 62 | bool required; 63 | bool specified; 64 | 65 | // the values of the options: ugly but clean 66 | int *int_value; 67 | unsigned int *uint_value; 68 | long *long_value; 69 | unsigned long *ulong_value; 70 | float *float_value; 71 | double *double_value; 72 | std::string *string_value; 73 | bool *bool_value; 74 | char *char_value; 75 | 76 | void format_option(const std::string &argument); 77 | static void set_max_length(size_t num); 78 | static size_t get_max_length(); 79 | bool option_match(const std::string &other); 80 | }; 81 | 82 | class OptionParser { 83 | public: 84 | 85 | OptionParser(const std::string nm, const std::string descr, 86 | std::string noflag_msg = "", 87 | const size_t n_left = std::numeric_limits::max()); 88 | 89 | void set_show_defaults() {show_defaults = true;} 90 | 91 | void add_opt(const std::string l_name, const char s_name, 92 | const std::string descr, const bool reqd, int &val); 93 | void add_opt(const std::string l_name, const char s_name, 94 | const std::string descr, const bool reqd, unsigned &val); 95 | void add_opt(const std::string l_name, const char s_name, 96 | const std::string descr, const bool reqd, long &val); 97 | void add_opt(const std::string l_name, const char s_name, 98 | const std::string descr, const bool reqd, unsigned long &val); 99 | void add_opt(const std::string l_name, const char s_name, 100 | const std::string descr, const bool reqd, float &val); 101 | void add_opt(const std::string l_name, const char s_name, 102 | const std::string 
descr, const bool reqd, double &val); 103 | void add_opt(const std::string l_name, const char s_name, 104 | const std::string descr, const bool reqd, std::string &val); 105 | void add_opt(const std::string l_name, const char s_name, 106 | const std::string descr, const bool reqd, bool &val); 107 | void add_opt(const std::string l_name, const char s_name, 108 | const std::string descr, const bool reqd, char &val); 109 | 110 | void parse(const int argc, const char **argv, 111 | std::vector &arguments); 112 | 113 | void parse(const int argc, const char **argv, 114 | std::vector &arguments, 115 | std::string config_filename); 116 | 117 | bool help_requested() const {return help_request;} 118 | std::string help_message() const; 119 | 120 | bool about_requested() const {return about_request;} 121 | std::string about_message() const; 122 | std::string invalid_leftover() const; 123 | 124 | bool option_missing() const { 125 | return !first_missing_option_name.empty(); 126 | } 127 | 128 | bool wrong_number_leftover() const { 129 | return n_leftover != std::numeric_limits::max() && 130 | leftover_args.size() != n_leftover; 131 | } 132 | 133 | std::string option_missing_message() const; 134 | 135 | static const bool OPTIONAL = false; 136 | static const bool REQUIRED = true; 137 | 138 | private: 139 | std::string prog_name; 140 | std::string prog_descr; 141 | std::string noflag_message; 142 | std::vector