├── .clang-format ├── .cppcheck_suppress ├── .github └── workflows │ ├── clang-format.yml │ ├── cppcheck.yml │ ├── cpplint.yml │ └── ubuntu-builds.yml ├── .gitmodules ├── .readthedocs.yaml ├── CPPLINT.cfg ├── LICENSE ├── Makefile ├── Makefile.am ├── README.md ├── autogen.sh ├── configure.ac ├── data ├── SRR1003759_5M_subset.mr ├── SRR1106616_5M_subset.bam ├── SRR1301329_1M_hist.txt ├── Shakespeare_hist.txt ├── Shakespeare_vals.txt └── additional_data.txt ├── docs ├── FullExperiment_copy.pdf ├── InitialExperimentComplexityCurves_copy.pdf ├── RELEASE_NOTES.txt ├── TCR_richness_vs_age_lm.pdf ├── biblio.bib ├── compare_RNA_Capture_junction_complexity.pdf ├── comparing_scWGA_coverage.pdf ├── manual.pdf └── manual.tex ├── documentation ├── README.md ├── docs │ ├── Makefile │ ├── index.md │ ├── quickstart.md │ └── requirements.txt └── mkdocs.yml ├── m4 ├── ax_cxx_check_lib.m4 ├── ax_cxx_compile_stdcxx.m4 └── ax_cxx_compile_stdcxx_17.m4 ├── src ├── Makefile ├── bam_record_utils.cpp ├── bam_record_utils.hpp ├── bound_pop.cpp ├── bound_pop.hpp ├── c_curve.cpp ├── c_curve.hpp ├── common.cpp ├── common.hpp ├── continued_fraction.cpp ├── continued_fraction.hpp ├── dnmt_error.hpp ├── gc_extrap.cpp ├── gc_extrap.hpp ├── lc_extrap.cpp ├── lc_extrap.hpp ├── load_data_for_complexity.cpp ├── load_data_for_complexity.hpp ├── moment_sequence.cpp ├── moment_sequence.hpp ├── pop_size.cpp ├── pop_size.hpp └── preseq.cpp └── tests ├── data ├── c_curve_input.hist ├── gc_extrap_input.mr └── lc_extrap_input.vals ├── md5sum.txt └── scripts ├── test_c_curve.test ├── test_gc_extrap.test └── test_lc_extrap.test /.clang-format: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | BasedOnStyle: LLVM 18 | ColumnLimit: 80 19 | IndentWidth: 2 20 | AlwaysBreakAfterReturnType: TopLevel 21 | ContinuationIndentWidth: 2 22 | ConstructorInitializerIndentWidth: 2 23 | BraceWrapping: 24 | BeforeElse: true 25 | BeforeCatch: true 26 | BreakBeforeBraces: Custom 27 | BreakConstructorInitializers: AfterColon 28 | SpacesBeforeTrailingComments: 2 29 | -------------------------------------------------------------------------------- /.cppcheck_suppress: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 15 | # General Public License for more details. 16 | 17 | missingIncludeSystem 18 | constVariablePointer 19 | checkersReport 20 | unusedFunction:src/bam_record_utils.hpp 21 | unusedFunction:src/bam_record_utils.cpp 22 | *:src/smithlab_cpp* 23 | unusedStructMember:src/*.hpp 24 | -------------------------------------------------------------------------------- /.github/workflows/clang-format.yml: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | name: Source formatting with clang-format 18 | 19 | on: 20 | push: 21 | branches: [ "master" ] 22 | pull_request: 23 | branches: [ "master" ] 24 | workflow_dispatch: 25 | 26 | jobs: 27 | clang-format: 28 | runs-on: ubuntu-24.04 29 | 30 | steps: 31 | - name: Checkout repository 32 | uses: actions/checkout@v4 33 | 34 | - name: Install dependencies 35 | run: sudo apt-get install -y clang-format 36 | 37 | - name: Run clang-format 38 | run: | 39 | clang-format --dry-run -Werror $(git ls-files '*.*pp') 40 | -------------------------------------------------------------------------------- /.github/workflows/cppcheck.yml: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 
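# Note: the cppcheck invocation in the step below can also be run
# locally to reproduce this check before pushing, assuming cppcheck is
# installed (for example from conda-forge, as in this workflow, or from
# a system package) and is run from the repository root so that
# .cppcheck_suppress and src/smithlab_cpp are found.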
16 | 17 | name: Linting with cppcheck 18 | 19 | on: 20 | push: 21 | branches: [ "master" ] 22 | pull_request: 23 | branches: [ "master" ] 24 | workflow_dispatch: 25 | 26 | jobs: 27 | cppcheck: 28 | runs-on: ubuntu-24.04 29 | strategy: 30 | matrix: 31 | python-version: ["3.12"] 32 | 33 | steps: 34 | - name: Checkout repository 35 | uses: actions/checkout@v4 36 | 37 | - name: Python setup ${{ matrix.python-version }} 38 | uses: actions/setup-python@v5 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | 42 | - name: Install cppcheck 43 | run: | 44 | conda install -c conda-forge cppcheck 45 | 46 | - name: Run cppcheck 47 | run: | 48 | ${CONDA}/bin/cppcheck \ 49 | --std=c++17 \ 50 | --enable=all \ 51 | --check-level=exhaustive \ 52 | --suppressions-list=.cppcheck_suppress \ 53 | -I src/smithlab_cpp \ 54 | $(git ls-files '*.*pp') 55 | -------------------------------------------------------------------------------- /.github/workflows/cpplint.yml: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | name: Linting with cpplint 18 | 19 | on: 20 | push: 21 | branches: [ "master" ] 22 | pull_request: 23 | branches: [ "master" ] 24 | workflow_dispatch: 25 | 26 | jobs: 27 | cpplint: 28 | runs-on: ubuntu-24.04 29 | strategy: 30 | matrix: 31 | python-version: ["3.12"] 32 | 33 | steps: 34 | - name: Checkout repository 35 | uses: actions/checkout@v4 36 | 37 | - name: Python setup ${{ matrix.python-version }} 38 | uses: actions/setup-python@v5 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | 42 | - name: Install cpplint 43 | run: | 44 | python -m pip install --upgrade pip 45 | pip install cpplint 46 | 47 | - name: Run cpplint 48 | run: | 49 | cpplint --quiet --repository=. $(git ls-files '*.*pp') 50 | -------------------------------------------------------------------------------- /.github/workflows/ubuntu-builds.yml: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 
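# Note: the steps below mirror a from-clone build that can be
# reproduced locally, assuming the HTSLib development files are
# available (here via libhts-dev): ./autogen.sh, ./configure, make,
# and make check. See README.md for the release-tarball variant,
# which ships a ready-made configure script.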
16 | 17 | name: Building preseq on Ubuntu 18 | 19 | on: 20 | workflow_dispatch: 21 | 22 | jobs: 23 | build: 24 | runs-on: ubuntu-24.04 25 | steps: 26 | - uses: actions/checkout@v4 27 | with: 28 | submodules: recursive 29 | - name: Install dependencies 30 | run: sudo apt-get install -y libhts-dev 31 | 32 | - name: Generate configure script 33 | run: ./autogen.sh 34 | 35 | - name: Configure for g++ 36 | run: ./configure CXX="g++" 37 | 38 | - name: Build with g++ 39 | run: make -j2 40 | 41 | - name: Test the g++ build 42 | run: make check 43 | 44 | - name: Cleanup after the g++ build 45 | run: make distclean 46 | 47 | - name: Configure for clang++ 48 | run: ./configure CXX="clang++" 49 | 50 | - name: Build with clang++ 51 | run: make -j2 52 | 53 | - name: Test the clang++ build 54 | run: make check 55 | 56 | - name: Cleanup after the clang++ build 57 | run: make distclean 58 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "preseqR"] 2 | path = preseqR 3 | url = ../preseqR.git 4 | [submodule "src/smithlab_cpp"] 5 | path = src/smithlab_cpp 6 | url = ../smithlab_cpp.git 7 | [submodule "src/bamxx"] 8 | path = src/bamxx 9 | url = ../bamxx.git 10 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | build: 3 | os: ubuntu-20.04 4 | tools: 5 | python: "3.9" 6 | 7 | mkdocs: 8 | configuration: documentation/mkdocs.yml 9 | fail_on_warning: false 10 | 11 | python: 12 | install: 13 | - requirements: documentation/docs/requirements.txt 14 | 15 | formats: 16 | - pdf 17 | - epub 18 | -------------------------------------------------------------------------------- /CPPLINT.cfg: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | set noparent 18 | filter=-runtime/references 19 | filter=-build/include_subdir 20 | filter=-build/include_order 21 | filter=-build/c++11 22 | filter=-build/c++17 23 | # Formatting below handled by clang-format 24 | filter=-whitespace/line_length 25 | filter=-whitespace/newline 26 | filter=-readability/braces 27 | filter=-whitespace/semicolon 28 | filter=-whitespace/indent 29 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2011-2020 University of Southern California and 2 | # Andrew D. Smith and Timothy Daley 3 | # 4 | # Authors: Timothy Daley and Andrew D. 
Smith 5 | # 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | 16 | ifndef install_dir 17 | install_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) 18 | endif 19 | 20 | all: 21 | @make -C src 22 | 23 | install: 24 | @make -C src install_dir=$(install_dir) install 25 | 26 | clean: 27 | @make -C src clean 28 | 29 | distclean: clean 30 | @rm -rf $(install_dir)/bin 31 | 32 | .PHONY: all distclean clean install 33 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2018-2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | ACLOCAL_AMFLAGS = -I m4 18 | 19 | SUBDIRS := src/smithlab_cpp 20 | install installdirs: SUBDIRS := $(filter-out src/smithlab_cpp, $(SUBDIRS)) 21 | AM_CPPFLAGS = -I $(top_srcdir)/src/smithlab_cpp 22 | if ENABLE_HTS 23 | AM_CPPFLAGS += -I $(top_srcdir)/src/bamxx 24 | endif 25 | 26 | AM_CPPFLAGS += -Wall -Wextra -Wpedantic -Wno-unknown-attributes 27 | if ENABLE_HTS 28 | AM_CPPFLAGS += -DHAVE_HTSLIB 29 | endif 30 | 31 | EXTRA_DIST = \ 32 | README.md \ 33 | LICENSE \ 34 | preseqR \ 35 | data \ 36 | docs \ 37 | tests/md5sum.txt \ 38 | tests/data/lc_extrap_input.vals \ 39 | tests/data/gc_extrap_input.mr \ 40 | tests/data/c_curve_input.hist \ 41 | tests/scripts/test_c_curve.test \ 42 | tests/scripts/test_gc_extrap.test \ 43 | tests/scripts/test_lc_extrap.test 44 | 45 | TESTS = \ 46 | tests/scripts/test_c_curve.test \ 47 | tests/scripts/test_lc_extrap.test \ 48 | tests/scripts/test_gc_extrap.test 49 | 50 | TEST_EXTENSIONS = .test 51 | 52 | LDADD = src/smithlab_cpp/libsmithlab_cpp.a 53 | 54 | bin_PROGRAMS = preseq 55 | 56 | preseq_SOURCES = \ 57 | src/preseq.cpp \ 58 | src/common.hpp \ 59 | src/common.cpp \ 60 | src/c_curve.hpp \ 61 | src/c_curve.cpp \ 62 | src/gc_extrap.hpp \ 63 | src/gc_extrap.cpp \ 64 | src/lc_extrap.hpp \ 65 | src/lc_extrap.cpp \ 66 | src/bound_pop.hpp \ 67 | src/bound_pop.cpp \ 68 | src/pop_size.hpp \ 69 | src/pop_size.cpp \ 70 | src/continued_fraction.hpp \ 71 | src/continued_fraction.cpp \ 72 | src/load_data_for_complexity.hpp \ 73 | src/load_data_for_complexity.cpp \ 74 | src/moment_sequence.hpp \ 75 | src/moment_sequence.cpp 76 | 77 | if ENABLE_HTS 78 | preseq_SOURCES += \ 79 | src/bamxx/bamxx.hpp \ 80 | src/bam_record_utils.hpp \ 81 | src/bam_record_utils.cpp 82 | endif 83 | -------------------------------------------------------------------------------- 
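The `TESTS` and `TEST_EXTENSIONS` settings above wire the scripts in
`tests/scripts` into the Automake test harness, with their input files
shipped via `EXTRA_DIST`. As a sketch of how that harness is exercised
from a fresh clone (the same sequence used in the ubuntu-builds
workflow; `-j2` is just an example parallelism level):
```console
$ ./autogen.sh      # only needed in a clone; releases ship configure
$ ./configure       # optionally --enable-hts for BAM/SAM support
$ make -j2
$ make check        # runs tests/scripts/test_*.test via the harness
```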
/README.md: -------------------------------------------------------------------------------- 1 | [![GitHub all releases](https://img.shields.io/github/downloads/smithlabcode/preseq/total?label=GitHub%20downloads)](https://github.com/smithlabcode/preseq/releases) 2 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/preseq/badges/version.svg)](https://anaconda.org/bioconda/preseq) 3 | [![Install with Conda](https://anaconda.org/bioconda/preseq/badges/platforms.svg)](https://anaconda.org/bioconda/preseq) 4 | [![Install with Conda](https://img.shields.io/conda/dn/bioconda/preseq?label=Conda%20downloads)](https://anaconda.org/bioconda/preseq) 5 | [![Documentation Status](https://readthedocs.org/projects/preseq/badge/?version=latest)](https://preseq.readthedocs.io/en/latest/?badge=latest) 6 | [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) 7 | 8 | This is the README file for the preseq package. The preseq package is 9 | aimed at predicting the yield of distinct reads from a genomic library 10 | from an initial sequencing experiment. The estimates can then be used 11 | to examine the utility of further sequencing, optimize the sequencing 12 | depth, or to screen multiple libraries to avoid low complexity 13 | samples. 14 | 15 | SYSTEM REQUIREMENTS 16 | ======================================================================== 17 | The preseq software will only run on 64-bit UNIX-like operating 18 | systems and was developed on both Linux and Mac. The preseq software 19 | requires a C++ compiler that supports C++17. 20 | 21 | INSTALLATION 22 | ======================================================================== 23 | ### Installing from a release 24 | 25 | 1. Download `preseq-x.tar.gz` from the releases tab of this repository. 26 | 2. Unpack the archive: 27 | ```console 28 | $ tar -zxvf preseq-x.tar.gz 29 | ``` 30 | 3. Move into the preseq directory and create a build directory: 31 | ```console 32 | $ cd preseq-x 33 | $ mkdir build && cd build 34 | ``` 35 | 4. Run the configuration script: 36 | ```console 37 | $ ../configure 38 | ``` 39 | If you do not want to install preseq system-wide, or if you do 40 | not have admin privileges, specify a prefix directory: 41 | ```console 42 | $ ../configure --prefix=/some/reasonable/place 43 | ``` 44 | Finally, if you want to build with HTSlib support (for the `to-mr` 45 | program) then you need to specify the following: 46 | ```console 47 | $ ../configure --enable-hts 48 | ``` 49 | And if you installed HTSlib yourself in some non-standard directory, 50 | you must specify the location like this: 51 | ```console 52 | $ ../configure --enable-hts CPPFLAGS='-I /path/to/htslib/headers' \ 53 | LDFLAGS='-L/path/to/htslib/lib' 54 | ``` 55 | 5. Compile and install the tools: 56 | ```console 57 | $ make 58 | $ make install 59 | ``` 60 | 61 | ### Installing from source 62 | 63 | Developers looking to use the latest commits can compile the cloned 64 | repository using the `Makefile` within the `src` directory. The 65 | process is simple: 66 | ```console 67 | $ cd src/ 68 | $ make 69 | ``` 70 | If the desired input is in `.bam` format, `htslib` is required. Type 71 | ```console 72 | $ make HAVE_HTSLIB=1 all 73 | ``` 74 | The HTSLib library can be obtained here: 75 | http://www.htslib.org/download. 76 | 77 | INPUT FILE FORMATS 78 | ======================================================================== 79 | The input to preseq can be in 3 general formats: 80 | 1.
Mapped read locations in BED or BAM file format. The file should be 81 | sorted by chromosome, start position, end position, and finally 82 | strand if in BED format. If the file is in BAM format, then the 83 | file should be sorted using `bamtools` or `samtools sort`. 84 | 2. The "counts histogram" which will have, for each count 1,2,..., the 85 | number of unique "species" (e.g. reads, or anything else) that 86 | appear with that count. Examples can be found in the data directory 87 | within the preseqR subdirectory. Note these should not have a count 88 | for "0", and they should not have any header above the counts. Just 89 | two columns of numbers, with the first column sorted and unique. 90 | 3. The counts themselves, so just a file with one count on each 91 | line. These will be made into the "counts histogram" inside preseq 92 | right away. 93 | 94 | USAGE EXAMPLES 95 | ======================================================================== 96 | Each program included in this software package will print a list of 97 | options if executed without any command line arguments. Many of the 98 | programs use similar options (for example, output files are specified 99 | with '-o'). 100 | 101 | We have provided a data directory to test each of our programs. 102 | Change to the `data` directory and try some of our commands. 103 | To predict the yield of a future experiment, use `lc_extrap`. 104 | For the most basic usage of `lc_extrap` to compute the expected yield, 105 | use the command on the following data: 106 | ```console 107 | $ preseq lc_extrap -o yield_estimates.txt SRR1003759_5M_subset.mr 108 | ``` 109 | If the input file is in `.bam` format, use the `-B` flag: 110 | ```console 111 | $ preseq lc_extrap -B -o yield_estimates.txt SRR1106616_5M_subset.bam 112 | ``` 113 | For the counts histogram format, use the `-H` flag: 114 | ```console 115 | $ preseq lc_extrap -H -o yield_estimates.txt SRR1301329_1M_hist.txt 116 | ``` 117 | 118 | The yield estimates will appear in yield_estimates.txt, and will be a 119 | column of future experiment sizes in `TOTAL_READS`, a column of the 120 | corresponding expected distinct reads in `EXPECTED_DISTINCT`, followed 121 | by two columns giving the corresponding confidence intervals. 122 | 123 | To investigate the past yield of an experiment, use `c_curve`. 124 | `c_curve` can take in the same file formats as `lc_extrap` by using 125 | the same flags. The estimates will appear in estimates.txt with two 126 | columns. The first column gives the total number of reads in a 127 | theoretically smaller experiment and the second gives the 128 | corresponding number of distinct reads. 129 | 130 | `bound_pop` provides an estimate for the species richness of the 131 | sampled population. The input file formats and corresponding flags are 132 | identical to `c_curve` and `lc_extrap`. The output provides the median 133 | species richness in the first column and the confidence intervals in 134 | the next two columns. (Example commands for `c_curve` and `bound_pop` are shown below.) 135 | 136 | Finally, `gc_extrap` predicts the expected genomic coverage for a 137 | future experiment. It produces the coverage in an output format 138 | identical to `lc_extrap`. `gc_extrap` can only take in files in BED 139 | and mapped reads format (using the `-B` flag for BED): 140 | ```console 141 | $ preseq gc_extrap -B -o coverage_estimates.txt SRR1003759_5M_subset.mr 142 | ``` 143 | 144 | More data is available in the `additional_data.txt` file in the `data` 145 | directory.
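As noted above, `c_curve` and `bound_pop` accept the same input
formats and flags as `lc_extrap`. The commands below are a sketch of
typical usage with the bundled data; the output file names are only
placeholders:
```console
$ preseq c_curve -o estimates.txt SRR1003759_5M_subset.mr
$ preseq bound_pop -H -o species_richness.txt SRR1301329_1M_hist.txt
```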
For an extended write-up on our programs, please read the 146 | manual in the `docs` directory. 147 | 148 | HISTORY 149 | ======================================================================== 150 | Preseq was originally developed by Timothy Daley and Andrew D. Smith 151 | at University of Southern California. 152 | 153 | **v3.2.0:** 154 | Updates to the repo in preparation for putting preseq in conda 155 | 156 | **v3.1.2:** 157 | Two headers were added. 158 | 159 | **v3.1.0:** 160 | A mode `pop_size` has been added that uses the continued fraction 161 | approximation to the Good-Toulmin model and extrapolates as far as 162 | possible. Although `bound_pop` provides a good and reliable 163 | lower-bound, this new mode will give a more accurate estimate of the 164 | population size (e.g. total number of distinct molecules). It's not 165 | perfect yet, and in some cases if the population is more than a 166 | billion times larger than the sample, it will still only give a lower 167 | bound. But it works well on most data sets. 168 | 169 | **v3.0.2:** 170 | GSL has been completely removed, and a data directory has been added 171 | for users to test our programs. 172 | 173 | **v3.0.1:** 174 | We no longer require users to have GSL for all modules except for 175 | `bound_pop`. Users interested in using `bound_pop` can install GSL and 176 | follow the instructions above to configure with GSL. 177 | 178 | **v3.0.0:** 179 | The main change to this version is that if BAM/SAM format will be used 180 | as input, the HTSLib library must be installed on the system when 181 | preseq is built. Installation instructions above have been updated 182 | correspondingly. We also updated to use C++11, so a more recent 183 | compiler is required, but these days C++11 is usually supported. 184 | 185 | **v2.0.3:** 186 | A bug in defect mode was fixed and a rng seed was added to allow for 187 | reproducibility. 188 | 189 | **v2.0.0:** 190 | We have added a new module, `bound_pop`, to estimate a lower bound of 191 | the population sampled from. Interpolation is calculated by 192 | expectation rather than subsampling, dramatically improving the speed. 193 | 194 | **v1.0.2:** 195 | We have switched the dependency on the BamTools API to SAMTools, which 196 | we believe will be more convenient for most users of preseq. Minor 197 | bugs have been fixed, and algorithms have been refined to more 198 | accurately construct counts histograms and extrapolate the complexity 199 | curve. More options have been added to `lc_extrap`. `c_curve` and 200 | `lc_extrap` are now both under a single binary for easier use, and 201 | commands will now be written as `preseq lc_extrap [OPTIONS]` 202 | Furthermore, there are updates to the manual for any minor issues 203 | encountered when compiling the preseq binary. 204 | 205 | We released an R package called 206 | [preseqR](http://cran.r-project.org/web/packages/preseqR/index.html) 207 | along with preseq. This makes most of the preseq functionality 208 | available in the R statistical environment, and includes some new 209 | functionality. The preseqR directory contains all required source code 210 | to build this R package. 211 | 212 | CONTACT INFORMATION 213 | ======================================================================== 214 | Andrew D. 
Smith and Timothy Daley 215 | 216 | http://smithlabresearch.org 217 | 218 | LICENSE 219 | ======================================================================== 220 | ```txt 221 | The preseq software for estimating library complexity 222 | Copyright (C) 2014-2022 Timothy Daley, Andrew D Smith, Chao Deng 223 | University of Southern California 224 | 225 | This program is free software: you can redistribute it and/or modify 226 | it under the terms of the GNU General Public License as published by 227 | the Free Software Foundation, either version 3 of the License, or (at 228 | your option) any later version. 229 | 230 | This program is distributed in the hope that it will be useful, 231 | but WITHOUT ANY WARRANTY; without even the implied warranty of 232 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 233 | GNU General Public License for more details. 234 | 235 | You should have received a copy of the GNU General Public License 236 | along with this program. If not, see <https://www.gnu.org/licenses/>. 237 | ``` 238 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Run 'autoreconf -i' to generate 'configure', 'Makefile.in', etc. 4 | # 5 | # The first time this is run on a new cloned git repo the configure 6 | # script will not be present, only the configure.ac and 7 | # Makefile.am. The rest must be generated by `autoreconf -i`. 8 | # 9 | # If you are working with a distribution (file ending with ".tar.gz" 10 | # or similar) then this script should not be needed, and should not be 11 | # present, as all the files should already exist. You should only run 12 | # this script if you know what you are doing with autoreconf. 13 | # 14 | # This script will only run from inside a clone of the preseq git 15 | # repo (it checks for a .git directory in a directory named preseq). 16 | 17 | runautoreconf() { 18 | autoreconf -i; 19 | } 20 | 21 | if test -d .git && test "$(basename "${PWD}")" = "preseq" 22 | then 23 | runautoreconf 24 | exit 0 25 | else 26 | echo " It seems you are either attempting to run this script " 27 | echo " from the wrong directory, or in a source tree that was " 28 | echo " not obtained by cloning the preseq git repo. " 29 | echo " " 30 | echo " ./autogen.sh generates the configure script. " 31 | echo " " 32 | echo " Only run this if you know what you are doing with " 33 | echo " autoreconf and want a shortcut for running it. If you " 34 | echo " just want to use the software, download a release and " 35 | echo " this script will not be needed. " 36 | exit 1 37 | fi 38 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | dnl This file is part of preseq 2 | dnl 3 | dnl Copyright (C) 2018-2024: Andrew D. Smith 4 | dnl 5 | dnl Authors: Andrew D. Smith 6 | dnl 7 | dnl This is free software: you can redistribute it and/or modify it 8 | dnl under the terms of the GNU General Public License as published by 9 | dnl the Free Software Foundation, either version 3 of the License, or 10 | dnl (at your option) any later version. 11 | dnl 12 | dnl This software is distributed in the hope that it will be useful, 13 | dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | dnl General Public License for more details.
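dnl Note: a typical invocation with HTSLib support (see README.md) is
dnl   ./configure --enable-hts CPPFLAGS='-I /path/to/htslib/headers' \
dnl               LDFLAGS='-L/path/to/htslib/lib'
dnl where the two paths are placeholders, needed only when HTSLib is
dnl installed in a non-standard location.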
16 | 17 | AC_INIT([preseq], [3.2.0], [andrewds@usc.edu], 18 | [preseq], [https://github.com/smithlabcode/preseq]) 19 | dnl the config.h is not currently #included in the source, and only 20 | dnl used to keep command lines short. 21 | AC_CONFIG_HEADERS([config.h]) 22 | AM_INIT_AUTOMAKE([subdir-objects foreign]) 23 | 24 | AC_CONFIG_MACRO_DIR([m4]) 25 | AC_LANG(C++) 26 | AC_PROG_CXX 27 | AX_CXX_COMPILE_STDCXX_17([noext], [mandatory]) 28 | AC_PROG_RANLIB 29 | 30 | dnl recursively configure smithlab_cpp 31 | AC_CONFIG_SUBDIRS([src/smithlab_cpp]) 32 | 33 | dnl check for HTSLib if requested 34 | hts_fail_msg=" 35 | 36 | Failed to locate HTSLib on your system. Please use the LDFLAGS and 37 | CPPFLAGS variables to specify the directories where the HTSLib library 38 | and headers can be found. 39 | " 40 | AC_ARG_ENABLE([hts], 41 | [AS_HELP_STRING([--enable-hts], [Enable HTSLib @<:@yes@:>@])], 42 | [enable_hts=yes], [enable_hts=no]) 43 | AS_IF([test "x$enable_hts" = "xyes"], 44 | [AC_CHECK_LIB([hts], [hts_version], [], 45 | [AC_MSG_FAILURE([$hts_fail_msg])])] 46 | ) 47 | AM_CONDITIONAL([ENABLE_HTS], [test "x$enable_hts" = "xyes"]) 48 | 49 | AC_CONFIG_FILES([Makefile]) 50 | 51 | dnl make the test data files available in the build tree 52 | AC_CONFIG_LINKS([ 53 | tests/md5sum.txt:tests/md5sum.txt 54 | tests/c_curve_input.hist:tests/data/c_curve_input.hist 55 | tests/lc_extrap_input.vals:tests/data/lc_extrap_input.vals 56 | tests/gc_extrap_input.mr:tests/data/gc_extrap_input.mr 57 | ]) 58 | 59 | AC_OUTPUT 60 | -------------------------------------------------------------------------------- /data/SRR1106616_5M_subset.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/preseq/acc94f2957b15f51f917b7b8e0819c41c81c4949/data/SRR1106616_5M_subset.bam -------------------------------------------------------------------------------- /data/SRR1301329_1M_hist.txt: -------------------------------------------------------------------------------- 1 | 1 982419 2 | 2 6060 3 | 3 214 4 | 4 63 5 | 5 32 6 | 6 21 7 | 7 14 8 | 8 9 9 | 9 6 10 | 10 3 11 | 11 6 12 | 12 2 13 | 13 2 14 | 14 2 15 | 15 3 16 | 16 2 17 | 24 2 18 | 31 1 19 | -------------------------------------------------------------------------------- /data/Shakespeare_hist.txt: -------------------------------------------------------------------------------- 1 | 1 14376 2 | 2 4343 3 | 3 2292 4 | 4 1463 5 | 5 1043 6 | 6 837 7 | 7 638 8 | 8 519 9 | 9 430 10 | 10 364 11 | 11 305 12 | 12 259 13 | 13 242 14 | 14 223 15 | 15 187 16 | 16 181 17 | 17 179 18 | 18 130 19 | 19 127 20 | 20 128 21 | 21 104 22 | 22 105 23 | 23 99 24 | 24 112 25 | 25 93 26 | 26 74 27 | 27 83 28 | 28 76 29 | 29 72 30 | 30 63 31 | 31 73 32 | 32 47 33 | 33 56 34 | 34 59 35 | 35 53 36 | 36 45 37 | 37 34 38 | 38 49 39 | 39 45 40 | 40 52 41 | 41 49 42 | 42 41 43 | 43 30 44 | 44 35 45 | 45 37 46 | 46 21 47 | 47 41 48 | 48 30 49 | 49 28 50 | 50 19 51 | 51 25 52 | 52 19 53 | 53 28 54 | 54 27 55 | 55 31 56 | 56 19 57 | 57 19 58 | 58 22 59 | 59 23 60 | 60 14 61 | 61 30 62 | 62 19 63 | 63 21 64 | 64 18 65 | 65 15 66 | 66 10 67 | 67 15 68 | 68 14 69 | 69 11 70 | 70 16 71 | 71 13 72 | 72 12 73 | 73 10 74 | 74 16 75 | 75 18 76 | 76 11 77 | 77 8 78 | 78 15 79 | 79 12 80 | 80 7 81 | 81 13 82 | 82 12 83 | 83 11 84 | 84 8 85 | 85 10 86 | 86 11 87 | 87 7 88 | 88 12 89 | 89 9 90 | 90 8 91 | 91 4 92 | 92 7 93 | 93 6 94 | 94 7 95 | 95 10 96 | 96 10 97 | 97 15 98 | 98 7 99 | 99 7 100 | 100 5 101 | 815 845 102 | 1305 1 103 | 
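The file above (`data/SRR1301329_1M_hist.txt`) is a counts histogram in
the format described in the README: the first column is a duplicate
count and the second is the number of distinct reads observed with that
count. preseq builds this histogram internally when given raw counts
(one count per line), but an equivalent file can be produced with
standard shell tools; this is only a sketch, and `counts.txt` is a
hypothetical input file:
```console
$ sort -n counts.txt | uniq -c | awk '{print $2 "\t" $1}' > counts_hist.txt
```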
-------------------------------------------------------------------------------- /data/additional_data.txt: -------------------------------------------------------------------------------- 1 | If you would like additional data, please use the following links: 2 | 3 | http://smithlabresearch.org/downloads/preseq/library_complex_test_data/SRR1003759.tar.bz2 4 | http://smithlabresearch.org/downloads/preseq/library_complex_test_data/SRR1041830.tar.bz2 5 | http://smithlabresearch.org/downloads/preseq/library_complex_test_data/SRR1106616.tar.bz2 6 | http://smithlabresearch.org/downloads/preseq/library_complex_test_data/SRR975260.tar.bz2 7 | http://smithlabresearch.org/downloads/preseq/library_complex_test_data/SRX314956.tar.bz2 8 | 9 | -------------------------------------------------------------------------------- /docs/FullExperiment_copy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/preseq/acc94f2957b15f51f917b7b8e0819c41c81c4949/docs/FullExperiment_copy.pdf -------------------------------------------------------------------------------- /docs/InitialExperimentComplexityCurves_copy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/preseq/acc94f2957b15f51f917b7b8e0819c41c81c4949/docs/InitialExperimentComplexityCurves_copy.pdf -------------------------------------------------------------------------------- /docs/RELEASE_NOTES.txt: -------------------------------------------------------------------------------- 1 | preseq release 3.0 (July 22, 2020) 2 | ------------------------------------------------------------------ 3 | 4 | Notable changes in preseq software: 5 | 6 | * Added GNU autotools for building preseq 7 | 8 | * Updated to use HTSLib directly, rather than source files from 9 | HTSLib, when preseq is built with support for BAM/SAM input 10 | format. 11 | 12 | preseq beta release 2.0.2 (December 1, 2015) 13 | ------------------------------------------------------------------ 14 | 15 | Notable changes in preseq software: 16 | 17 | * Fix a bug in defect mode to allow for estimation without 18 | checking the curves for defects. In such case, more 19 | bootstraps will help to smooth the curve. 20 | 21 | 22 | preseq beta release 2.0.0 (October 29, 2015) 23 | 24 | ------------------------------------------------------------------ 25 | 26 | Notable changes in preseq software: 27 | 28 | * Include the module bound_pop. This module constructs a 29 | nonparametric moment-based estimator of species richness, 30 | the total number of species or classes in the population. 31 | 32 | 33 | preseq beta release 1.0.3 (December 15, 2014) 34 | 35 | ------------------------------------------------------------------ 36 | 37 | Notable changes in preseq software: 38 | 39 | * Include defect mode to extrapolate without testing for 40 | defects. 41 | 42 | 43 | preseq beta release 1.0.2 (Aug 25, 2014) 44 | 45 | ------------------------------------------------------------------ 46 | 47 | Notable changes in preseq software: 48 | 49 | * Included gc_extrap option to predict genomic coverage for 50 | single cell sequencing experiments. 51 | 52 | * Changed the method of finding optimal continued fraction 53 | to improve performance for high variable (e.g. RNAseq) 54 | libraries. 
55 | 56 | 57 | preseq beta release 0.0.3 (Aug 5, 2013) 58 | 59 | ------------------------------------------------------------------ 60 | 61 | Notable changes in preseq software: 62 | 63 | * Prediction of the complexity curve is done using the observed data 64 | when possible. Previous versions bootstrapped the histogram and 65 | used the median estimate. Bootstrapping is only done to compute 66 | confidence intervals. 67 | 68 | * Addition of quick mode option with flag -Q. The complexity is 69 | predicted with the observed data and bootstrapping is not done, 70 | speeding up the computation time tremendously. 71 | 72 | * Fixed a bug associated with the unistd.h header for GCC versions 73 | 4.7+. 74 | 75 | * Extensively updated the manual with examples and FAQ 76 | 77 | 78 | preseq beta release 1.0.2 (Aug 25, 2014) 79 | 80 | ------------------------------------------------------------------ 81 | 82 | * Functions for input in header file load_data_for_complexity 83 | 84 | * Fix samtools linking problem 85 | 86 | If you have any questions, comments or bugs, please contact us at 87 | tdaley@usc.edu. Thank you for using preseq. 88 | -------------------------------------------------------------------------------- /docs/TCR_richness_vs_age_lm.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/preseq/acc94f2957b15f51f917b7b8e0819c41c81c4949/docs/TCR_richness_vs_age_lm.pdf -------------------------------------------------------------------------------- /docs/biblio.bib: -------------------------------------------------------------------------------- 1 | @article{heck1975explicit, 2 | title={Explicit Calculation of the Rarefaction Diversity Measurement and the Determination of Sufficient Sample Size}, 3 | author={Heck, Jr, Kenneth L and van Belle, Gerald and Simberloff, Daniel}, 4 | journal={Ecology}, 5 | volume={56}, 6 | number={6}, 7 | pages={1459--1461}, 8 | year={1975}, 9 | publisher={JSTOR} 10 | } 11 | 12 | @article{willis2015inference, 13 | title={Inference for changes in biodiversity}, 14 | author={Willis, Amy and Bunge, John and Whitman, Thea}, 15 | journal={arXiv preprint arXiv:1506.05710}, 16 | year={2015} 17 | } 18 | 19 | @article{britanova2014age, 20 | title={Age-related decrease in TCR repertoire diversity measured with deep and normalized sequence profiling}, 21 | author={Britanova, Olga V and Putintseva, Ekaterina V and Shugay, Mikhail and Merzlyak, Ekaterina M and Turchaninova, Maria A and Staroverov, Dmitriy B and Bolotin, Dmitriy A and Lukyanov, Sergey and Bogdanova, Ekaterina A and Mamedov, Ilgar Z and others}, 22 | journal={The Journal of Immunology}, 23 | volume={192}, 24 | number={6}, 25 | pages={2689--2698}, 26 | year={2014}, 27 | publisher={Am Assoc Immnol} 28 | } 29 | 30 | @article{daley2014modeling, 31 | title={Modeling genome coverage in single-cell sequencing}, 32 | author={Daley, Timothy and Smith, Andrew D}, 33 | journal={Bioinformatics}, 34 | pages={btu540}, 35 | year={2014}, 36 | publisher={Oxford Univ Press} 37 | } 38 | 39 | 40 | 41 | @article{fu2015uniform, 42 | title={Uniform and accurate single-cell sequencing based on emulsion whole-genome amplification}, 43 | author={Fu, Yusi and Li, Chunmei and Lu, Sijia and Zhou, Wenxiong and Tang, Fuchou and Xie, X Sunney and Huang, Yanyi}, 44 | journal={Proceedings of the National Academy of Sciences}, 45 | volume={112}, 46 | number={38}, 47 | pages={11923--11928}, 48 | year={2015}, 49 | publisher={National Acad Sciences} 50 | } 51 | 52 | 
@article{chao1987estimating, 53 | title={Estimating the population size for capture-recapture data with unequal catchability}, 54 | author={Chao, Anne}, 55 | journal={Biometrics}, 56 | pages={783--791}, 57 | year={1987}, 58 | publisher={JSTOR} 59 | } 60 | 61 | @article{zelterman1988robust, 62 | title={Robust estimation in truncated discrete distributions with application to capture-recapture experiments}, 63 | author={Zelterman, Daniel}, 64 | journal={Journal of statistical planning and inference}, 65 | volume={18}, 66 | number={2}, 67 | pages={225--237}, 68 | year={1988}, 69 | publisher={Elsevier} 70 | } 71 | 72 | @article{good1956number, 73 | author = "Good, I. J. and Toulmin, G. H.", 74 | title = {The number of new species, and the increase in population coverage, when a sample is increased}, 75 | journal = {Biometrika}, 76 | volume = {43}, 77 | year = {1956}, 78 | pages = {45--63} 79 | } 80 | 81 | @article{kivioja2011counting, 82 | title={Counting absolute numbers of molecules using unique molecular identifiers}, 83 | author={Kivioja, T. and V{\"a}h{\"a}rautio, A. and Karlsson, K. and Bonke, M. and Enge, M. and Linnarsson, S. and Taipale, J.}, 84 | journal={Nature Methods}, 85 | year={2012}, 86 | volume={9}, 87 | pages={72--74}, 88 | publisher={Nature Publishing Group} 89 | } 90 | 91 | @article{lu2012probing, 92 | title={Probing meiotic recombination and aneuploidy of single sperm cells by whole-genome sequencing}, 93 | author={Lu, Sijia and Zong, Chenghang and Fan, Wei and Yang, Mingyu and Li, Jinsen and Chapman, Alec R and Zhu, Ping and Hu, Xuesong and Xu, Liya and Yan, Liying and others}, 94 | journal={Science}, 95 | volume={338}, 96 | number={6114}, 97 | pages={1627--1630}, 98 | year={2012}, 99 | publisher={American Association for the Advancement of Science} 100 | } 101 | 102 | @article{mercer2011targeted, 103 | title={Targeted {RNA} sequencing reveals the deep complexity of the human transcriptome}, 104 | author={Mercer, T.R. and Gerhardt, D.J. and Dinger, M.E. and Crawford, J. and Trapnell, C. and Jeddeloh, J.A. and Mattick, J.S. 
and Rinn, J.L.}, 105 | journal={Nature Biotechnology}, 106 | volume={30}, 107 | number={1}, 108 | pages={99--104}, 109 | year={2011}, 110 | publisher={Nature Publishing Group} 111 | } 112 | 113 | @article{van2010most, 114 | title={Most dark matter transcripts are associated with known genes}, 115 | author={van Bakel, Harm and Nislow, Corey and Blencowe, Benjamin J and Hughes, Timothy R}, 116 | journal={PLoS biology}, 117 | volume={8}, 118 | number={5}, 119 | pages={e1000371}, 120 | year={2010}, 121 | publisher={Public Library of Science} 122 | } 123 | 124 | @article{clark2011reality, 125 | title={The reality of pervasive transcription}, 126 | author={Clark, Michael B and Amaral, Paulo P and Schlesinger, Felix J and Dinger, Marcel E and Taft, Ryan J and Rinn, John L and Ponting, Chris P and Stadler, Peter F and Morris, Kevin V and Morillon, Antonin and others}, 127 | journal={PLoS biology}, 128 | volume={9}, 129 | number={7}, 130 | pages={e1000625}, 131 | year={2011}, 132 | publisher={Public Library of Science} 133 | } -------------------------------------------------------------------------------- /docs/compare_RNA_Capture_junction_complexity.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/preseq/acc94f2957b15f51f917b7b8e0819c41c81c4949/docs/compare_RNA_Capture_junction_complexity.pdf -------------------------------------------------------------------------------- /docs/comparing_scWGA_coverage.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/preseq/acc94f2957b15f51f917b7b8e0819c41c81c4949/docs/comparing_scWGA_coverage.pdf -------------------------------------------------------------------------------- /docs/manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/preseq/acc94f2957b15f51f917b7b8e0819c41c81c4949/docs/manual.pdf -------------------------------------------------------------------------------- /documentation/README.md: -------------------------------------------------------------------------------- 1 | # preseq documentation 2 | 3 | This is the (new) documentation for preseq that uses 4 | [mkdocs](https://mkdocs.readthedocs.io) to generate readthedocs pages. 5 | The public web version of this documentation is available at 6 | [preseq.readthedocs.io](https://preseq.readthedocs.io), but if you 7 | wish to view the documentation offline in a web browser, you can 8 | build the documentation locally as described below. 9 | 10 | ### Dependencies 11 | 12 | To build the documentation locally, install mkdocs 13 | ```console 14 | pip install -U mkdocs 15 | ``` 16 | 17 | ### Local compilation 18 | 19 | Build the HTML documentation by running 20 | ```console 21 | mkdocs build 22 | ``` 23 | which will create a `site` directory where markdown files are 24 | converted to HTML. 25 | 26 | Serve the HTML documentation locally by running 27 | ```console 28 | mkdocs serve 29 | ``` 30 | This will serve the documentation, usually at http://localhost:8000 . 31 | -------------------------------------------------------------------------------- /documentation/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two.
6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /documentation/docs/quickstart.md: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | ## Installation via conda 5 | 6 | If you know how to use conda then preseq is available among the 7 | bioconda recipes. You can install it as follows if you have a conda 8 | environment activated: 9 | 10 | ```console 11 | $ conda install -c bioconda preseq 12 | ``` 13 | 14 | The instructions for installing conda are 15 | [here](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html). 16 | -------------------------------------------------------------------------------- /documentation/docs/requirements.txt: -------------------------------------------------------------------------------- 1 | jinja2>=3.1.4 2 | mkdocs>=1.3.1 3 | babel>=2.9.0 4 | click>=7.0 5 | Jinja2>=3.1.4 6 | Markdown>=3.2.1,<3.4 7 | PyYAML>=5.2 8 | watchdog>=2.0.0 9 | mdx_gh_links>=0.2 10 | ghp-import>=1.0 11 | pyyaml_env_tag>=0.1 12 | mkdocs-redirects>=1.0.1 13 | importlib_metadata>=4.3 14 | packaging>=20.5 15 | mergedeep>=1.3.4 16 | pygments>=2.12 17 | pymdown-extensions 18 | mkdocs-material 19 | -------------------------------------------------------------------------------- /documentation/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: preseq 2 | strict: true 3 | 4 | theme: readthedocs 5 | nav: 6 | - Home: 'index.md' 7 | - 'Installation': 'quickstart.md' 8 | - 'preseq on GitHub' : https://github.com/smithlabcode/preseq 9 | -------------------------------------------------------------------------------- /m4/ax_cxx_check_lib.m4: -------------------------------------------------------------------------------- 1 | dnl @synopsis AX_CXX_CHECK_LIB(libname, functioname, action-if, action-if-not) 2 | dnl 3 | dnl The standard AC_CHECK_LIB can not test functions in namespaces. 4 | dnl Therefore AC_CHECK_LIB(cgicc, cgicc::Cgicc::getVersion) will always 5 | dnl fail. We need to decompose the functionname into a series of namespaces 6 | dnl where it gets declared so that it can be used for a link test. 7 | dnl 8 | dnl In the first version I did allow namespace::functionname to be a 9 | dnl reference to a void-argument global functionname (just wrapped in a 10 | dnl namespace) like its C counterparts would be - but in reality such 11 | dnl thing does not exist. The only global / static functions are always 12 | dnl made const-functions which is an attribute mangled along into the 13 | dnl library function export name. 
14 | dnl 15 | dnl The normal usage will ask for a test of a class-member function which 16 | dnl should be presented with a full function spec with arguments given in 17 | dnl parentheses following the function name - if the function to test for 18 | dnl does expect arguments then you should add default initial values in the 19 | dnl prototype (even if they do not exist originally, these are used only 20 | dnl locally to build a correct function call in the configure test script). 21 | dnl 22 | dnl In the current version if you do omit the parenthesis from the macro 23 | dnl argument then the macro will assume that you want to check for the 24 | dnl class name - which is really to check for default constructor being 25 | dnl exported from the given library name. 26 | dnl 27 | dnl EXAMPLE: 28 | dnl AX_CXX_CHECK_LIB(cgicc, [cgicc::HTTPCookie]) 29 | dnl AX_CXX_CHECK_LIB(cgicc, [cgicc::Cgicc::getVersion () const], 30 | dnl AX_CXX_CHECK_LIB(boost_regex, [boost::RegEx::Position (int i = 0) const]) 31 | dnl 32 | dnl Result: 33 | dnl Just as the usual AX_CXX_CHECK_LIB - defines HAVE_LIBCGICC 34 | dnl and adds the libraries to the default library path (and 35 | dnl uses internally the normal ac_check_lib cache symbol 36 | dnl like ac_cv_lib_cgicc_cgicc__Cgicc) 37 | dnl 38 | dnl Footnote: The C++ language is not good at creating stable library 39 | dnl interfaces at the binary level - a lot of functionality is usually being 40 | dnl given as inline functions plus there is hardly a chance to create opaque 41 | dnl types. Therefore most C++ library tests will only do compile tests using 42 | dnl the header files. Doing a check_lib is however good to check the link 43 | dnl dependency before hitting it as an error in the build later. 44 | dnl 45 | dnl @category C++ 46 | dnl @author Guido U. 
Draheim 47 | dnl @vesion 2006-12-18 48 | 49 | AC_DEFUN([AX_CXX_CHECK_LIB], 50 | [m4_ifval([$3], , [AH_CHECK_LIB([$1])])dnl 51 | AS_LITERAL_IF([$1], 52 | [AS_VAR_PUSHDEF([ac_Lib], [ac_cv_lib_$1_$2])], 53 | [AS_VAR_PUSHDEF([ac_Lib], [ac_cv_lib_$1''_$2])])dnl 54 | AC_CACHE_CHECK([for $2 in -l$1], ac_Lib, 55 | [ac_check_lib_save_LIBS=$LIBS 56 | LIBS="-l$1 $5 $LIBS" 57 | case "$2" 58 | in *::*::*\(*) 59 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 60 | namespace `echo "$2" | sed -e "s/::.*//"` 61 | { class `echo "$2" | sed -e "s/.*::\\(.*\\)::.*/\\1/" -e "s/(.*//"` 62 | { public: int `echo "$2" | sed -e "s/.*:://" -e "/(/!s/..*/&()/"`; 63 | }; 64 | } 65 | ],[`echo "$2" | sed -e "s/(.*//" -e "s/\\(.*\\)::\\(.*\\)/((\\1*)(0))->\\2/g"`()])], 66 | [AS_VAR_SET(ac_Lib, yes)], 67 | [AS_VAR_SET(ac_Lib, no)]) 68 | ;; *::*::*) 69 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 70 | namespace `echo "$2" | sed -e "s/::.*//"` 71 | { namespace `echo "$2" | sed -e "s/.*::\\(.*\\)::.*/\\1/"` 72 | { class `echo "$2" | sed -e "s/.*:://"` 73 | { public: `echo "$2" | sed -e "s/.*:://"` (); 74 | }; 75 | } 76 | } 77 | ],[new $2()])], 78 | [AS_VAR_SET(ac_Lib, yes)], 79 | [AS_VAR_SET(ac_Lib, no)]) 80 | ;; *::*\(*) 81 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 82 | class `echo "$2" | sed -e "s/\\(.*\\)::.*/\\1/" -e "s/(.*//"` 83 | { public: int `echo "$2" | sed -e "s/.*:://" -e "/(/!s/..*/&()/"`; 84 | }; 85 | ],[`echo "$2" | sed -e "s/(.*//" -e "s/\\(.*\\)::\\(.*\\)/((\\1*)(0))->\\2/g"`()])], 86 | [AS_VAR_SET(ac_Lib, yes)], 87 | [AS_VAR_SET(ac_Lib, no)]) 88 | ;; *::*) 89 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 90 | namespace `echo "$2" | sed -e "s/::.*//"` 91 | { class `echo "$2" | sed -e "s/.*:://"` 92 | { public: `echo "$2" | sed -e "s/.*:://"` (); 93 | }; 94 | } 95 | ],[new $2()])], 96 | [AS_VAR_SET(ac_Lib, yes)], 97 | [AS_VAR_SET(ac_Lib, no)]) 98 | ;; *) 99 | AC_LINK_IFELSE([AC_LANG_CALL([], [$2])], 100 | [AS_VAR_SET(ac_Lib, yes)], 101 | [AS_VAR_SET(ac_Lib, no)]) 102 | ;; esac 103 | LIBS=$ac_check_lib_save_LIBS]) 104 | AS_IF([test AS_VAR_GET(ac_Lib) = yes], 105 | [m4_default([$3], [AC_DEFINE_UNQUOTED(AS_TR_CPP(HAVE_LIB$1)) 106 | LIBS="-l$1 $LIBS" 107 | ])], 108 | [$4])dnl 109 | AS_VAR_POPDEF([ac_Lib])dnl 110 | ])# AC_CHECK_LIB 111 | -------------------------------------------------------------------------------- /m4/ax_cxx_compile_stdcxx.m4: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx.html 3 | # =========================================================================== 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_CXX_COMPILE_STDCXX(VERSION, [ext|noext], [mandatory|optional]) 8 | # 9 | # DESCRIPTION 10 | # 11 | # Check for baseline language coverage in the compiler for the specified 12 | # version of the C++ standard. If necessary, add switches to CXX and 13 | # CXXCPP to enable support. VERSION may be '11' (for the C++11 standard) 14 | # or '14' (for the C++14 standard). 15 | # 16 | # The second argument, if specified, indicates whether you insist on an 17 | # extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g. 18 | # -std=c++11). If neither is specified, you get whatever works, with 19 | # preference for an extended mode. 
20 | # 21 | # The third argument, if specified 'mandatory' or if left unspecified, 22 | # indicates that baseline support for the specified C++ standard is 23 | # required and that the macro should error out if no mode with that 24 | # support is found. If specified 'optional', then configuration proceeds 25 | # regardless, after defining HAVE_CXX${VERSION} if and only if a 26 | # supporting mode is found. 27 | # 28 | # LICENSE 29 | # 30 | # Copyright (c) 2008 Benjamin Kosnik 31 | # Copyright (c) 2012 Zack Weinberg 32 | # Copyright (c) 2013 Roy Stogner 33 | # Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov 34 | # Copyright (c) 2015 Paul Norman 35 | # Copyright (c) 2015 Moritz Klammler 36 | # Copyright (c) 2016, 2018 Krzesimir Nowak 37 | # Copyright (c) 2019 Enji Cooper 38 | # 39 | # Copying and distribution of this file, with or without modification, are 40 | # permitted in any medium without royalty provided the copyright notice 41 | # and this notice are preserved. This file is offered as-is, without any 42 | # warranty. 43 | 44 | #serial 11 45 | 46 | dnl This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro 47 | dnl (serial version number 13). 48 | 49 | AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl 50 | m4_if([$1], [11], [ax_cxx_compile_alternatives="11 0x"], 51 | [$1], [14], [ax_cxx_compile_alternatives="14 1y"], 52 | [$1], [17], [ax_cxx_compile_alternatives="17 1z"], 53 | [m4_fatal([invalid first argument `$1' to AX_CXX_COMPILE_STDCXX])])dnl 54 | m4_if([$2], [], [], 55 | [$2], [ext], [], 56 | [$2], [noext], [], 57 | [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX])])dnl 58 | m4_if([$3], [], [ax_cxx_compile_cxx$1_required=true], 59 | [$3], [mandatory], [ax_cxx_compile_cxx$1_required=true], 60 | [$3], [optional], [ax_cxx_compile_cxx$1_required=false], 61 | [m4_fatal([invalid third argument `$3' to AX_CXX_COMPILE_STDCXX])]) 62 | AC_LANG_PUSH([C++])dnl 63 | ac_success=no 64 | 65 | m4_if([$2], [noext], [], [dnl 66 | if test x$ac_success = xno; then 67 | for alternative in ${ax_cxx_compile_alternatives}; do 68 | switch="-std=gnu++${alternative}" 69 | cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch]) 70 | AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch, 71 | $cachevar, 72 | [ac_save_CXX="$CXX" 73 | CXX="$CXX $switch" 74 | AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])], 75 | [eval $cachevar=yes], 76 | [eval $cachevar=no]) 77 | CXX="$ac_save_CXX"]) 78 | if eval test x\$$cachevar = xyes; then 79 | CXX="$CXX $switch" 80 | if test -n "$CXXCPP" ; then 81 | CXXCPP="$CXXCPP $switch" 82 | fi 83 | ac_success=yes 84 | break 85 | fi 86 | done 87 | fi]) 88 | 89 | m4_if([$2], [ext], [], [dnl 90 | if test x$ac_success = xno; then 91 | dnl HP's aCC needs +std=c++11 according to: 92 | dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf 93 | dnl Cray's crayCC needs "-h std=c++11" 94 | for alternative in ${ax_cxx_compile_alternatives}; do 95 | for switch in -std=c++${alternative} +std=c++${alternative} "-h std=c++${alternative}"; do 96 | cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch]) 97 | AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch, 98 | $cachevar, 99 | [ac_save_CXX="$CXX" 100 | CXX="$CXX $switch" 101 | AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])], 102 | [eval $cachevar=yes], 103 | [eval $cachevar=no]) 104 | CXX="$ac_save_CXX"]) 105 | if eval test x\$$cachevar = xyes; then 106 | CXX="$CXX $switch" 107 | if test -n 
"$CXXCPP" ; then 108 | CXXCPP="$CXXCPP $switch" 109 | fi 110 | ac_success=yes 111 | break 112 | fi 113 | done 114 | if test x$ac_success = xyes; then 115 | break 116 | fi 117 | done 118 | fi]) 119 | AC_LANG_POP([C++]) 120 | if test x$ax_cxx_compile_cxx$1_required = xtrue; then 121 | if test x$ac_success = xno; then 122 | AC_MSG_ERROR([*** A compiler with support for C++$1 language features is required.]) 123 | fi 124 | fi 125 | if test x$ac_success = xno; then 126 | HAVE_CXX$1=0 127 | AC_MSG_NOTICE([No compiler with C++$1 support was found]) 128 | else 129 | HAVE_CXX$1=1 130 | AC_DEFINE(HAVE_CXX$1,1, 131 | [define if the compiler supports basic C++$1 syntax]) 132 | fi 133 | AC_SUBST(HAVE_CXX$1) 134 | ]) 135 | 136 | 137 | dnl Test body for checking C++11 support 138 | 139 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_11], 140 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 141 | ) 142 | 143 | 144 | dnl Test body for checking C++14 support 145 | 146 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14], 147 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 148 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 149 | ) 150 | 151 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_17], 152 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 153 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 154 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_17 155 | ) 156 | 157 | dnl Tests for new features in C++11 158 | 159 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[ 160 | 161 | // If the compiler admits that it is not ready for C++11, why torture it? 162 | // Hopefully, this will speed up the test. 163 | 164 | #ifndef __cplusplus 165 | 166 | #error "This is not a C++ compiler" 167 | 168 | #elif __cplusplus < 201103L 169 | 170 | #error "This is not a C++11 compiler" 171 | 172 | #else 173 | 174 | namespace cxx11 175 | { 176 | 177 | namespace test_static_assert 178 | { 179 | 180 | template 181 | struct check 182 | { 183 | static_assert(sizeof(int) <= sizeof(T), "not big enough"); 184 | }; 185 | 186 | } 187 | 188 | namespace test_final_override 189 | { 190 | 191 | struct Base 192 | { 193 | virtual ~Base() {} 194 | virtual void f() {} 195 | }; 196 | 197 | struct Derived : public Base 198 | { 199 | virtual ~Derived() override {} 200 | virtual void f() override {} 201 | }; 202 | 203 | } 204 | 205 | namespace test_double_right_angle_brackets 206 | { 207 | 208 | template < typename T > 209 | struct check {}; 210 | 211 | typedef check single_type; 212 | typedef check> double_type; 213 | typedef check>> triple_type; 214 | typedef check>>> quadruple_type; 215 | 216 | } 217 | 218 | namespace test_decltype 219 | { 220 | 221 | int 222 | f() 223 | { 224 | int a = 1; 225 | decltype(a) b = 2; 226 | return a + b; 227 | } 228 | 229 | } 230 | 231 | namespace test_type_deduction 232 | { 233 | 234 | template < typename T1, typename T2 > 235 | struct is_same 236 | { 237 | static const bool value = false; 238 | }; 239 | 240 | template < typename T > 241 | struct is_same 242 | { 243 | static const bool value = true; 244 | }; 245 | 246 | template < typename T1, typename T2 > 247 | auto 248 | add(T1 a1, T2 a2) -> decltype(a1 + a2) 249 | { 250 | return a1 + a2; 251 | } 252 | 253 | int 254 | test(const int c, volatile int v) 255 | { 256 | static_assert(is_same::value == true, ""); 257 | static_assert(is_same::value == false, ""); 258 | static_assert(is_same::value == false, ""); 259 | auto ac = c; 260 | auto av = v; 261 | auto sumi = ac + av + 'x'; 262 | auto sumf = ac + av + 1.0; 263 | static_assert(is_same::value == true, ""); 264 | static_assert(is_same::value == 
true, ""); 265 | static_assert(is_same::value == true, ""); 266 | static_assert(is_same::value == false, ""); 267 | static_assert(is_same::value == true, ""); 268 | return (sumf > 0.0) ? sumi : add(c, v); 269 | } 270 | 271 | } 272 | 273 | namespace test_noexcept 274 | { 275 | 276 | int f() { return 0; } 277 | int g() noexcept { return 0; } 278 | 279 | static_assert(noexcept(f()) == false, ""); 280 | static_assert(noexcept(g()) == true, ""); 281 | 282 | } 283 | 284 | namespace test_constexpr 285 | { 286 | 287 | template < typename CharT > 288 | unsigned long constexpr 289 | strlen_c_r(const CharT *const s, const unsigned long acc) noexcept 290 | { 291 | return *s ? strlen_c_r(s + 1, acc + 1) : acc; 292 | } 293 | 294 | template < typename CharT > 295 | unsigned long constexpr 296 | strlen_c(const CharT *const s) noexcept 297 | { 298 | return strlen_c_r(s, 0UL); 299 | } 300 | 301 | static_assert(strlen_c("") == 0UL, ""); 302 | static_assert(strlen_c("1") == 1UL, ""); 303 | static_assert(strlen_c("example") == 7UL, ""); 304 | static_assert(strlen_c("another\0example") == 7UL, ""); 305 | 306 | } 307 | 308 | namespace test_rvalue_references 309 | { 310 | 311 | template < int N > 312 | struct answer 313 | { 314 | static constexpr int value = N; 315 | }; 316 | 317 | answer<1> f(int&) { return answer<1>(); } 318 | answer<2> f(const int&) { return answer<2>(); } 319 | answer<3> f(int&&) { return answer<3>(); } 320 | 321 | void 322 | test() 323 | { 324 | int i = 0; 325 | const int c = 0; 326 | static_assert(decltype(f(i))::value == 1, ""); 327 | static_assert(decltype(f(c))::value == 2, ""); 328 | static_assert(decltype(f(0))::value == 3, ""); 329 | } 330 | 331 | } 332 | 333 | namespace test_uniform_initialization 334 | { 335 | 336 | struct test 337 | { 338 | static const int zero {}; 339 | static const int one {1}; 340 | }; 341 | 342 | static_assert(test::zero == 0, ""); 343 | static_assert(test::one == 1, ""); 344 | 345 | } 346 | 347 | namespace test_lambdas 348 | { 349 | 350 | void 351 | test1() 352 | { 353 | auto lambda1 = [](){}; 354 | auto lambda2 = lambda1; 355 | lambda1(); 356 | lambda2(); 357 | } 358 | 359 | int 360 | test2() 361 | { 362 | auto a = [](int i, int j){ return i + j; }(1, 2); 363 | auto b = []() -> int { return '0'; }(); 364 | auto c = [=](){ return a + b; }(); 365 | auto d = [&](){ return c; }(); 366 | auto e = [a, &b](int x) mutable { 367 | const auto identity = [](int y){ return y; }; 368 | for (auto i = 0; i < a; ++i) 369 | a += b--; 370 | return x + identity(a + b); 371 | }(0); 372 | return a + b + c + d + e; 373 | } 374 | 375 | int 376 | test3() 377 | { 378 | const auto nullary = [](){ return 0; }; 379 | const auto unary = [](int x){ return x; }; 380 | using nullary_t = decltype(nullary); 381 | using unary_t = decltype(unary); 382 | const auto higher1st = [](nullary_t f){ return f(); }; 383 | const auto higher2nd = [unary](nullary_t f1){ 384 | return [unary, f1](unary_t f2){ return f2(unary(f1())); }; 385 | }; 386 | return higher1st(nullary) + higher2nd(nullary)(unary); 387 | } 388 | 389 | } 390 | 391 | namespace test_variadic_templates 392 | { 393 | 394 | template 395 | struct sum; 396 | 397 | template 398 | struct sum 399 | { 400 | static constexpr auto value = N0 + sum::value; 401 | }; 402 | 403 | template <> 404 | struct sum<> 405 | { 406 | static constexpr auto value = 0; 407 | }; 408 | 409 | static_assert(sum<>::value == 0, ""); 410 | static_assert(sum<1>::value == 1, ""); 411 | static_assert(sum<23>::value == 23, ""); 412 | static_assert(sum<1, 2>::value == 3, ""); 
413 | static_assert(sum<5, 5, 11>::value == 21, ""); 414 | static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, ""); 415 | 416 | } 417 | 418 | // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae 419 | // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function 420 | // because of this. 421 | namespace test_template_alias_sfinae 422 | { 423 | 424 | struct foo {}; 425 | 426 | template 427 | using member = typename T::member_type; 428 | 429 | template 430 | void func(...) {} 431 | 432 | template 433 | void func(member*) {} 434 | 435 | void test(); 436 | 437 | void test() { func(0); } 438 | 439 | } 440 | 441 | } // namespace cxx11 442 | 443 | #endif // __cplusplus >= 201103L 444 | 445 | ]]) 446 | 447 | 448 | dnl Tests for new features in C++14 449 | 450 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[ 451 | 452 | // If the compiler admits that it is not ready for C++14, why torture it? 453 | // Hopefully, this will speed up the test. 454 | 455 | #ifndef __cplusplus 456 | 457 | #error "This is not a C++ compiler" 458 | 459 | #elif __cplusplus < 201402L 460 | 461 | #error "This is not a C++14 compiler" 462 | 463 | #else 464 | 465 | namespace cxx14 466 | { 467 | 468 | namespace test_polymorphic_lambdas 469 | { 470 | 471 | int 472 | test() 473 | { 474 | const auto lambda = [](auto&&... args){ 475 | const auto istiny = [](auto x){ 476 | return (sizeof(x) == 1UL) ? 1 : 0; 477 | }; 478 | const int aretiny[] = { istiny(args)... }; 479 | return aretiny[0]; 480 | }; 481 | return lambda(1, 1L, 1.0f, '1'); 482 | } 483 | 484 | } 485 | 486 | namespace test_binary_literals 487 | { 488 | 489 | constexpr auto ivii = 0b0000000000101010; 490 | static_assert(ivii == 42, "wrong value"); 491 | 492 | } 493 | 494 | namespace test_generalized_constexpr 495 | { 496 | 497 | template < typename CharT > 498 | constexpr unsigned long 499 | strlen_c(const CharT *const s) noexcept 500 | { 501 | auto length = 0UL; 502 | for (auto p = s; *p; ++p) 503 | ++length; 504 | return length; 505 | } 506 | 507 | static_assert(strlen_c("") == 0UL, ""); 508 | static_assert(strlen_c("x") == 1UL, ""); 509 | static_assert(strlen_c("test") == 4UL, ""); 510 | static_assert(strlen_c("another\0test") == 7UL, ""); 511 | 512 | } 513 | 514 | namespace test_lambda_init_capture 515 | { 516 | 517 | int 518 | test() 519 | { 520 | auto x = 0; 521 | const auto lambda1 = [a = x](int b){ return a + b; }; 522 | const auto lambda2 = [a = lambda1(x)](){ return a; }; 523 | return lambda2(); 524 | } 525 | 526 | } 527 | 528 | namespace test_digit_separators 529 | { 530 | 531 | constexpr auto ten_million = 100'000'000; 532 | static_assert(ten_million == 100000000, ""); 533 | 534 | } 535 | 536 | namespace test_return_type_deduction 537 | { 538 | 539 | auto f(int& x) { return x; } 540 | decltype(auto) g(int& x) { return x; } 541 | 542 | template < typename T1, typename T2 > 543 | struct is_same 544 | { 545 | static constexpr auto value = false; 546 | }; 547 | 548 | template < typename T > 549 | struct is_same 550 | { 551 | static constexpr auto value = true; 552 | }; 553 | 554 | int 555 | test() 556 | { 557 | auto x = 0; 558 | static_assert(is_same::value, ""); 559 | static_assert(is_same::value, ""); 560 | return x; 561 | } 562 | 563 | } 564 | 565 | } // namespace cxx14 566 | 567 | #endif // __cplusplus >= 201402L 568 | 569 | ]]) 570 | 571 | 572 | dnl Tests for new features in C++17 573 | 574 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_17], [[ 575 | 576 | // If the compiler admits that it is not ready for 
C++17, why torture it? 577 | // Hopefully, this will speed up the test. 578 | 579 | #ifndef __cplusplus 580 | 581 | #error "This is not a C++ compiler" 582 | 583 | #elif __cplusplus < 201703L 584 | 585 | #error "This is not a C++17 compiler" 586 | 587 | #else 588 | 589 | #include 590 | #include 591 | #include 592 | 593 | namespace cxx17 594 | { 595 | 596 | namespace test_constexpr_lambdas 597 | { 598 | 599 | constexpr int foo = [](){return 42;}(); 600 | 601 | } 602 | 603 | namespace test::nested_namespace::definitions 604 | { 605 | 606 | } 607 | 608 | namespace test_fold_expression 609 | { 610 | 611 | template 612 | int multiply(Args... args) 613 | { 614 | return (args * ... * 1); 615 | } 616 | 617 | template 618 | bool all(Args... args) 619 | { 620 | return (args && ...); 621 | } 622 | 623 | } 624 | 625 | namespace test_extended_static_assert 626 | { 627 | 628 | static_assert (true); 629 | 630 | } 631 | 632 | namespace test_auto_brace_init_list 633 | { 634 | 635 | auto foo = {5}; 636 | auto bar {5}; 637 | 638 | static_assert(std::is_same, decltype(foo)>::value); 639 | static_assert(std::is_same::value); 640 | } 641 | 642 | namespace test_typename_in_template_template_parameter 643 | { 644 | 645 | template typename X> struct D; 646 | 647 | } 648 | 649 | namespace test_fallthrough_nodiscard_maybe_unused_attributes 650 | { 651 | 652 | int f1() 653 | { 654 | return 42; 655 | } 656 | 657 | [[nodiscard]] int f2() 658 | { 659 | [[maybe_unused]] auto unused = f1(); 660 | 661 | switch (f1()) 662 | { 663 | case 17: 664 | f1(); 665 | [[fallthrough]]; 666 | case 42: 667 | f1(); 668 | } 669 | return f1(); 670 | } 671 | 672 | } 673 | 674 | namespace test_extended_aggregate_initialization 675 | { 676 | 677 | struct base1 678 | { 679 | int b1, b2 = 42; 680 | }; 681 | 682 | struct base2 683 | { 684 | base2() { 685 | b3 = 42; 686 | } 687 | int b3; 688 | }; 689 | 690 | struct derived : base1, base2 691 | { 692 | int d; 693 | }; 694 | 695 | derived d1 {{1, 2}, {}, 4}; // full initialization 696 | derived d2 {{}, {}, 4}; // value-initialized bases 697 | 698 | } 699 | 700 | namespace test_general_range_based_for_loop 701 | { 702 | 703 | struct iter 704 | { 705 | int i; 706 | 707 | int& operator* () 708 | { 709 | return i; 710 | } 711 | 712 | const int& operator* () const 713 | { 714 | return i; 715 | } 716 | 717 | iter& operator++() 718 | { 719 | ++i; 720 | return *this; 721 | } 722 | }; 723 | 724 | struct sentinel 725 | { 726 | int i; 727 | }; 728 | 729 | bool operator== (const iter& i, const sentinel& s) 730 | { 731 | return i.i == s.i; 732 | } 733 | 734 | bool operator!= (const iter& i, const sentinel& s) 735 | { 736 | return !(i == s); 737 | } 738 | 739 | struct range 740 | { 741 | iter begin() const 742 | { 743 | return {0}; 744 | } 745 | 746 | sentinel end() const 747 | { 748 | return {5}; 749 | } 750 | }; 751 | 752 | void f() 753 | { 754 | range r {}; 755 | 756 | for (auto i : r) 757 | { 758 | [[maybe_unused]] auto v = i; 759 | } 760 | } 761 | 762 | } 763 | 764 | namespace test_lambda_capture_asterisk_this_by_value 765 | { 766 | 767 | struct t 768 | { 769 | int i; 770 | int foo() 771 | { 772 | return [*this]() 773 | { 774 | return i; 775 | }(); 776 | } 777 | }; 778 | 779 | } 780 | 781 | namespace test_enum_class_construction 782 | { 783 | 784 | enum class byte : unsigned char 785 | {}; 786 | 787 | byte foo {42}; 788 | 789 | } 790 | 791 | namespace test_constexpr_if 792 | { 793 | 794 | template 795 | int f () 796 | { 797 | if constexpr(cond) 798 | { 799 | return 13; 800 | } 801 | else 802 | { 803 | 
return 42; 804 | } 805 | } 806 | 807 | } 808 | 809 | namespace test_selection_statement_with_initializer 810 | { 811 | 812 | int f() 813 | { 814 | return 13; 815 | } 816 | 817 | int f2() 818 | { 819 | if (auto i = f(); i > 0) 820 | { 821 | return 3; 822 | } 823 | 824 | switch (auto i = f(); i + 4) 825 | { 826 | case 17: 827 | return 2; 828 | 829 | default: 830 | return 1; 831 | } 832 | } 833 | 834 | } 835 | 836 | namespace test_template_argument_deduction_for_class_templates 837 | { 838 | 839 | template 840 | struct pair 841 | { 842 | pair (T1 p1, T2 p2) 843 | : m1 {p1}, 844 | m2 {p2} 845 | {} 846 | 847 | T1 m1; 848 | T2 m2; 849 | }; 850 | 851 | void f() 852 | { 853 | [[maybe_unused]] auto p = pair{13, 42u}; 854 | } 855 | 856 | } 857 | 858 | namespace test_non_type_auto_template_parameters 859 | { 860 | 861 | template 862 | struct B 863 | {}; 864 | 865 | B<5> b1; 866 | B<'a'> b2; 867 | 868 | } 869 | 870 | namespace test_structured_bindings 871 | { 872 | 873 | int arr[2] = { 1, 2 }; 874 | std::pair pr = { 1, 2 }; 875 | 876 | auto f1() -> int(&)[2] 877 | { 878 | return arr; 879 | } 880 | 881 | auto f2() -> std::pair& 882 | { 883 | return pr; 884 | } 885 | 886 | struct S 887 | { 888 | int x1 : 2; 889 | volatile double y1; 890 | }; 891 | 892 | S f3() 893 | { 894 | return {}; 895 | } 896 | 897 | auto [ x1, y1 ] = f1(); 898 | auto& [ xr1, yr1 ] = f1(); 899 | auto [ x2, y2 ] = f2(); 900 | auto& [ xr2, yr2 ] = f2(); 901 | const auto [ x3, y3 ] = f3(); 902 | 903 | } 904 | 905 | namespace test_exception_spec_type_system 906 | { 907 | 908 | struct Good {}; 909 | struct Bad {}; 910 | 911 | void g1() noexcept; 912 | void g2(); 913 | 914 | template 915 | Bad 916 | f(T*, T*); 917 | 918 | template 919 | Good 920 | f(T1*, T2*); 921 | 922 | static_assert (std::is_same_v); 923 | 924 | } 925 | 926 | namespace test_inline_variables 927 | { 928 | 929 | template void f(T) 930 | {} 931 | 932 | template inline T g(T) 933 | { 934 | return T{}; 935 | } 936 | 937 | template<> inline void f<>(int) 938 | {} 939 | 940 | template<> int g<>(int) 941 | { 942 | return 5; 943 | } 944 | 945 | } 946 | 947 | } // namespace cxx17 948 | 949 | #endif // __cplusplus < 201703L 950 | 951 | ]]) 952 | -------------------------------------------------------------------------------- /m4/ax_cxx_compile_stdcxx_17.m4: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_17.html 3 | # ============================================================================= 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_CXX_COMPILE_STDCXX_17([ext|noext], [mandatory|optional]) 8 | # 9 | # DESCRIPTION 10 | # 11 | # Check for baseline language coverage in the compiler for the C++17 12 | # standard; if necessary, add switches to CXX and CXXCPP to enable 13 | # support. 14 | # 15 | # This macro is a convenience alias for calling the AX_CXX_COMPILE_STDCXX 16 | # macro with the version set to C++17. The two optional arguments are 17 | # forwarded literally as the second and third argument respectively. 18 | # Please see the documentation for the AX_CXX_COMPILE_STDCXX macro for 19 | # more information. If you want to use this macro, you also need to 20 | # download the ax_cxx_compile_stdcxx.m4 file. 
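#
#   A minimal sketch of a typical invocation from a configure.ac
#   (illustrative only, following the SYNOPSIS above; it assumes both
#   .m4 files are on the macro search path, e.g. under m4/):
#
#     AX_CXX_COMPILE_STDCXX_17([noext], [mandatory])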
21 | # 22 | # LICENSE 23 | # 24 | # Copyright (c) 2015 Moritz Klammler 25 | # Copyright (c) 2016 Krzesimir Nowak 26 | # 27 | # Copying and distribution of this file, with or without modification, are 28 | # permitted in any medium without royalty provided the copyright notice 29 | # and this notice are preserved. This file is offered as-is, without any 30 | # warranty. 31 | 32 | #serial 2 33 | 34 | AX_REQUIRE_DEFINED([AX_CXX_COMPILE_STDCXX]) 35 | AC_DEFUN([AX_CXX_COMPILE_STDCXX_17], [AX_CXX_COMPILE_STDCXX([17], [$1], [$2])]) 36 | 37 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2011-2020 University of Southern California and 2 | # Andrew D. Smith and Timothy Daley 3 | # 4 | # Authors: Timothy Daley and Andrew D. Smith 5 | # 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | 16 | PROGS = preseq 17 | ifdef HAVE_HTSLIB 18 | PROGS += to-mr 19 | endif 20 | 21 | INCLUDEDIRS = smithlab_cpp 22 | INCLUDEARGS = $(addprefix -I, $(INCLUDEDIRS)) 23 | LIBS = -lz 24 | CXX = g++ 25 | CXXFLAGS = -std=c++11 -Wall 26 | 27 | ifdef DEBUG 28 | CXXFLAGS += -g 29 | else 30 | CXXFLAGS += -O2 31 | endif 32 | 33 | ifdef HAVE_HTSLIB 34 | CXXFLAGS += -DHAVE_HTSLIB 35 | LIBS += -lhts 36 | endif 37 | 38 | all: $(PROGS) 39 | 40 | $(PROGS): $(addprefix smithlab_cpp/, \ 41 | smithlab_os.o smithlab_utils.o GenomicRegion.o \ 42 | OptionParser.o MappedRead.o) 43 | 44 | ifdef HAVE_HTSLIB 45 | preseq to-mr: $(addprefix smithlab_cpp/, \ 46 | htslib_wrapper_deprecated.o cigar_utils.o) 47 | endif 48 | 49 | preseq: continued_fraction.o load_data_for_complexity.o moment_sequence.o 50 | 51 | %.o: %.cpp %.hpp 52 | $(CXX) $(CXXFLAGS) -c -o $@ $< $(INCLUDEARGS) 53 | 54 | %: %.cpp 55 | $(CXX) $(CXXFLAGS) -o $@ $^ $(INCLUDEARGS) $(LIBS) 56 | 57 | install: $(PROGS) 58 | @mkdir -p $(install_dir)/bin 59 | @install $(PROGS) $(install_dir)/bin 60 | 61 | clean: 62 | @-make -C smithlab_cpp clean 63 | @-rm -f $(PROGS) *.o 64 | 65 | .PHONY: install clean 66 | -------------------------------------------------------------------------------- /src/bam_record_utils.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2020-2023 Masaru Nakajima and Andrew D. Smith 2 | * 3 | * Authors: Masaru Nakajima and Andrew D. Smith 4 | * 5 | * This program is free software: you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation, either version 3 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License for more details. 
14 | */ 15 | 16 | #ifndef SRC_BAM_RECORD_UTILS_HPP_ 17 | #define SRC_BAM_RECORD_UTILS_HPP_ 18 | 19 | /* ADS: need to control all the macros from HTSlib pollution. For 20 | functions maybe: 21 | 22 | $ gcc -dM -E sam.h | grep "define [a-z]" | awk '{print $2}' |\ 23 | grep "[(]" | awk -v FS="(" '{print "#undef",$1}' 24 | 25 | This gives about 65 symbols that need to be deleted. For the others 26 | I don't know what to do because some of them have "#define _" which 27 | means they should be system symbols. 28 | */ 29 | 30 | #include 31 | 32 | #include 33 | 34 | #ifdef bam_is_rev 35 | #undef bam_is_rev 36 | #endif 37 | 38 | inline bool 39 | bam_is_rev(const bamxx::bam_rec &b) { 40 | return (b.b->core.flag & BAM_FREVERSE) != 0; 41 | } 42 | 43 | #ifdef bam_is_mrev 44 | #undef bam_is_mrev 45 | #endif 46 | 47 | inline bool 48 | bam_is_mrev(const bamxx::bam_rec &b) { 49 | return (b.b->core.flag & BAM_FMREVERSE) != 0; 50 | } 51 | 52 | #ifdef bam_get_qname 53 | #undef bam_get_qname 54 | #endif 55 | 56 | inline char * 57 | bam_get_qname(const bamxx::bam_rec &b) { 58 | return reinterpret_cast(b.b->data); 59 | } 60 | 61 | #ifdef bam_get_cigar 62 | #undef bam_get_cigar 63 | #endif 64 | 65 | inline uint32_t * 66 | bam_get_cigar(const bamxx::bam_rec &b) { 67 | // start of data + bytes for query/read name 68 | return reinterpret_cast(b.b->data + b.b->core.l_qname); 69 | } 70 | 71 | #ifdef bam_get_seq 72 | #undef bam_get_seq 73 | #endif 74 | 75 | inline uint8_t * 76 | bam_get_seq(const bamxx::bam_rec &b) { 77 | // start of data + bytes for cigar + bytes for query/read name 78 | return b.b->data + b.b->core.l_qname + (b.b->core.n_cigar << 2); 79 | } 80 | 81 | #ifdef bam_get_qual 82 | #undef bam_get_qual 83 | #endif 84 | 85 | inline uint8_t * 86 | bam_get_qual(const bamxx::bam_rec &b) { 87 | return b.b->data + // start of data 88 | b.b->core.l_qname + // bytes for query name 89 | (b.b->core.n_cigar << 2) + // bytes for cigar 90 | ((b.b->core.l_qseq + 1) >> 1); // bytes for packed query/read 91 | } 92 | 93 | #ifdef bam_get_aux 94 | #undef bam_get_aux 95 | #endif 96 | 97 | inline uint8_t * 98 | bam_get_aux(const bamxx::bam_rec &b) { 99 | return b.b->data + b.b->core.l_qname + (b.b->core.n_cigar << 2) + 100 | ((b.b->core.l_qseq + 1) >> 1) + b.b->core.l_qseq; 101 | } 102 | 103 | #ifdef bam_get_l_aux 104 | #undef bam_get_l_aux 105 | #endif 106 | 107 | inline int 108 | bam_get_l_aux(const bamxx::bam_rec &b) { 109 | return b.b->l_data - (b.b->core.l_qname + (b.b->core.n_cigar << 2) + 110 | ((b.b->core.l_qseq + 1) >> 1) + b.b->core.l_qseq); 111 | } 112 | 113 | #ifdef bam_cigar_op 114 | #undef bam_cigar_op 115 | #endif 116 | 117 | inline uint32_t 118 | bam_cigar_op(const uint32_t c) { 119 | return c & BAM_CIGAR_MASK; 120 | } 121 | 122 | #ifdef bam_cigar_oplen 123 | #undef bam_cigar_oplen 124 | #endif 125 | 126 | inline uint32_t 127 | bam_cigar_oplen(const uint32_t c) { 128 | return c >> BAM_CIGAR_SHIFT; 129 | } 130 | 131 | inline bool 132 | bam_same_orientation(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { 133 | return ((a.b->core.flag ^ b.b->core.flag) & BAM_FREVERSE) != 0; 134 | } 135 | 136 | int 137 | truncate_overlap(const bamxx::bam_rec &a, const uint32_t overlap, 138 | bamxx::bam_rec &c); 139 | 140 | int 141 | merge_overlap(const bamxx::bam_rec &a, const bamxx::bam_rec &b, 142 | const uint32_t head, bamxx::bam_rec &c); 143 | 144 | int 145 | merge_non_overlap(const bamxx::bam_rec &a, const bamxx::bam_rec &b, 146 | const uint32_t spacer, bamxx::bam_rec &c); 147 | 148 | int 149 | keep_better_end(const 
bamxx::bam_rec &a, const bamxx::bam_rec &b, 150 | bamxx::bam_rec &c); 151 | 152 | size_t 153 | correct_cigar(bamxx::bam_rec &b); 154 | 155 | void 156 | flip_conversion(bamxx::bam_rec &aln); 157 | 158 | inline bool 159 | is_a_rich(const bamxx::bam_rec &b) { 160 | return bam_aux2A(bam_aux_get(b.b, "CV")) == 'A'; 161 | } 162 | 163 | void 164 | standardize_format(const std::string &input_format, bamxx::bam_rec &aln); 165 | 166 | void 167 | apply_cigar(const bamxx::bam_rec &aln, std::string &to_inflate, 168 | const char inflation_symbol); 169 | 170 | void 171 | get_seq_str(const bamxx::bam_rec &aln, std::string &seq_str); 172 | 173 | inline bool 174 | are_mates(const bamxx::bam_rec &one, const bamxx::bam_rec &two) { 175 | return one.b->core.mtid == two.b->core.tid && 176 | one.b->core.mpos == two.b->core.pos && bam_same_orientation(one, two); 177 | // below is a consistency check and should not be necessary 178 | /* && 179 | two->core.mtid == one->core.tid && 180 | two->core.mpos == one->core.pos; */ 181 | } 182 | 183 | inline int32_t 184 | get_l_qseq(const bamxx::bam_rec &b) { 185 | return b.b->core.l_qseq; 186 | } 187 | 188 | inline size_t 189 | get_n_targets(const bamxx::bam_header &bh) { 190 | return bh.h->n_targets; 191 | } 192 | 193 | inline std::string 194 | get_qname(const bamxx::bam_rec &b) { 195 | return bam_get_qname(b); 196 | } 197 | 198 | inline int32_t 199 | get_tid(const bamxx::bam_rec &b) { 200 | return b.b->core.tid; 201 | } 202 | 203 | inline hts_pos_t 204 | get_pos(const bamxx::bam_rec &b) { 205 | return b.b->core.pos; 206 | } 207 | 208 | inline int32_t 209 | get_mtid(const bamxx::bam_rec &b) { 210 | return b.b->core.mtid; 211 | } 212 | 213 | inline hts_pos_t 214 | get_mpos(const bamxx::bam_rec &b) { 215 | return b.b->core.mpos; 216 | } 217 | 218 | inline uint32_t 219 | get_n_cigar(const bamxx::bam_rec &b) { 220 | return b.b->core.n_cigar; 221 | } 222 | 223 | inline hts_pos_t 224 | get_endpos(const bamxx::bam_rec &b) { 225 | return bam_endpos(b.b); 226 | } 227 | 228 | inline bool 229 | cigar_eats_ref(const uint32_t c) { 230 | return bam_cigar_type(bam_cigar_op(c)) & 2; 231 | } 232 | 233 | inline bool 234 | cigar_eats_query(const uint32_t c) { 235 | return bam_cigar_type(bam_cigar_op(c)) & 1; 236 | } 237 | 238 | inline bool 239 | cigar_eats_frag(const uint32_t c) { 240 | return bam_cigar_op(c) == BAM_CREF_SKIP; 241 | } 242 | 243 | inline bool 244 | precedes_by_start(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { 245 | // assumes a.get_tid() <= b.get_tid() 246 | return get_tid(a) == get_tid(b) && get_pos(a) < get_pos(b); 247 | } 248 | 249 | inline bool 250 | precedes_by_end_and_strand(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { 251 | const auto end_a = bam_endpos(a.b); 252 | const auto end_b = bam_endpos(b.b); 253 | return end_a < end_b || 254 | (end_a == end_b && bam_is_rev(a) == false && bam_is_rev(b) == true); 255 | } 256 | 257 | inline bool 258 | equivalent_chrom_and_start(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { 259 | return a.b->core.pos == b.b->core.pos && a.b->core.tid == b.b->core.tid; 260 | } 261 | 262 | inline bool 263 | equivalent_end_and_strand(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { 264 | return bam_endpos(a.b) == bam_endpos(b.b) && bam_is_rev(a) == bam_is_rev(b); 265 | } 266 | 267 | template 268 | int 269 | bam_aux_update_int(bamxx::bam_rec &b, const char tag[2], T val) { 270 | return bam_aux_update_int(b.b, tag, val); 271 | } 272 | 273 | inline std::string 274 | sam_hdr_tid2name(const bamxx::bam_header &hdr, const int32_t tid) 
{ 275 | return std::string(sam_hdr_tid2name(hdr.h, tid)); 276 | } 277 | 278 | inline uint32_t 279 | sam_hdr_tid2len(const bamxx::bam_header &hdr, const int32_t tid) { 280 | return sam_hdr_tid2len(hdr.h, tid); 281 | } 282 | 283 | inline std::string 284 | sam_hdr_tid2name(const bamxx::bam_header &hdr, const bamxx::bam_rec &aln) { 285 | return std::string(sam_hdr_tid2name(hdr.h, aln.b->core.tid)); 286 | } 287 | 288 | std::string 289 | to_string(const bamxx::bam_header &hdr, const bamxx::bam_rec &aln); 290 | 291 | inline size_t 292 | rlen_from_cigar(const bamxx::bam_rec &aln) { 293 | return bam_cigar2rlen(get_n_cigar(aln), bam_get_cigar(aln)); 294 | } 295 | 296 | #endif // SRC_BAM_RECORD_UTILS_HPP_ 297 | -------------------------------------------------------------------------------- /src/bound_pop.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #include "bound_pop.hpp" 22 | 23 | #include "common.hpp" 24 | #include "load_data_for_complexity.hpp" 25 | #include "moment_sequence.hpp" 26 | 27 | #include 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include // std::mt19937 35 | #include 36 | #include 37 | #include 38 | 39 | using std::cerr; 40 | using std::endl; 41 | using std::isfinite; 42 | using std::min; 43 | using std::mt19937; 44 | using std::runtime_error; 45 | using std::string; 46 | using std::uint32_t; 47 | using std::vector; 48 | 49 | static void 50 | report_bootstrapped_moments(const vector &bootstrap_moments, 51 | const MomentSequence &bootstrap_mom_seq, 52 | const vector &points, 53 | const vector &weights, 54 | const double estimated_unobs) { 55 | cerr << "bootstrapped_moments=" << endl; 56 | for (size_t i = 0; i < bootstrap_moments.size(); i++) 57 | cerr << bootstrap_moments[i] << endl; 58 | for (size_t k = 0; k < bootstrap_mom_seq.alpha.size(); k++) 59 | cerr << "alpha_" << k << '\t'; 60 | cerr << endl; 61 | for (size_t k = 0; k < bootstrap_mom_seq.alpha.size(); k++) 62 | cerr << bootstrap_mom_seq.alpha[k] << '\t'; 63 | cerr << endl; 64 | 65 | for (size_t k = 0; k < bootstrap_mom_seq.beta.size(); k++) 66 | cerr << "beta_" << k << '\t'; 67 | cerr << endl; 68 | for (size_t k = 0; k < bootstrap_mom_seq.beta.size(); k++) 69 | cerr << bootstrap_mom_seq.beta[k] << '\t'; 70 | cerr << endl; 71 | cerr << "points=" << "\t"; 72 | for (size_t i = 0; i < points.size(); i++) 73 | cerr << points[i] << "\t"; 74 | cerr << endl; 75 | cerr << "weights=" << "\t"; 76 | for (size_t i = 0; i < weights.size(); i++) 77 | cerr << weights[i] << "\t"; 78 | cerr << endl; 79 | cerr << "estimated_unobs=" << "\t" << estimated_unobs << endl; 80 | } 81 | 82 | // BOUND_UNOBS: bounding n_0 83 | int 84 | 
bound_pop_main(const int argc, const char *argv[]) { 85 | try { 86 | bool verbose = false; 87 | bool PAIRED_END = false; 88 | bool HIST_INPUT = false; 89 | bool VALS_INPUT = false; 90 | bool QUICK_MODE = false; 91 | 92 | string outfile; 93 | string histogram_outfile; 94 | 95 | #ifdef HAVE_HTSLIB 96 | bool BAM_FORMAT_INPUT = false; 97 | size_t MAX_SEGMENT_LENGTH = 5000; 98 | uint32_t n_threads{1}; 99 | #endif 100 | 101 | size_t max_num_points = 10; 102 | double tolerance = 1e-20; 103 | size_t n_bootstraps = 500; 104 | double c_level = 0.95; 105 | size_t max_iter = 100; 106 | uint32_t seed = 408; 107 | 108 | const string description = R"( 109 | Estimate a bound on the size of the underlying population based on 110 | counts of observed species in an initial sample. 111 | )"; 112 | string program_name = std::filesystem::path(argv[0]).filename(); 113 | program_name += " " + string(argv[1]); 114 | 115 | /********** GET COMMAND LINE ARGUMENTS FOR BOUND_POP ***********/ 116 | OptionParser opt_parse(program_name, description, ""); 117 | opt_parse.add_opt("output", 'o', 118 | "species richness output file " 119 | "(default: stdout)", 120 | false, outfile); 121 | opt_parse.add_opt("max_num_points", 'p', 122 | "maximum number of points in " 123 | "quadrature estimates", 124 | false, max_num_points); 125 | opt_parse.add_opt("tolerance", 't', "numerical tolerance", false, 126 | tolerance); 127 | opt_parse.add_opt("bootstraps", 'n', "number of bootstraps", false, 128 | n_bootstraps); 129 | opt_parse.add_opt("clevel", 'c', "level for confidence intervals", false, 130 | c_level); 131 | opt_parse.add_opt("verbose", 'v', "print more information", false, verbose); 132 | opt_parse.add_opt("pe", 'P', "input is paired end read file", false, 133 | PAIRED_END); 134 | opt_parse.add_opt("hist", 'H', 135 | "input is a text file containing the " 136 | "observed histogram", 137 | false, HIST_INPUT); 138 | opt_parse.add_opt("hist-out", '\0', 139 | "output histogram to this file (for non-hist input)", 140 | false, histogram_outfile); 141 | opt_parse.add_opt("vals", 'V', 142 | "input is a text file containing only the " 143 | "observed duplicate counts", 144 | false, VALS_INPUT); 145 | #ifdef HAVE_HTSLIB 146 | opt_parse.add_opt("bam", 'B', "input is in BAM format", false, 147 | BAM_FORMAT_INPUT); 148 | opt_parse.add_opt("seg_len", 'l', 149 | "maximum segment length when merging " 150 | "paired end bam reads", 151 | false, MAX_SEGMENT_LENGTH); 152 | opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", 153 | false, n_threads); 154 | #endif 155 | opt_parse.add_opt("quick", 'Q', 156 | "quick mode, estimate without bootstrapping", false, 157 | QUICK_MODE); 158 | opt_parse.add_opt("seed", 'r', "seed for random number generator", false, 159 | seed); 160 | opt_parse.set_show_defaults(); 161 | 162 | vector leftover_args; 163 | opt_parse.parse(argc - 1, argv + 1, leftover_args); 164 | if (argc == 2 || opt_parse.help_requested()) { 165 | cerr << opt_parse.help_message() << endl; 166 | cerr << opt_parse.about_message() << endl; 167 | return EXIT_SUCCESS; 168 | } 169 | if (opt_parse.option_missing()) { 170 | cerr << opt_parse.option_missing_message() << endl; 171 | return EXIT_SUCCESS; 172 | } 173 | if (leftover_args.empty()) { 174 | cerr << opt_parse.help_message() << endl; 175 | return EXIT_SUCCESS; 176 | } 177 | const string input_file_name = leftover_args.front(); 178 | // **************************************************************** 179 | 180 | vector counts_hist; 181 | size_t n_obs = 0; 182 | 183 | // 
LOAD VALUES 184 | if (HIST_INPUT) { 185 | if (verbose) 186 | cerr << "HIST_INPUT" << endl; 187 | n_obs = load_histogram(input_file_name, counts_hist); 188 | } 189 | else if (VALS_INPUT) { 190 | if (verbose) 191 | cerr << "VALS_INPUT" << endl; 192 | n_obs = load_counts(input_file_name, counts_hist); 193 | } 194 | #ifdef HAVE_HTSLIB 195 | else if (BAM_FORMAT_INPUT && PAIRED_END) { 196 | if (verbose) 197 | cerr << "PAIRED_END_BAM_INPUT" << endl; 198 | n_obs = load_counts_BAM_pe(n_threads, input_file_name, counts_hist); 199 | } 200 | else if (BAM_FORMAT_INPUT) { 201 | if (verbose) 202 | cerr << "BAM_INPUT" << endl; 203 | n_obs = load_counts_BAM_se(n_threads, input_file_name, counts_hist); 204 | } 205 | #endif 206 | else if (PAIRED_END) { 207 | if (verbose) 208 | cerr << "PAIRED_END_BED_INPUT" << endl; 209 | n_obs = load_counts_BED_pe(input_file_name, counts_hist); 210 | } 211 | else { // default is single end bed file 212 | if (verbose) 213 | cerr << "BED_INPUT" << endl; 214 | n_obs = load_counts_BED_se(input_file_name, counts_hist); 215 | } 216 | 217 | const double distinct_obs = 218 | accumulate(begin(counts_hist), end(counts_hist), 0.0); 219 | 220 | vector measure_moments; 221 | // mu_r = (r + 1)! n_{r+1} / n_1 222 | size_t idx = 1; 223 | while (idx < counts_hist.size() && counts_hist[idx]) { 224 | // idx + 1 because function calculates (x-1)! 225 | measure_moments.push_back( 226 | exp(factorial(idx + 1) + log(counts_hist[idx]) - log(counts_hist[1]))); 227 | if (!isfinite(measure_moments.back())) { 228 | measure_moments.pop_back(); 229 | break; 230 | } 231 | ++idx; 232 | } 233 | 234 | if (verbose) { 235 | cerr << "TOTAL OBSERVATIONS = " << n_obs << endl 236 | << "DISTINCT OBSERVATIONS = " << distinct_obs << endl 237 | << "MAX COUNT = " << counts_hist.size() - 1 << endl; 238 | 239 | cerr << "OBSERVED MOMENTS" << endl; 240 | for (size_t i = 0; i < measure_moments.size(); i++) 241 | cerr << std::setprecision(16) << measure_moments[i] << endl; 242 | } 243 | 244 | if (!histogram_outfile.empty()) 245 | report_histogram(histogram_outfile, counts_hist); 246 | 247 | if (QUICK_MODE) { 248 | if (measure_moments.size() < 2 * max_num_points) 249 | max_num_points = static_cast(floor(measure_moments.size() / 2)); 250 | else 251 | measure_moments.resize(2 * max_num_points); 252 | size_t n_points = 0; 253 | n_points = ensure_pos_def_mom_seq(measure_moments, tolerance, verbose); 254 | if (verbose) 255 | cerr << "n_points = " << n_points << endl; 256 | 257 | MomentSequence obs_mom_seq(measure_moments); 258 | 259 | if (verbose) { 260 | for (size_t k = 0; k < obs_mom_seq.alpha.size(); k++) 261 | cerr << "alpha_" << k << '\t'; 262 | cerr << endl; 263 | for (size_t k = 0; k < obs_mom_seq.alpha.size(); k++) 264 | cerr << obs_mom_seq.alpha[k] << '\t'; 265 | cerr << endl; 266 | 267 | for (size_t k = 0; k < obs_mom_seq.beta.size(); k++) 268 | cerr << "beta_" << k << '\t'; 269 | cerr << endl; 270 | for (size_t k = 0; k < obs_mom_seq.beta.size(); k++) 271 | cerr << obs_mom_seq.beta[k] << '\t'; 272 | cerr << endl; 273 | } 274 | 275 | vector points, weights; 276 | obs_mom_seq.Lower_quadrature_rules(n_points, tolerance, max_iter, points, 277 | weights); 278 | 279 | // renormalize if needed 280 | const double weights_sum = accumulate(begin(weights), end(weights), 0.0); 281 | if (weights_sum != 1.0) 282 | for (size_t i = 0; i < weights.size(); i++) 283 | weights[i] = weights[i] / weights_sum; 284 | 285 | if (verbose) { 286 | cerr << "points = " << endl; 287 | for (size_t i = 0; i < points.size(); i++) 288 | cerr << 
points[i] << '\t'; 289 | cerr << endl; 290 | 291 | cerr << "weights = " << endl; 292 | for (size_t i = 0; i < weights.size(); i++) 293 | cerr << weights[i] << '\t'; 294 | cerr << endl; 295 | } 296 | 297 | double estimated_unobs = 0.0; 298 | 299 | for (size_t i = 0; i < weights.size(); i++) 300 | estimated_unobs += counts_hist[1] * weights[i] / points[i]; 301 | 302 | if (estimated_unobs > 0.0) 303 | estimated_unobs += distinct_obs; 304 | else { 305 | estimated_unobs = distinct_obs; 306 | n_points = 0; 307 | } 308 | 309 | std::ofstream of; 310 | if (!outfile.empty()) 311 | of.open(outfile); 312 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); 313 | 314 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 315 | out.precision(1); 316 | 317 | out << "quadrature_estimated_unobs" << '\t' << "n_points" << endl; 318 | out << estimated_unobs << '\t' << n_points << endl; 319 | } 320 | // NOT QUICK MODE, BOOTSTRAP 321 | else { 322 | vector quad_estimates; 323 | 324 | // setup rng 325 | mt19937 rng(seed); 326 | 327 | // hist may be sparse, to speed up bootstrapping 328 | // sample only from positive entries 329 | vector counts_hist_distinct_counts; 330 | vector distinct_counts_hist; 331 | for (size_t i = 0; i < counts_hist.size(); i++) 332 | if (counts_hist[i] > 0) { 333 | counts_hist_distinct_counts.push_back(i); 334 | distinct_counts_hist.push_back(counts_hist[i]); 335 | } 336 | 337 | for (size_t iter = 0; 338 | iter < max_iter && quad_estimates.size() < n_bootstraps; ++iter) { 339 | if (verbose) 340 | cerr << "iter=" << "\t" << iter << endl; 341 | 342 | vector sample_hist; 343 | resample_hist(rng, counts_hist_distinct_counts, distinct_counts_hist, 344 | sample_hist); 345 | 346 | const double sampled_distinct = 347 | accumulate(begin(sample_hist), end(sample_hist), 0.0); 348 | 349 | // initialize moments, 0th moment is 1 350 | vector bootstrap_moments(1, 1.0); 351 | // moments[r] = (r + 1)! 
n_{r+1} / n_1 352 | for (size_t i = 0; i < 2 * max_num_points; i++) { 353 | bootstrap_moments.push_back(exp( 354 | factorial(i + 3) + log(sample_hist[i + 2]) - log(sample_hist[1]))); 355 | } 356 | 357 | size_t n_points = 0; 358 | n_points = 359 | ensure_pos_def_mom_seq(bootstrap_moments, tolerance, verbose); 360 | n_points = min(n_points, max_num_points); 361 | if (verbose) 362 | cerr << "n_points = " << n_points << endl; 363 | 364 | MomentSequence bootstrap_mom_seq(bootstrap_moments); 365 | 366 | vector points; 367 | vector weights; 368 | bootstrap_mom_seq.Lower_quadrature_rules(n_points, tolerance, max_iter, 369 | points, weights); 370 | 371 | // renormalize if needed 372 | const double weights_sum = 373 | accumulate(begin(weights), end(weights), 0.0); 374 | if (weights_sum != 1.0) 375 | for (size_t i = 0; i < weights.size(); i++) 376 | weights[i] = weights[i] / weights_sum; 377 | 378 | double estimated_unobs = 0.0; 379 | 380 | for (size_t i = 0; i < weights.size(); i++) 381 | estimated_unobs += counts_hist[1] * weights[i] / points[i]; 382 | 383 | if (estimated_unobs > 0.0) 384 | estimated_unobs += sampled_distinct; 385 | else { 386 | estimated_unobs = sampled_distinct; 387 | n_points = 0; 388 | } 389 | 390 | if (verbose) 391 | report_bootstrapped_moments(bootstrap_moments, bootstrap_mom_seq, 392 | points, weights, estimated_unobs); 393 | 394 | quad_estimates.push_back(estimated_unobs); 395 | } 396 | 397 | double median_estimate, lower_ci, upper_ci; 398 | median_and_ci(quad_estimates, c_level, median_estimate, lower_ci, 399 | upper_ci); 400 | 401 | std::ofstream of; 402 | if (!outfile.empty()) 403 | of.open(outfile); 404 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); 405 | 406 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 407 | out.precision(1); 408 | 409 | out << "median_estimated_unobs" << '\t' << "lower_ci" << '\t' 410 | << "upper_ci" << endl; 411 | out << median_estimate << '\t' << lower_ci << '\t' << upper_ci << endl; 412 | } 413 | } 414 | catch (const std::exception &e) { 415 | cerr << e.what() << endl; 416 | return EXIT_FAILURE; 417 | } 418 | return EXIT_SUCCESS; 419 | } 420 | -------------------------------------------------------------------------------- /src/bound_pop.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 
19 | */ 20 | 21 | #ifndef SRC_BOUND_POP_HPP_ 22 | #define SRC_BOUND_POP_HPP_ 23 | 24 | int 25 | bound_pop_main(const int argc, const char *argv[]); 26 | 27 | #endif // SRC_BOUND_POP_HPP_ 28 | -------------------------------------------------------------------------------- /src/c_curve.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #include "c_curve.hpp" 22 | 23 | #include "common.hpp" 24 | #include "continued_fraction.hpp" 25 | #include "load_data_for_complexity.hpp" 26 | #include "moment_sequence.hpp" 27 | 28 | #include 29 | #include 30 | #include 31 | 32 | #include 33 | #include // std::size_t 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | using std::accumulate; 44 | using std::cbegin; 45 | using std::cend; 46 | using std::cerr; 47 | using std::endl; 48 | using std::mt19937; 49 | using std::size; 50 | using std::size_t; 51 | using std::string; 52 | using std::uint32_t; 53 | using std::vector; 54 | 55 | template 56 | T 57 | median_from_sorted_vector(const vector &sorted_data, const size_t stride, 58 | const size_t n) { 59 | if (n == 0 || sorted_data.empty()) 60 | return 0.0; 61 | const size_t lhs = (n - 1) / 2; 62 | const size_t rhs = n / 2; 63 | if (lhs == rhs) 64 | return sorted_data[lhs * stride]; 65 | return (sorted_data[lhs * stride] + sorted_data[rhs * stride]) / 2.0; 66 | } 67 | 68 | int 69 | c_curve_main(const int argc, const char *argv[]) { 70 | try { 71 | bool verbose = false; 72 | bool PAIRED_END = false; 73 | bool HIST_INPUT = false; 74 | bool VALS_INPUT = false; 75 | uint32_t seed = 408; 76 | 77 | string outfile; 78 | string histogram_outfile; 79 | 80 | double step_size = 1e6; 81 | #ifdef HAVE_HTSLIB 82 | bool BAM_FORMAT_INPUT = false; 83 | size_t MAX_SEGMENT_LENGTH = 5000; 84 | uint32_t n_threads{1}; 85 | #endif 86 | 87 | const string description = 88 | R"( 89 | Generate the complexity curve for data. This does not extrapolate, but 90 | instead resamples from the given data. 
91 | )"; 92 | string program_name = std::filesystem::path(argv[0]).filename(); 93 | program_name += " " + string(argv[1]); 94 | 95 | /********** GET COMMAND LINE ARGUMENTS FOR C_CURVE ***********/ 96 | OptionParser opt_parse(program_name, description, ""); 97 | opt_parse.add_opt("output", 'o', "yield output file (default: stdout)", 98 | false, outfile); 99 | opt_parse.add_opt("step", 's', "step size in extrapolations", false, 100 | step_size); 101 | opt_parse.add_opt("verbose", 'v', "print more information", false, verbose); 102 | opt_parse.add_opt("pe", 'P', "input paired end read file", false, 103 | PAIRED_END); 104 | opt_parse.add_opt("hist", 'H', 105 | "input is text file containing observed histogram", false, 106 | HIST_INPUT); 107 | opt_parse.add_opt("hist-out", '\0', 108 | "output histogram to this file (for non-hist input)", 109 | false, histogram_outfile); 110 | opt_parse.add_opt("vals", 'V', 111 | "input is text file containing only observed counts", 112 | false, VALS_INPUT); 113 | #ifdef HAVE_HTSLIB 114 | opt_parse.add_opt("bam", 'B', "input is in BAM format", false, 115 | BAM_FORMAT_INPUT); 116 | opt_parse.add_opt("seg_len", 'l', 117 | "maximum segment length when merging " 118 | "paired end bam reads", 119 | false, MAX_SEGMENT_LENGTH); 120 | opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", 121 | false, n_threads); 122 | #endif 123 | opt_parse.add_opt("seed", 'r', "seed for random number generator", false, 124 | seed); 125 | opt_parse.set_show_defaults(); 126 | 127 | vector leftover_args; 128 | opt_parse.parse(argc - 1, argv + 1, leftover_args); 129 | if (argc == 2 || opt_parse.help_requested()) { 130 | cerr << opt_parse.help_message() << endl; 131 | cerr << opt_parse.about_message() << endl; 132 | return EXIT_SUCCESS; 133 | } 134 | if (opt_parse.about_requested()) { 135 | cerr << opt_parse.about_message() << endl; 136 | return EXIT_SUCCESS; 137 | } 138 | if (opt_parse.option_missing()) { 139 | cerr << opt_parse.option_missing_message() << endl; 140 | return EXIT_SUCCESS; 141 | } 142 | if (leftover_args.empty()) { 143 | cerr << opt_parse.help_message() << endl; 144 | return EXIT_SUCCESS; 145 | } 146 | const string input_file_name = leftover_args.front(); 147 | /******************************************************************/ 148 | 149 | // Setup the random number generator 150 | mt19937 rng(seed); 151 | 152 | vector counts_hist; 153 | size_t n_reads = 0; 154 | 155 | // LOAD VALUES 156 | if (HIST_INPUT) { 157 | if (verbose) 158 | cerr << "INPUT_HIST" << endl; 159 | n_reads = load_histogram(input_file_name, counts_hist); 160 | } 161 | else if (VALS_INPUT) { 162 | if (verbose) 163 | cerr << "VALS_INPUT" << endl; 164 | n_reads = load_counts(input_file_name, counts_hist); 165 | } 166 | #ifdef HAVE_HTSLIB 167 | else if (BAM_FORMAT_INPUT && PAIRED_END) { 168 | if (verbose) 169 | cerr << "PAIRED_END_BAM_INPUT" << endl; 170 | n_reads = load_counts_BAM_pe(n_threads, input_file_name, counts_hist); 171 | } 172 | else if (BAM_FORMAT_INPUT) { 173 | if (verbose) 174 | cerr << "BAM_INPUT" << endl; 175 | n_reads = load_counts_BAM_se(n_threads, input_file_name, counts_hist); 176 | } 177 | #endif 178 | else if (PAIRED_END) { 179 | if (verbose) 180 | cerr << "PAIRED_END_BED_INPUT" << endl; 181 | n_reads = load_counts_BED_pe(input_file_name, counts_hist); 182 | } 183 | else { // default is single end bed file 184 | if (verbose) 185 | cerr << "BED_INPUT" << endl; 186 | n_reads = load_counts_BED_se(input_file_name, counts_hist); 187 | } 188 | 189 | const size_t 
max_observed_count = size(counts_hist) - 1; 190 | const double distinct_reads = 191 | accumulate(cbegin(counts_hist), cend(counts_hist), 0.0); 192 | 193 | const size_t total_reads = get_counts_from_hist(counts_hist); 194 | 195 | const size_t distinct_counts = 196 | std::count_if(cbegin(counts_hist), cend(counts_hist), 197 | [](const double x) { return x > 0.0; }); 198 | 199 | if (verbose) 200 | cerr << "TOTAL READS = " << n_reads << endl 201 | << "COUNTS_SUM = " << total_reads << endl 202 | << "DISTINCT READS = " << distinct_reads << endl 203 | << "DISTINCT COUNTS = " << distinct_counts << endl 204 | << "MAX COUNT = " << max_observed_count << endl 205 | << "COUNTS OF 1 = " << counts_hist[1] << endl; 206 | 207 | if (!histogram_outfile.empty()) 208 | report_histogram(histogram_outfile, counts_hist); 209 | 210 | const size_t upper_limit = n_reads; // set upper limit equal to number of 211 | // molecules 212 | 213 | // setup for output of the complexity curve 214 | std::ofstream of; 215 | if (!outfile.empty()) 216 | of.open(outfile); 217 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); 218 | 219 | // prints the complexity curve 220 | out << "total_reads" << "\t" << "distinct_reads" << endl; 221 | out << 0 << '\t' << 0 << endl; 222 | for (size_t i = step_size; i <= upper_limit; i += step_size) { 223 | if (verbose) 224 | cerr << "sample size: " << i << endl; 225 | out << i << "\t" 226 | << interpolate_distinct(counts_hist, total_reads, distinct_reads, i) 227 | << endl; 228 | } 229 | } 230 | catch (const std::exception &e) { 231 | cerr << "ERROR:\t" << e.what() << endl; 232 | return EXIT_FAILURE; 233 | } 234 | return EXIT_SUCCESS; 235 | } 236 | -------------------------------------------------------------------------------- /src/c_curve.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #ifndef SRC_C_CURVE_HPP_ 22 | #define SRC_C_CURVE_HPP_ 23 | 24 | int 25 | c_curve_main(const int argc, const char *argv[]); 26 | 27 | #endif // SRC_C_CURVE_HPP_ 28 | -------------------------------------------------------------------------------- /src/common.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 
10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #include "common.hpp" 22 | 23 | #include "continued_fraction.hpp" 24 | 25 | #include 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | using std::array; 40 | using std::begin; 41 | using std::cbegin; 42 | using std::cend; 43 | using std::cerr; 44 | using std::end; 45 | using std::endl; 46 | using std::min; 47 | using std::mt19937; 48 | using std::runtime_error; 49 | using std::size_t; 50 | using std::string; 51 | using std::uint32_t; 52 | using std::vector; 53 | 54 | double 55 | GoodToulmin2xExtrap(const vector &counts_hist) { 56 | double two_fold_extrap = 0.0; 57 | for (size_t i = 0; i < counts_hist.size(); i++) 58 | two_fold_extrap += pow(-1.0, i + 1) * counts_hist[i]; 59 | return two_fold_extrap; 60 | } 61 | 62 | // Lanczos approximation for gamma function for x >= 0.5 - essentially an 63 | // approximation for (x-1)! 64 | double 65 | factorial(double x) { 66 | // constants 67 | static constexpr double LogRootTwoPi = 0.9189385332046727; 68 | static constexpr double Euler = 2.71828182845904523536028747135; 69 | array Lanczos{0.99999999999980993227684700473478, 70 | 676.520368121885098567009190444019, 71 | -1259.13921672240287047156078755283, 72 | 771.3234287776530788486528258894, 73 | -176.61502916214059906584551354, 74 | 12.507343278686904814458936853, 75 | -0.13857109526572011689554707, 76 | 9.984369578019570859563e-6, 77 | 1.50563273514931155834e-7}; 78 | 79 | // Approximation for factorial is actually x-1 80 | x -= 1.0; 81 | 82 | double Ag = Lanczos[0]; 83 | for (auto k = 1u; k < size(Lanczos); k++) 84 | Ag += Lanczos[k] / (x + k); 85 | 86 | const double term1 = (x + 0.5) * log((x + 7.5) / Euler); 87 | const double term2 = LogRootTwoPi + log(Ag); 88 | 89 | return term1 + (term2 - 7.0); 90 | } 91 | 92 | // interpolate by explicit calculating the expectation 93 | // for sampling without replacement; 94 | // see K.L Heck 1975 95 | // N total sample size; S the total number of distincts 96 | // n sub sample size 97 | double 98 | interpolate_distinct(const vector &hist, const size_t N, const size_t S, 99 | const size_t n) { 100 | const double denom = 101 | factorial(N + 1) - factorial(n + 1) - factorial(N - n + 1); 102 | 103 | vector numer(hist.size(), 0); 104 | for (size_t i = 1; i < hist.size(); i++) { 105 | // N - i -n + 1 should be greater than 0 106 | if (N < i + n) { 107 | numer[i] = 0; 108 | } 109 | else { 110 | const double x = 111 | (factorial(N - i + 1) - factorial(n + 1) - factorial(N - i - n + 1)); 112 | numer[i] = exp(x - denom) * hist[i]; 113 | } 114 | } 115 | return S - accumulate(cbegin(numer), cend(numer), 0); 116 | } 117 | 118 | static void 119 | extrapolate_curve(const ContinuedFraction &the_cf, 120 | const double initial_distinct, const double vals_sum, 121 | const double initial_sample_size, const double step_size, 122 | const double max_sample_size, vector &estimates) { 123 | double curr_samp_sz = initial_sample_size; 124 | while (curr_samp_sz < max_sample_size) { 125 | const double fold = (curr_samp_sz - vals_sum) / vals_sum; 126 | 
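    // fold is the extrapolation factor: how many additional multiples of
    // the observed read count (vals_sum) the current sample size
    // represents; below, fold * the_cf(fold) approximates the number of
    // distinct observations gained beyond the initial sample.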
assert(fold >= 0.0); 127 | estimates.push_back(initial_distinct + fold * the_cf(fold)); 128 | curr_samp_sz += step_size; 129 | } 130 | } 131 | 132 | bool 133 | extrap_single_estimate(const bool VERBOSE, const bool allow_defects, 134 | const vector &hist, size_t max_terms, 135 | const int diagonal, const double step_size, 136 | const double max_extrap, 137 | vector &yield_estimate) { 138 | yield_estimate.clear(); 139 | 140 | const double vals_sum = get_counts_from_hist(hist); 141 | const double initial_distinct = accumulate(cbegin(hist), cend(hist), 0.0); 142 | 143 | // interpolate complexity curve by random sampling w/out replacement 144 | const size_t upper_limit = vals_sum; 145 | const size_t step = step_size; 146 | size_t sample = static_cast(step_size); 147 | for (; sample < upper_limit; sample += step) 148 | yield_estimate.push_back( 149 | interpolate_distinct(hist, upper_limit, initial_distinct, sample)); 150 | 151 | // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE 152 | size_t first_zero = 1; 153 | while (first_zero < hist.size() && hist[first_zero] > 0) 154 | ++first_zero; 155 | 156 | // Ensure we are not using a zero term 157 | max_terms = min(max_terms, first_zero - 1); 158 | 159 | // refit curve for lower bound (degree of approx is 1 less than 160 | // max_terms) 161 | max_terms = max_terms - (max_terms % 2 == 1); 162 | 163 | if (allow_defects) { 164 | vector ps_coeffs; 165 | for (size_t j = 1; j <= max_terms; j++) 166 | ps_coeffs.push_back(hist[j] * std::pow(-1.0, j + 1)); 167 | 168 | const ContinuedFraction defect_cf(ps_coeffs, diagonal, max_terms); 169 | 170 | extrapolate_curve(defect_cf, initial_distinct, vals_sum, sample, step_size, 171 | max_extrap, yield_estimate); 172 | 173 | if (VERBOSE) 174 | cerr << defect_cf << endl; 175 | // NO FAIL! defect mode doesn't care about failure 176 | } 177 | else { 178 | const ContinuedFractionApproximation lower_cfa(diagonal, max_terms); 179 | const ContinuedFraction lower_cf( 180 | lower_cfa.optimal_cont_frac_distinct(hist)); 181 | 182 | // extrapolate curve 183 | if (lower_cf.is_valid()) { 184 | extrapolate_curve(lower_cf, initial_distinct, vals_sum, sample, step_size, 185 | max_extrap, yield_estimate); 186 | } 187 | else { 188 | // FAIL! lower_cf unacceptable, need to bootstrap to obtain 189 | // estimates 190 | return false; 191 | } 192 | 193 | if (VERBOSE) 194 | cerr << lower_cf << endl; 195 | } 196 | // SUCCESS!! 
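  // Illustrative usage sketch (hypothetical histogram values): hist[j] is
  // the number of reads seen exactly j times, and on success
  // yield_estimate[i] approximates the expected number of distinct reads
  // after (i + 1) * step_size total reads.
  //
  //   std::vector<double> hist{0.0, 9000.0, 800.0, 90.0, 10.0};
  //   std::vector<double> yields;
  //   const bool ok = extrap_single_estimate(false, false, hist, 100, 0,
  //                                          1.0e6, 1.0e10, yields);
  //   // ok == false means the fitted continued fraction was rejected and
  //   // the bootstrap path (extrap_bootstrap) is needed instead.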
197 | return true; 198 | } 199 | 200 | void 201 | extrap_bootstrap(const bool VERBOSE, const bool allow_defects, 202 | const uint32_t seed, const vector &orig_hist, 203 | const size_t n_bootstraps, const size_t orig_max_terms, 204 | const int diagonal, const double bin_step_size, 205 | const double max_extrap, const size_t max_iter, 206 | vector> &bootstrap_estimates) { 207 | // clear returning vectors 208 | bootstrap_estimates.clear(); 209 | 210 | // setup rng 211 | mt19937 rng(seed); 212 | 213 | const double initial_distinct = 214 | std::accumulate(cbegin(orig_hist), cend(orig_hist), 0.0); 215 | 216 | vector orig_hist_distinct_counts; 217 | vector distinct_orig_hist; 218 | for (size_t i = 0; i < orig_hist.size(); i++) 219 | if (orig_hist[i] > 0) { 220 | orig_hist_distinct_counts.push_back(i); 221 | distinct_orig_hist.push_back(orig_hist[i]); 222 | } 223 | 224 | for (size_t iter = 0; 225 | (iter < max_iter && bootstrap_estimates.size() < n_bootstraps); ++iter) { 226 | if (VERBOSE && iter > 0 && iter % 72 == 0) 227 | cerr << endl; // bootstrap success progress only 72 char wide 228 | 229 | vector yield_vector; 230 | vector hist; 231 | resample_hist(rng, orig_hist_distinct_counts, distinct_orig_hist, hist); 232 | 233 | const double sample_vals_sum = get_counts_from_hist(hist); 234 | 235 | // resize boot_hist to remove excess zeros 236 | while (hist.back() == 0) 237 | hist.pop_back(); 238 | 239 | // compute complexity curve by random sampling w/out replacement 240 | const size_t distinct = accumulate(cbegin(hist), cend(hist), 0.0); 241 | size_t curr_sample_sz = bin_step_size; 242 | while (curr_sample_sz < sample_vals_sum) { 243 | yield_vector.push_back( 244 | interpolate_distinct(hist, sample_vals_sum, distinct, curr_sample_sz)); 245 | curr_sample_sz += bin_step_size; 246 | } 247 | 248 | // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE 249 | size_t first_zero = 1; 250 | while (first_zero < hist.size() && hist[first_zero] > 0) 251 | ++first_zero; 252 | 253 | size_t max_terms = min(orig_max_terms, first_zero - 1); 254 | // refit curve for lower bound (degree of approx is 1 less than 255 | // max_terms) 256 | max_terms = max_terms - (max_terms % 2 == 1); 257 | 258 | bool successful_bootstrap = false; 259 | // defect mode, simple extrapolation 260 | if (allow_defects) { 261 | vector ps_coeffs; 262 | for (size_t j = 1; j <= max_terms; j++) 263 | ps_coeffs.push_back(hist[j] * std::pow(-1.0, j + 1)); 264 | 265 | const ContinuedFraction defect_cf(ps_coeffs, diagonal, max_terms); 266 | 267 | extrapolate_curve(defect_cf, initial_distinct, sample_vals_sum, 268 | curr_sample_sz, bin_step_size, max_extrap, 269 | yield_vector); 270 | // no checking of curve in defect mode 271 | bootstrap_estimates.push_back(yield_vector); 272 | successful_bootstrap = true; 273 | } 274 | else { 275 | // refit curve for lower bound 276 | const ContinuedFractionApproximation lower_cfa(diagonal, max_terms); 277 | const ContinuedFraction lower_cf( 278 | lower_cfa.optimal_cont_frac_distinct(hist)); 279 | 280 | // extrapolate the curve start 281 | if (lower_cf.is_valid()) { 282 | extrapolate_curve(lower_cf, initial_distinct, sample_vals_sum, 283 | curr_sample_sz, bin_step_size, max_extrap, 284 | yield_vector); 285 | // sanity check 286 | if (check_yield_estimates_stability(yield_vector)) { 287 | bootstrap_estimates.push_back(yield_vector); 288 | successful_bootstrap = true; 289 | } 290 | } 291 | } 292 | if (VERBOSE) 293 | cerr << (successful_bootstrap ? '.' 
: '_'); 294 | } 295 | if (VERBOSE) 296 | cerr << endl; 297 | if (bootstrap_estimates.size() < n_bootstraps) 298 | throw runtime_error("too many defects in the approximation, " 299 | "consider running in defect mode"); 300 | } 301 | 302 | void 303 | vector_median_and_ci(const vector> &bootstrap_estimates, 304 | const double ci_level, vector &yield_estimates, 305 | vector &lower_ci_lognorm, 306 | vector &upper_ci_lognorm) { 307 | yield_estimates.clear(); 308 | lower_ci_lognorm.clear(); 309 | upper_ci_lognorm.clear(); 310 | assert(!bootstrap_estimates.empty()); 311 | 312 | const size_t n_est = bootstrap_estimates.size(); 313 | vector estimates_row(n_est, 0.0); 314 | for (size_t i = 0; i < bootstrap_estimates[0].size(); i++) { 315 | // estimates is in wrong order, work locally on const val 316 | for (size_t k = 0; k < n_est; ++k) 317 | estimates_row[k] = bootstrap_estimates[k][i]; 318 | 319 | double median_estimate, lower_ci_estimate, upper_ci_estimate; 320 | median_and_ci(estimates_row, ci_level, median_estimate, lower_ci_estimate, 321 | upper_ci_estimate); 322 | std::sort(begin(estimates_row), end(estimates_row)); 323 | 324 | yield_estimates.push_back(median_estimate); 325 | lower_ci_lognorm.push_back(lower_ci_estimate); 326 | upper_ci_lognorm.push_back(upper_ci_estimate); 327 | } 328 | } 329 | 330 | void 331 | write_predicted_complexity_curve(const string &outfile, const double c_level, 332 | const double step_size, 333 | const vector &yield_estimates, 334 | const vector &yield_lower_ci_lognorm, 335 | const vector &yield_upper_ci_lognorm) { 336 | std::ofstream of; 337 | if (!outfile.empty()) 338 | of.open(outfile); 339 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); 340 | 341 | // clang-format off 342 | out << "TOTAL_READS" << '\t' 343 | << "EXPECTED_DISTINCT" << '\t' 344 | << "LOWER_" << c_level << "CI" << '\t' 345 | << "UPPER_" << c_level << "CI" << '\n'; 346 | // clang-format on 347 | 348 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 349 | out.precision(1); 350 | 351 | out << 0 << '\t' << 0 << '\t' << 0 << '\t' << 0 << endl; 352 | for (size_t i = 0; i < yield_estimates.size(); ++i) 353 | out << (i + 1) * step_size << '\t' << yield_estimates[i] << '\t' 354 | << yield_lower_ci_lognorm[i] << '\t' << yield_upper_ci_lognorm[i] 355 | << endl; 356 | } 357 | 358 | // vals_hist[j] = n_{j} = # (counts = j) 359 | // vals_hist_distinct_counts[k] = kth index j s.t. 
vals_hist[j] > 0 360 | // stores kth index of vals_hist that is positive 361 | // distinct_counts_hist[k] = vals_hist[vals_hist_distinct_counts[k]] 362 | // stores the kth positive value of vals_hist 363 | void 364 | resample_hist(mt19937 &gen, const vector &vals_hist_distinct_counts, 365 | const vector &distinct_counts_hist, 366 | vector &out_hist) { 367 | const size_t hist_size = distinct_counts_hist.size(); 368 | vector sample_distinct_counts_hist(hist_size, 0); 369 | 370 | const uint32_t distinct = 371 | accumulate(cbegin(distinct_counts_hist), cend(distinct_counts_hist), 0.0); 372 | 373 | multinomial(gen, distinct_counts_hist, distinct, sample_distinct_counts_hist); 374 | 375 | out_hist.clear(); 376 | out_hist.resize(vals_hist_distinct_counts.back() + 1, 0.0); 377 | for (size_t i = 0; i < hist_size; i++) 378 | out_hist[vals_hist_distinct_counts[i]] = sample_distinct_counts_hist[i]; 379 | } 380 | 381 | template 382 | T 383 | median_from_sorted_vector(const vector sorted_data, const size_t stride, 384 | const size_t n) { 385 | if (n == 0 || sorted_data.empty()) 386 | return 0.0; 387 | 388 | const size_t lhs = (n - 1) / 2; 389 | const size_t rhs = n / 2; 390 | 391 | if (lhs == rhs) 392 | return sorted_data[lhs * stride]; 393 | 394 | return (sorted_data[lhs * stride] + sorted_data[rhs * stride]) / 2.0; 395 | } 396 | 397 | template 398 | T 399 | quantile_from_sorted_vector(const vector &sorted_data, const size_t stride, 400 | const size_t n, const double f) { 401 | const double index = f * (n - 1); 402 | const size_t lhs = static_cast(index); 403 | const double delta = index - lhs; 404 | 405 | if (n == 0 || sorted_data.empty()) 406 | return 0.0; 407 | 408 | if (lhs == n - 1) 409 | return sorted_data[lhs * stride]; 410 | 411 | return (1 - delta) * sorted_data[lhs * stride] + 412 | delta * sorted_data[(lhs + 1) * stride]; 413 | } 414 | 415 | // Confidence interval stuff 416 | void 417 | median_and_ci(vector estimates, // by val so we can sort them 418 | const double ci_level, double &median_estimate, 419 | double &lower_ci_estimate, double &upper_ci_estimate) { 420 | assert(!estimates.empty()); 421 | 422 | std::sort(begin(estimates), end(estimates)); 423 | 424 | const double alpha = 1.0 - ci_level; 425 | const size_t N = estimates.size(); 426 | 427 | median_estimate = median_from_sorted_vector(estimates, 1, N); 428 | lower_ci_estimate = quantile_from_sorted_vector(estimates, 1, N, alpha / 2); 429 | upper_ci_estimate = 430 | quantile_from_sorted_vector(estimates, 1, N, 1.0 - alpha / 2); 431 | } 432 | -------------------------------------------------------------------------------- /src/common.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 
19 | */ 20 | 21 | #ifndef SRC_COMMON_HPP_ 22 | #define SRC_COMMON_HPP_ 23 | 24 | #include // std::size_t 25 | #include // std::uint64_t 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include // has std::size 32 | 33 | double 34 | GoodToulmin2xExtrap(const std::vector &counts_hist); 35 | 36 | double 37 | interpolate_distinct(const std::vector &hist, const std::size_t N, 38 | const std::size_t S, const std::size_t n); 39 | 40 | bool 41 | extrap_single_estimate(const bool VERBOSE, const bool allow_defects, 42 | const std::vector &hist, std::size_t max_terms, 43 | const int diagonal, const double step_size, 44 | const double max_extrap, 45 | std::vector &yield_estimate); 46 | 47 | void 48 | extrap_bootstrap(const bool VERBOSE, const bool allow_defects, 49 | const std::uint32_t seed, const std::vector &orig_hist, 50 | const std::size_t n_bootstraps, 51 | const std::size_t orig_max_terms, const int diagonal, 52 | const double bin_step_size, const double max_extrap, 53 | const std::size_t max_iter, 54 | std::vector> &bootstrap_estimates); 55 | 56 | void 57 | vector_median_and_ci( 58 | const std::vector> &bootstrap_estimates, 59 | const double ci_level, std::vector &yield_estimates, 60 | std::vector &lower_ci_lognorm, std::vector &upper_ci_lognorm); 61 | 62 | void 63 | write_predicted_complexity_curve( 64 | const std::string &outfile, const double c_level, const double step_size, 65 | const std::vector &yield_estimates, 66 | const std::vector &yield_lower_ci_lognorm, 67 | const std::vector &yield_upper_ci_lognorm); 68 | 69 | template 70 | T 71 | get_counts_from_hist(const std::vector &h) { 72 | T c = 0.0; 73 | for (auto i = 0u; i < std::size(h); ++i) 74 | c += i * h[i]; 75 | return c; 76 | } 77 | 78 | double 79 | factorial(double x); 80 | 81 | void 82 | resample_hist(std::mt19937 &gen, 83 | const std::vector &vals_hist_distinct_counts, 84 | const std::vector &distinct_counts_hist, 85 | std::vector &out_hist); 86 | 87 | void 88 | median_and_ci(std::vector estimates, // by val so we can sort them 89 | const double ci_level, double &median_estimate, 90 | double &lower_ci_estimate, double &upper_ci_estimate); 91 | 92 | template 93 | void 94 | multinomial(std::mt19937 &gen, const std::vector &mult_probs, 95 | uint_type trials, std::vector &result) { 96 | typedef std::binomial_distribution binom_dist; 97 | 98 | result.clear(); 99 | result.resize(std::size(mult_probs)); 100 | 101 | double remaining_prob = 102 | std::accumulate(std::begin(mult_probs), std::end(mult_probs), 0.0); 103 | 104 | auto r = std::begin(result); 105 | auto p = std::begin(mult_probs); 106 | 107 | while (p != std::end(mult_probs)) { // iterate to sample for each category 108 | *r = binom_dist(trials, (*p) / remaining_prob)(gen); // take the sample 109 | 110 | remaining_prob -= *p++; // update remaining probability mass 111 | trials -= *r++; // update remaining trials needed 112 | } 113 | 114 | if (trials > 0) 115 | throw std::runtime_error("multinomial sampling failed"); 116 | } 117 | 118 | template 119 | void 120 | report_histogram(const std::string &outfile, const H &h) { 121 | std::ofstream out(outfile); 122 | if (!out) 123 | throw std::runtime_error("failed to open output file: " + outfile); 124 | for (auto i = 0u; i < std::size(h); ++i) 125 | if (h[i] > 0) 126 | out << i << '\t' << static_cast(h[i]) << '\n'; 127 | } 128 | 129 | #endif // SRC_COMMON_HPP_ 130 | -------------------------------------------------------------------------------- /src/continued_fraction.cpp: 
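// A minimal usage sketch for the ContinuedFraction type implemented in this
// file, mirroring how extrap_single_estimate and extrapolate_curve (in
// common.cpp) drive it; the helper below and its histogram values are
// illustrative assumptions, not part of the preseq sources.

#include "continued_fraction.hpp"

#include <cmath>
#include <cstddef>
#include <vector>

// hist[j] = number of reads observed exactly j times (hypothetical values)
static double
sketch_extrapolation(const std::vector<double> &hist, const double t) {
  // alternating power-series coefficients, as built in extrap_single_estimate
  std::vector<double> ps_coeffs;
  for (std::size_t j = 1; j < hist.size(); ++j)
    ps_coeffs.push_back(hist[j] * std::pow(-1.0, j + 1));

  // diagonal 0; degree equal to the number of coefficients
  const ContinuedFraction cf(ps_coeffs, 0, ps_coeffs.size());

  // t is the relative growth of the sample: t * cf(t) approximates the
  // additional distinct reads gained by sequencing (1 + t) times as much,
  // which the callers add to the number of distinct reads already observed
  return cf.is_valid() ? t * cf(t) : 0.0;
}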
-------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Andrew D. Smith and Timothy Daley 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | */ 19 | 20 | #include "continued_fraction.hpp" 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | using std::fabs; 28 | using std::isfinite; 29 | using std::min; 30 | using std::pow; 31 | using std::vector; 32 | 33 | // ADS: the std::pow function is used frequently to get (-1)^x for 34 | // integer x. This doesn't make sense, and should be replaced at some 35 | // point. 36 | 37 | /* QUOTIENT DIFFERENCE ALGORITHM: compute continued fraction 38 | * coefficients vector for power series coefficients & vector for 39 | * continued fraction coefficients 40 | * 41 | * The negative sign for coefficients in the final loop is because we 42 | * evaluate a0/(1 + a1x/(1 + a2x/... while the algorithm is designed 43 | * for the a0/(1 - a1x/(1 - a2x/... see https://dlmf.nist.gov/3.10 44 | */ 45 | static void 46 | quotdiff_algorithm(const vector &ps_coeffs, vector &cf_coeffs) { 47 | const size_t depth = ps_coeffs.size(); // degree of power series 48 | 49 | // q_table[0] never used, and undefined 50 | vector> q_table(depth, vector(depth + 1, 0.0)); 51 | // q_table[1][j] = ratio of ps coefficients 52 | for (size_t j = 0; j < depth - 1; j++) 53 | q_table[1][j] = ps_coeffs[j + 1] / ps_coeffs[j]; 54 | 55 | // e_table[0] is always 0 56 | vector> e_table(depth, vector(depth + 1, 0.0)); 57 | // e_table[1] follows the general recurrence (same as in loop below) 58 | for (size_t j = 0; j < depth - 1; j++) 59 | e_table[1][j] = q_table[1][j + 1] - q_table[1][j] + e_table[0][j + 1]; 60 | 61 | // using intial values of E(i)(j)'s and Q(i)(j)'s, fill rest of the 62 | // q table and e table 63 | for (size_t i = 2; i < depth; i++) { 64 | for (size_t j = 0; j < depth; j++) 65 | q_table[i][j] = 66 | q_table[i - 1][j + 1] * e_table[i - 1][j + 1] / e_table[i - 1][j]; 67 | 68 | for (size_t j = 0; j < depth; j++) 69 | e_table[i][j] = q_table[i][j + 1] - q_table[i][j] + e_table[i - 1][j + 1]; 70 | } 71 | 72 | cf_coeffs.resize(depth); 73 | // first CT coefficient is first PS coefficient 74 | cf_coeffs[0] = ps_coeffs[0]; 75 | // set remaining CF coefficients from e and q table values 76 | for (size_t i = 1; i < depth; ++i) 77 | cf_coeffs[i] = (i % 2 == 0) ? 
-e_table[i / 2][0] : -q_table[(i + 1) / 2][0]; 78 | } 79 | 80 | /* compute CF coeffs when upper_offset > 0 above the diagonal; this 81 | * means degree of polynomial in numerator of Pade approximant is 82 | * greater than degree of polynomial in the denominator 83 | */ 84 | static void 85 | quotdiff_above_diagonal(const vector &ps_coeffs, const size_t offset, 86 | vector &cf_coeffs, 87 | vector &offset_coeffs) { 88 | // get the high order PS coeffs for approximation by CF 89 | vector high_ps_coeffs(begin(ps_coeffs) + offset, end(ps_coeffs)); 90 | 91 | // use QD algorithm to determine CF coefficients 92 | quotdiff_algorithm(high_ps_coeffs, cf_coeffs); 93 | 94 | // first "offset" coeffs are equal to PS coeffs 95 | offset_coeffs = ps_coeffs; 96 | offset_coeffs.resize(offset); 97 | } 98 | 99 | // calculate CF coeffs when lower_offset > 0 100 | static void 101 | quotdiff_below_diagonal(const vector &ps_coeffs, const size_t offset, 102 | vector &cf_coeffs, 103 | vector &offset_coeffs) { 104 | // need to work with reciprocal series g = 1/f, then invert 105 | vector recip_ps_coeffs(ps_coeffs.size()); 106 | recip_ps_coeffs[0] = 1.0 / ps_coeffs[0]; 107 | for (size_t i = 1; i < ps_coeffs.size(); ++i) { 108 | double x = 0.0; 109 | for (size_t j = 0; j < i; ++j) 110 | x += ps_coeffs[i - j] * recip_ps_coeffs[j]; 111 | 112 | recip_ps_coeffs[i] = -x / ps_coeffs[0]; 113 | } 114 | 115 | // qd to compute cf_coeffs using remaining coeffs 116 | vector high_recip_ps_coeffs(begin(recip_ps_coeffs) + offset, 117 | end(recip_ps_coeffs)); 118 | quotdiff_algorithm(high_recip_ps_coeffs, cf_coeffs); 119 | 120 | // set offset coeffs to 1st "offset" PS coeffs of 1/f (reciprocal) 121 | offset_coeffs = recip_ps_coeffs; 122 | offset_coeffs.resize(offset); 123 | } 124 | 125 | void 126 | truncate_degree(const size_t n_terms, ContinuedFraction &the_cf) { 127 | if (the_cf.degree < n_terms) { 128 | the_cf = ContinuedFraction(); 129 | } 130 | else { 131 | the_cf.ps_coeffs.resize(n_terms); 132 | the_cf.cf_coeffs.resize(n_terms - the_cf.offset_coeffs.size()); 133 | the_cf.degree = n_terms; 134 | } 135 | } 136 | 137 | ContinuedFraction::ContinuedFraction(const vector &ps_cf, const int di, 138 | const size_t dg) : 139 | ps_coeffs(ps_cf), diagonal_idx(di), degree(dg) { 140 | if (diagonal_idx == 0) 141 | quotdiff_algorithm(ps_coeffs, cf_coeffs); 142 | else if (diagonal_idx > 0) 143 | quotdiff_above_diagonal(ps_coeffs, diagonal_idx, cf_coeffs, offset_coeffs); 144 | else // if (cont_frac_estimate.lower_offset > 0) { 145 | quotdiff_below_diagonal(ps_coeffs, -diagonal_idx, cf_coeffs, offset_coeffs); 146 | // NOTE: negative sign "-" (-diagonal_idx > 0) for below diagonal 147 | } 148 | 149 | //////////////////////////////////////////////////////////////////////// 150 | //// FUNCTIONS TO EVALUATE CONTINUED FRACTIONS AT A POINT 151 | 152 | static double 153 | get_rescale_value(const double numerator, const double denominator) { 154 | static const double tolerance = 1e-20; // magic 155 | const double rescale_val = fabs(numerator) + fabs(denominator); 156 | if (rescale_val > 1.0 / tolerance) 157 | return 1.0 / rescale_val; 158 | else if (rescale_val < tolerance) 159 | return 1.0 / rescale_val; 160 | return 1.0; 161 | } 162 | 163 | /* calculate ContinuedFraction approx when there is no offset uses euler's 164 | * recursion 165 | */ 166 | static double 167 | evaluate_on_diagonal(const vector &cf_coeffs, const double val, 168 | const size_t depth) { 169 | // initialize 170 | double current_num = 0.0; 171 | double prev_num1 = cf_coeffs[0]; 172 | 
double prev_num2 = 0.0; 173 | 174 | double current_denom = 0.0; 175 | double prev_denom1 = 1.0; 176 | double prev_denom2 = 1.0; 177 | 178 | for (size_t i = 1; i < min(cf_coeffs.size(), depth); i++) { 179 | // calculate current values 180 | current_num = prev_num1 + cf_coeffs[i] * val * prev_num2; 181 | current_denom = prev_denom1 + cf_coeffs[i] * val * prev_denom2; 182 | 183 | // update previous values 184 | prev_num2 = prev_num1; 185 | prev_num1 = current_num; 186 | 187 | prev_denom2 = prev_denom1; 188 | prev_denom1 = current_denom; 189 | 190 | // now rescale all values 191 | const double rescale_val = get_rescale_value(current_num, current_denom); 192 | 193 | current_num *= rescale_val; 194 | current_denom *= rescale_val; 195 | 196 | prev_num1 *= rescale_val; 197 | prev_num2 *= rescale_val; 198 | 199 | prev_denom1 *= rescale_val; 200 | prev_denom2 *= rescale_val; 201 | } 202 | return current_num / current_denom; 203 | } 204 | 205 | static double 206 | evaluate_power_series(const vector &ps_coeffs, const double val) { 207 | double x = 0.0; 208 | for (size_t i = 0; i < ps_coeffs.size(); i++) 209 | x += ps_coeffs[i] * pow(val, i); 210 | return x; 211 | } 212 | 213 | /* evaluate CF when upper_offset > 0 using Euler's recursion */ 214 | static double 215 | evaluate_above_diagonal(const vector &cf_coeffs, 216 | const vector &offset_coeffs, const double val, 217 | const size_t depth) { 218 | const double cf_part = 219 | evaluate_on_diagonal(cf_coeffs, val, depth - offset_coeffs.size()); 220 | 221 | const double ps_part = evaluate_power_series(offset_coeffs, val); 222 | 223 | return ps_part + pow(val, offset_coeffs.size()) * cf_part; 224 | } 225 | 226 | // calculate ContinuedFraction approx when lower_offdiag > 0 227 | static double 228 | evaluate_below_diagonal(const vector &cf_coeffs, 229 | const vector &offset_coeffs, const double val, 230 | const size_t depth) { 231 | const double cf_part = 232 | evaluate_on_diagonal(cf_coeffs, val, depth - offset_coeffs.size()); 233 | 234 | const double ps_part = evaluate_power_series(offset_coeffs, val); 235 | 236 | // recall that if lower_offset > 0, we are working with 1/f, invert approx 237 | return 1.0 / (ps_part + pow(val, offset_coeffs.size()) * cf_part); 238 | } 239 | 240 | // evaluate CF at a given point 241 | double 242 | ContinuedFraction::operator()(const double val) const { 243 | if (diagonal_idx > 0) 244 | return evaluate_above_diagonal(cf_coeffs, offset_coeffs, val, degree); 245 | else if (diagonal_idx < 0) 246 | return evaluate_below_diagonal(cf_coeffs, offset_coeffs, val, degree); 247 | else 248 | return evaluate_on_diagonal(cf_coeffs, val, degree); 249 | } 250 | 251 | std::ostream & 252 | operator<<(std::ostream &the_stream, const ContinuedFraction &cf) { 253 | using std::ios_base; 254 | using std::setw; 255 | 256 | ios_base::fmtflags orig_flags = the_stream.flags(); 257 | the_stream.setf(ios_base::fixed, ios_base::floatfield); 258 | the_stream.precision(2); 259 | the_stream << "OFFSET_COEFFS" << '\t' << "PS_COEFFS" << '\n'; 260 | const size_t offset = cf.offset_coeffs.size(); 261 | for (size_t i = 0; i < offset; ++i) 262 | the_stream << setw(12) << cf.offset_coeffs[i] << '\t' << setw(12) 263 | << cf.ps_coeffs[i] << '\n'; 264 | the_stream << "CF_COEFFS" << '\n'; 265 | for (size_t i = 0; i < cf.cf_coeffs.size(); ++i) 266 | the_stream << setw(12) << cf.cf_coeffs[i] << '\t' << setw(12) 267 | << cf.ps_coeffs[i + offset] << '\n'; 268 | the_stream.flags(orig_flags); 269 | return the_stream; 270 | } 271 | 272 | // estimate yields by evaluating 
the CF at given points 273 | void 274 | ContinuedFraction::extrapolate_distinct(const double max_value, 275 | const double step_size, 276 | vector &estimates) const { 277 | estimates.clear(); 278 | estimates.push_back(0); 279 | for (double t = step_size; t <= max_value; t += step_size) 280 | estimates.push_back(t * operator()(t)); 281 | } 282 | 283 | //////////////////////////////////////////////////////////////////////// 284 | //////////////// CONTINUED FRACTION APPROXIMATION CLASS BELOW 285 | 286 | typedef ContinuedFractionApproximation CFA; 287 | 288 | const size_t CFA::min_allowed_degree = 4; 289 | const double CFA::search_max_val = 100; 290 | const double CFA::search_step_size = 0.05; 291 | 292 | /* check if a sequence of estimates are "stable": in [0, infty, 293 | * increasing, negative 2nd deriv 294 | */ 295 | bool 296 | check_yield_estimates_stability(const vector &estimates) { 297 | // require estimates are non-negative and finite 298 | for (size_t i = 0; i < estimates.size(); ++i) 299 | if (!std::isfinite(estimates[i]) || estimates[i] < 0.0) 300 | return false; 301 | 302 | // require estimate to be increasing 303 | for (size_t i = 1; i < estimates.size(); ++i) 304 | if (estimates[i] < estimates[i - 1]) 305 | return false; 306 | 307 | // require negative second derivative 308 | for (size_t i = 2; i < estimates.size(); ++i) 309 | if (estimates[i - 1] - estimates[i - 2] < estimates[i] - estimates[i - 1]) 310 | return false; 311 | 312 | return !estimates.empty(); 313 | } 314 | 315 | /* Finds the optimal number of terms (i.e. degree, depth, etc.) of the 316 | * continued fraction by checking for stability of estimates at 317 | * specific points for yield. New way for searching for optimal CF 318 | */ 319 | ContinuedFraction 320 | CFA::optimal_cont_frac_distinct(const vector &counts_hist) const { 321 | // we expect to use an underestimate, but this is dealt with outside 322 | // by ensuring we have an even number of max terms 323 | 324 | if (max_terms >= counts_hist.size()) 325 | return ContinuedFraction(); 326 | 327 | vector ps_coeffs; 328 | for (size_t j = 1; j <= max_terms; j++) 329 | ps_coeffs.push_back(counts_hist[j] * pow(-1.0, j + 1)); 330 | 331 | ContinuedFraction full_cf(ps_coeffs, diagonal_idx, max_terms); 332 | 333 | // if max terms in {3,4,5,6}, check only that degree 334 | if (max_terms >= 3 && max_terms <= 6) { 335 | vector estimates; 336 | full_cf.extrapolate_distinct(search_max_val, search_step_size, estimates); 337 | if (check_yield_estimates_stability(estimates)) 338 | return full_cf; 339 | } 340 | else { 341 | // if max terms >= 7, start at 7 and check increasing cont frac's 342 | for (size_t i = 7 + (max_terms % 2 == 0); i <= max_terms; i += 2) { 343 | ContinuedFraction trunc_cf(full_cf); 344 | truncate_degree(i, trunc_cf); 345 | vector estimates; 346 | trunc_cf.extrapolate_distinct(search_max_val, search_step_size, 347 | estimates); 348 | if (check_yield_estimates_stability(estimates)) 349 | return trunc_cf; 350 | } 351 | } 352 | // no stable continued fraction: return null 353 | return ContinuedFraction(); 354 | } 355 | -------------------------------------------------------------------------------- /src/continued_fraction.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Andrew D. 
Smith and Timothy Daley 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | */ 19 | 20 | #ifndef SRC_CONTINUED_FRACTION_HPP_ 21 | #define SRC_CONTINUED_FRACTION_HPP_ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | struct ContinuedFraction { 30 | // Constructors 31 | ContinuedFraction() : diagonal_idx(0), degree(0ul) {} 32 | ContinuedFraction(const std::vector &ps_cf, const int di, 33 | const size_t dg); 34 | 35 | // Evaluate the continued fraction 36 | double operator()(const double val) const; 37 | 38 | ////////////////////////////////////////// 39 | // Extrapolation functions 40 | 41 | // Evaluate the continued fraction estimating distinct 42 | // along a curve from 0 to max_value 43 | void extrapolate_distinct(const double max_value, const double step_size, 44 | std::vector &estimates) const; 45 | 46 | bool is_valid() const { return !cf_coeffs.empty(); } 47 | 48 | std::vector ps_coeffs; 49 | std::vector cf_coeffs; 50 | std::vector offset_coeffs; 51 | int diagonal_idx; 52 | size_t degree; 53 | }; 54 | 55 | // get continued fraction with lower degree 56 | void 57 | truncate_degree(const size_t truncated_degree, ContinuedFraction &cf); 58 | 59 | std::ostream & 60 | operator<<(std::ostream &out, const ContinuedFraction &cf); 61 | 62 | class ContinuedFractionApproximation { 63 | public: 64 | ContinuedFractionApproximation(const int di, const size_t mt) : 65 | diagonal_idx(di), max_terms(mt) {} 66 | 67 | // find best cont frac approx for estimating distinct 68 | ContinuedFraction 69 | optimal_cont_frac_distinct(const std::vector &counts_hist) const; 70 | 71 | private: 72 | int diagonal_idx; // the diagonal to work with for estimates 73 | size_t max_terms; // the maximum number of terms to try for a CF 74 | 75 | /* note: these never change */ 76 | static const size_t min_allowed_degree; 77 | 78 | // largest value to search for lowerbound and stability 79 | static const double search_max_val; 80 | 81 | // step size for search of lowerbound and stability 82 | static const double search_step_size; 83 | }; 84 | 85 | bool 86 | check_yield_estimates_stability(const std::vector &estimates); 87 | 88 | #endif // SRC_CONTINUED_FRACTION_HPP_ 89 | -------------------------------------------------------------------------------- /src/dnmt_error.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2023 Andrew D. Smith 2 | * 3 | * Authors: Andrew Smith 4 | * 5 | * This program is free software: you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation, either version 3 of the 8 | * License, or (at your option) any later version. 
9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License for more details. 14 | */ 15 | 16 | #ifndef SRC_DNMT_ERROR_HPP_ 17 | #define SRC_DNMT_ERROR_HPP_ 18 | 19 | #include // for int64_t 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | struct dnmt_error : public std::exception { 26 | std::int64_t err{}; // error possibly from HTSlib 27 | int the_errno{}; // ERRNO at time of construction 28 | std::string msg; // the message 29 | std::string the_what; // to report 30 | dnmt_error(const std::int64_t err, const std::string &msg) : 31 | err{err}, the_errno{errno}, msg{msg} { 32 | std::ostringstream oss; 33 | // clang-format off 34 | oss << "[error: " << err << "][" << "ERRNO: " << the_errno << "]" 35 | << "[" << strerror(the_errno) << "][" << msg << "]"; 36 | // clang-format on 37 | the_what = oss.str(); 38 | } 39 | explicit dnmt_error(const std::string &_msg) : dnmt_error(0, _msg) {} 40 | const char *what() const noexcept override { return the_what.data(); } 41 | }; 42 | 43 | #endif // SRC_DNMT_ERROR_HPP_ 44 | -------------------------------------------------------------------------------- /src/gc_extrap.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #include "gc_extrap.hpp" 22 | 23 | #include "common.hpp" 24 | #include "load_data_for_complexity.hpp" 25 | 26 | #include 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | using std::cbegin; 39 | using std::cend; 40 | using std::cerr; 41 | using std::endl; 42 | using std::min; 43 | using std::runtime_error; 44 | using std::size_t; 45 | using std::string; 46 | using std::uint32_t; 47 | using std::vector; 48 | 49 | // ADS: functions same, header different (above and this one) 50 | static void 51 | write_predicted_coverage_curve(const string &outfile, const double c_level, 52 | const double base_step_size, 53 | const size_t bin_size, 54 | const vector &cvrg_estimates, 55 | const vector &cvrg_lower_ci_lognorm, 56 | const vector &cvrg_upper_ci_lognorm) { 57 | static constexpr double one_hundred = 100.0; 58 | std::ofstream of; 59 | if (!outfile.empty()) 60 | of.open(outfile); 61 | std::ostream out(outfile.empty() ? 
std::cout.rdbuf() : of.rdbuf()); 62 | 63 | const double percentile = one_hundred * c_level; 64 | // clang-format off 65 | out << "TOTAL_BASES" << '\t' 66 | << "EXPECTED_COVERED_BASES" << '\t' 67 | << "LOWER_" << percentile << "%CI" << '\t' 68 | << "UPPER_" << percentile << "%CI" 69 | << endl; 70 | // clang-format on 71 | 72 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 73 | out.precision(1); 74 | 75 | out << 0 << '\t' << 0 << '\t' << 0 << '\t' << 0 << endl; 76 | for (size_t i = 0; i < cvrg_estimates.size(); ++i) 77 | out << (i + 1) * base_step_size << '\t' << cvrg_estimates[i] * bin_size 78 | << '\t' << cvrg_lower_ci_lognorm[i] * bin_size << '\t' 79 | << cvrg_upper_ci_lognorm[i] * bin_size << endl; 80 | } 81 | 82 | int 83 | gc_extrap_main(const int argc, const char *argv[]) { 84 | try { 85 | const size_t MIN_REQUIRED_COUNTS = 4; 86 | 87 | string outfile; 88 | string histogram_outfile; 89 | 90 | int diagonal = 0; 91 | size_t orig_max_terms = 100; 92 | size_t bin_size = 10; 93 | bool verbose = false; 94 | double base_step_size = 1.0e8; 95 | size_t max_width = 10000; 96 | bool SINGLE_ESTIMATE = false; 97 | double max_extrap = 1.0e12; 98 | size_t n_bootstraps = 100; 99 | uint32_t seed = 408; 100 | bool allow_defects = false; 101 | 102 | bool NO_SEQUENCE = false; 103 | double c_level = 0.95; 104 | #ifdef HAVE_HTSLIB 105 | bool BAM_FORMAT_INPUT = false; 106 | uint32_t n_threads{1}; 107 | #endif 108 | 109 | const string description = R"( 110 | Extrapolate the size of the covered genome by mapped reads. This 111 | approach is described in Daley & Smith (2014). The method is the same 112 | as for lc_extrap: using rational function approximation to a 113 | power-series expansion for the number of "unobserved" bases in the 114 | initial sample. The gc_extrap method is adapted to deal with 115 | individual nucleotides rather than distinct reads. 
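A typical invocation (illustrative; the file names are placeholders)
might look like:

  preseq gc_extrap -o coverage_yield.txt input.mr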
116 | )"; 117 | string program_name = std::filesystem::path(argv[0]).filename(); 118 | program_name += " " + string(argv[1]); 119 | 120 | // ********* GET COMMAND LINE ARGUMENTS FOR GC EXTRAP ********** 121 | OptionParser opt_parse(program_name, description, ""); 122 | opt_parse.add_opt("output", 'o', 123 | "coverage yield output file (default: stdout)", false, 124 | outfile); 125 | opt_parse.add_opt("max_width", 'w', 126 | "max fragment length, " 127 | "set equal to read length for single end reads", 128 | false, max_width); 129 | opt_parse.add_opt("bin_size", 'b', "bin size", false, bin_size); 130 | opt_parse.add_opt("extrap", 'e', "maximum extrapolation in base pairs", 131 | false, max_extrap); 132 | opt_parse.add_opt("step", 's', "step size in bases between extrapolations", 133 | false, base_step_size); 134 | opt_parse.add_opt("bootstraps", 'n', "number of bootstraps", false, 135 | n_bootstraps); 136 | opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, 137 | c_level); 138 | opt_parse.add_opt("terms", 'x', "maximum number of terms", false, 139 | orig_max_terms); 140 | opt_parse.add_opt("verbose", 'v', "print more information", false, verbose); 141 | opt_parse.add_opt("hist-out", '\0', "output histogram to this file", false, 142 | histogram_outfile); 143 | opt_parse.add_opt("bed", 'B', 144 | "input is in bed format without sequence information", 145 | false, NO_SEQUENCE); 146 | opt_parse.add_opt("quick", 'Q', 147 | "quick mode: run gc_extrap without " 148 | "bootstrapping for confidence intervals", 149 | false, SINGLE_ESTIMATE); 150 | opt_parse.add_opt("defects", 'D', 151 | "defects mode to extrapolate without testing for defects", 152 | false, allow_defects); 153 | #ifdef HAVE_HTSLIB 154 | opt_parse.add_opt("bam", '\0', "input is in BAM format", false, 155 | BAM_FORMAT_INPUT); 156 | opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", 157 | false, n_threads); 158 | #endif 159 | opt_parse.add_opt("seed", 'r', "seed for random number generator", false, 160 | seed); 161 | opt_parse.set_show_defaults(); 162 | 163 | vector leftover_args; 164 | opt_parse.parse(argc - 1, argv + 1, leftover_args); 165 | if (argc == 2 || opt_parse.help_requested()) { 166 | cerr << opt_parse.help_message() << endl; 167 | cerr << opt_parse.about_message() << endl; 168 | return EXIT_SUCCESS; 169 | } 170 | if (opt_parse.option_missing()) { 171 | cerr << opt_parse.option_missing_message() << endl; 172 | return EXIT_SUCCESS; 173 | } 174 | if (leftover_args.empty()) { 175 | cerr << opt_parse.help_message() << endl; 176 | return EXIT_SUCCESS; 177 | } 178 | const string infile = leftover_args.front(); 179 | // **************************************************************** 180 | 181 | vector coverage_hist; 182 | size_t n_reads = 0; 183 | if (verbose) 184 | cerr << "LOADING READS" << endl; 185 | 186 | if (NO_SEQUENCE) { 187 | if (verbose) 188 | cerr << "BED FORMAT" << endl; 189 | n_reads = load_coverage_counts_GR(infile, seed, bin_size, max_width, 190 | coverage_hist); 191 | } 192 | #ifdef HAVE_HTSLIB 193 | else if (BAM_FORMAT_INPUT) { 194 | if (verbose) 195 | cerr << "BAM_INPUT" << endl; 196 | n_reads = load_coverage_counts_BAM(n_threads, infile, seed, bin_size, 197 | max_width, coverage_hist); 198 | } 199 | #endif 200 | else { 201 | if (verbose) 202 | cerr << "MAPPED READ FORMAT" << endl; 203 | n_reads = load_coverage_counts_MR(infile, seed, bin_size, max_width, 204 | coverage_hist); 205 | } 206 | 207 | const double total_bins = get_counts_from_hist(coverage_hist); 208 | 209 
| const double distinct_bins = 210 | accumulate(cbegin(coverage_hist), cend(coverage_hist), 0.0); 211 | 212 | const double avg_bins_per_read = total_bins / n_reads; 213 | const double bin_step_size = base_step_size / bin_size; 214 | 215 | const size_t max_observed_count = coverage_hist.size() - 1; 216 | 217 | // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE 218 | size_t first_zero = 1; 219 | while (first_zero < coverage_hist.size() && coverage_hist[first_zero] > 0) 220 | ++first_zero; 221 | 222 | orig_max_terms = min(orig_max_terms, first_zero - 1); 223 | 224 | if (verbose) 225 | cerr << "TOTAL READS = " << n_reads << endl 226 | << "BASE STEP SIZE = " << base_step_size << endl 227 | << "BIN STEP SIZE = " << bin_step_size << endl 228 | << "TOTAL BINS = " << total_bins << endl 229 | << "BINS PER READ = " << avg_bins_per_read << endl 230 | << "DISTINCT BINS = " << distinct_bins << endl 231 | << "TOTAL BASES = " << total_bins * bin_size << endl 232 | << "TOTAL COVERED BASES = " << distinct_bins * bin_size << endl 233 | << "MAX COVERAGE COUNT = " << max_observed_count << endl 234 | << "COUNTS OF 1 = " << coverage_hist[1] << endl; 235 | 236 | if (!histogram_outfile.empty()) 237 | report_histogram(histogram_outfile, coverage_hist); 238 | 239 | // catch if all reads are distinct 240 | if (orig_max_terms < MIN_REQUIRED_COUNTS) 241 | throw runtime_error("max count before zero is les than min required " 242 | "count (4), sample not sufficiently deep or " 243 | "duplicates removed"); 244 | 245 | // check to make sure library is not overly saturated 246 | const double two_fold_extrap = GoodToulmin2xExtrap(coverage_hist); 247 | if (two_fold_extrap < 0.0) 248 | throw runtime_error("Library expected to saturate in doubling of " 249 | "experiment size, unable to extrapolate"); 250 | 251 | if (verbose) 252 | cerr << "[ESTIMATING COVERAGE CURVE]" << endl; 253 | 254 | vector coverage_estimates; 255 | 256 | if (SINGLE_ESTIMATE) { 257 | bool SINGLE_ESTIMATE_SUCCESS = extrap_single_estimate( 258 | verbose, allow_defects, coverage_hist, orig_max_terms, diagonal, 259 | bin_step_size, max_extrap / bin_size, coverage_estimates); 260 | // IF FAILURE, EXIT 261 | if (!SINGLE_ESTIMATE_SUCCESS) 262 | throw runtime_error("SINGLE ESTIMATE FAILED, NEED TO RUN IN " 263 | "FULL MODE FOR ESTIMATES"); 264 | 265 | std::ofstream of; 266 | if (!outfile.empty()) 267 | of.open(outfile); 268 | std::ostream out(outfile.empty() ? 
std::cout.rdbuf() : of.rdbuf()); 269 | 270 | out << "TOTAL_BASES\tEXPECTED_DISTINCT" << endl; 271 | 272 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 273 | out.precision(1); 274 | 275 | out << 0 << '\t' << 0 << endl; 276 | for (size_t i = 0; i < coverage_estimates.size(); ++i) 277 | out << (i + 1) * base_step_size << '\t' 278 | << coverage_estimates[i] * bin_size << endl; 279 | } 280 | else { 281 | if (verbose) 282 | cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl; 283 | 284 | const size_t max_iter = 10 * n_bootstraps; 285 | 286 | vector> bootstrap_estimates; 287 | extrap_bootstrap(verbose, allow_defects, seed, coverage_hist, 288 | n_bootstraps, orig_max_terms, diagonal, bin_step_size, 289 | max_extrap / bin_size, max_iter, bootstrap_estimates); 290 | 291 | if (verbose) 292 | cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl; 293 | vector coverage_upper_ci_lognorm, coverage_lower_ci_lognorm; 294 | vector_median_and_ci(bootstrap_estimates, c_level, coverage_estimates, 295 | coverage_lower_ci_lognorm, 296 | coverage_upper_ci_lognorm); 297 | 298 | if (verbose) 299 | cerr << "[WRITING OUTPUT]" << endl; 300 | 301 | write_predicted_coverage_curve( 302 | outfile, c_level, base_step_size, bin_size, coverage_estimates, 303 | coverage_lower_ci_lognorm, coverage_upper_ci_lognorm); 304 | } 305 | } 306 | catch (const std::exception &e) { 307 | cerr << e.what() << endl; 308 | return EXIT_FAILURE; 309 | } 310 | return EXIT_SUCCESS; 311 | } 312 | -------------------------------------------------------------------------------- /src/gc_extrap.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #ifndef SRC_GC_EXTRAP_HPP_ 22 | #define SRC_GC_EXTRAP_HPP_ 23 | 24 | int 25 | gc_extrap_main(const int argc, const char *argv[]); 26 | 27 | #endif // SRC_GC_EXTRAP_HPP_ 28 | -------------------------------------------------------------------------------- /src/lc_extrap.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #include "lc_extrap.hpp" 22 | 23 | #include "common.hpp" 24 | #include "load_data_for_complexity.hpp" 25 | 26 | #include 27 | 28 | #include 29 | #include // std::size_t 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | using std::begin; 40 | using std::cbegin; 41 | using std::cend; 42 | using std::cerr; 43 | using std::end; 44 | using std::endl; 45 | using std::runtime_error; 46 | using std::size_t; 47 | using std::string; 48 | using std::to_string; 49 | using std::uint32_t; 50 | using std::vector; 51 | 52 | int 53 | lc_extrap_main(const int argc, const char **argv) { 54 | try { 55 | static const size_t min_required_counts = 4; 56 | static const string min_required_counts_error_message = 57 | "max count before zero is less than min required count (" + 58 | to_string(min_required_counts) + ") duplicates removed"; 59 | 60 | string outfile; 61 | string histogram_outfile; 62 | 63 | size_t orig_max_terms = 100; 64 | double max_extrap = 1.0e10; 65 | double step_size = 1e6; 66 | size_t n_bootstraps = 100; 67 | int diagonal = 0; 68 | double c_level = 0.95; 69 | uint32_t seed = 408; 70 | 71 | /* FLAGS */ 72 | bool verbose = false; 73 | bool VALS_INPUT = false; 74 | bool PAIRED_END = false; 75 | bool HIST_INPUT = false; 76 | bool SINGLE_ESTIMATE = false; 77 | bool allow_defects = false; 78 | 79 | #ifdef HAVE_HTSLIB 80 | bool BAM_FORMAT_INPUT = false; 81 | size_t MAX_SEGMENT_LENGTH = 5000; 82 | uint32_t n_threads{1}; 83 | #endif 84 | 85 | const string description = 86 | R"( 87 | Extrapolate the complexity of a library. This is the approach 88 | described in Daley & Smith (2013). The method applies rational 89 | function approximation via continued fractions with the 90 | original goal of estimating the number of distinct reads that a 91 | sequencing library would yield upon deeper sequencing. This 92 | method has been used for many different purposes since then. 
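A typical invocation (illustrative; the file name is a placeholder)
might look like:

  preseq lc_extrap -o future_yield.txt input.bed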
93 | )"; 94 | string program_name = std::filesystem::path(argv[0]).filename(); 95 | program_name += " " + string(argv[1]); 96 | 97 | /********** GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/ 98 | 99 | OptionParser opt_parse(program_name, description, ""); 100 | opt_parse.add_opt("output", 'o', "yield output file (default: stdout)", 101 | false, outfile); 102 | opt_parse.add_opt("extrap", 'e', "maximum extrapolation", false, 103 | max_extrap); 104 | opt_parse.add_opt("step", 's', "extrapolation step size", false, step_size); 105 | opt_parse.add_opt("boots", 'n', "number of bootstraps", false, 106 | n_bootstraps); 107 | opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, 108 | c_level); 109 | opt_parse.add_opt("terms", 'x', "maximum terms in estimator", false, 110 | orig_max_terms); 111 | opt_parse.add_opt("verbose", 'v', "print more info", false, verbose); 112 | #ifdef HAVE_HTSLIB 113 | opt_parse.add_opt("bam", 'B', "input is in BAM format", false, 114 | BAM_FORMAT_INPUT); 115 | opt_parse.add_opt("seg_len", 'l', 116 | "maximum segment length when merging " 117 | "paired end bam reads", 118 | false, MAX_SEGMENT_LENGTH); 119 | opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", 120 | false, n_threads); 121 | #endif 122 | opt_parse.add_opt("pe", 'P', "input is paired end read file", false, 123 | PAIRED_END); 124 | opt_parse.add_opt( 125 | "vals", 'V', "input is a text file containing only the observed counts", 126 | false, VALS_INPUT); 127 | opt_parse.add_opt("hist", 'H', 128 | "input is a text file containing the observed histogram", 129 | false, HIST_INPUT); 130 | opt_parse.add_opt("hist-out", '\0', 131 | "output histogram to this file (for non-hist input)", 132 | false, histogram_outfile); 133 | opt_parse.add_opt("quick", 'Q', 134 | "quick mode (no bootstraps) for confidence intervals", 135 | false, SINGLE_ESTIMATE); 136 | opt_parse.add_opt("defects", 'D', "no testing for defects", false, 137 | allow_defects); 138 | opt_parse.add_opt("seed", 'r', "seed for random number generator", false, 139 | seed); 140 | opt_parse.set_show_defaults(); 141 | vector leftover_args; 142 | opt_parse.parse(argc - 1, argv + 1, leftover_args); 143 | if (argc == 2 || opt_parse.help_requested()) { 144 | cerr << opt_parse.help_message() << endl; 145 | cerr << opt_parse.about_message() << endl; 146 | return EXIT_SUCCESS; 147 | } 148 | if (opt_parse.option_missing()) { 149 | cerr << opt_parse.option_missing_message() << endl; 150 | return EXIT_SUCCESS; 151 | } 152 | if (leftover_args.empty()) { 153 | cerr << opt_parse.help_message() << endl; 154 | return EXIT_SUCCESS; 155 | } 156 | const string input_file_name = leftover_args.front(); 157 | /******************************************************************/ 158 | 159 | vector counts_hist; 160 | size_t n_reads = 0; 161 | 162 | /************ loading input ***************************************/ 163 | if (HIST_INPUT) { 164 | if (verbose) 165 | cerr << "HIST_INPUT" << endl; 166 | n_reads = load_histogram(input_file_name, counts_hist); 167 | } 168 | else if (VALS_INPUT) { 169 | if (verbose) 170 | cerr << "VALS_INPUT" << endl; 171 | n_reads = load_counts(input_file_name, counts_hist); 172 | } 173 | #ifdef HAVE_HTSLIB 174 | else if (BAM_FORMAT_INPUT) { 175 | if (PAIRED_END) { 176 | if (verbose) 177 | cerr << "PAIRED_END_BAM_INPUT" << endl; 178 | n_reads = load_counts_BAM_pe(n_threads, input_file_name, counts_hist); 179 | } 180 | else { // single end 181 | if (verbose) 182 | cerr << "BAM_INPUT" << endl; 183 | n_reads = 
load_counts_BAM_se(n_threads, input_file_name, counts_hist); 184 | } 185 | } 186 | #endif 187 | else if (PAIRED_END) { 188 | if (verbose) 189 | cerr << "PAIRED_END_BED_INPUT" << endl; 190 | n_reads = load_counts_BED_pe(input_file_name, counts_hist); 191 | } 192 | else { // default is single end bed file 193 | if (verbose) 194 | cerr << "BED_INPUT" << endl; 195 | n_reads = load_counts_BED_se(input_file_name, counts_hist); 196 | } 197 | /************ done loading input **********************************/ 198 | 199 | const size_t max_observed_count = counts_hist.size() - 1; 200 | const double distinct_reads = 201 | std::accumulate(cbegin(counts_hist), cend(counts_hist), 0.0); 202 | 203 | // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE 204 | size_t first_zero = 1; 205 | while (first_zero < counts_hist.size() && counts_hist[first_zero] > 0) 206 | ++first_zero; 207 | 208 | // make sure the max terms is at most one less than the first zero 209 | orig_max_terms = std::min(orig_max_terms, first_zero - 1); 210 | orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1); 211 | 212 | const size_t distinct_counts = 213 | std::count_if(cbegin(counts_hist), cend(counts_hist), 214 | [](const double x) { return x > 0.0; }); 215 | 216 | if (verbose) 217 | cerr << "TOTAL READS = " << n_reads << endl 218 | << "DISTINCT READS = " << distinct_reads << endl 219 | << "DISTINCT COUNTS = " << distinct_counts << endl 220 | << "MAX COUNT = " << max_observed_count << endl 221 | << "COUNTS OF 1 = " << counts_hist[1] << endl 222 | << "MAX TERMS = " << orig_max_terms << endl; 223 | 224 | if (!histogram_outfile.empty()) 225 | report_histogram(histogram_outfile, counts_hist); 226 | 227 | // check to make sure library is not overly saturated 228 | const double two_fold_extrap = GoodToulmin2xExtrap(counts_hist); 229 | if (two_fold_extrap < 0.0) 230 | throw runtime_error("Saturation expected at double initial sample size. " 231 | "Unable to extrapolate."); 232 | 233 | // check that min required count is satisfied 234 | if (orig_max_terms < min_required_counts) 235 | throw runtime_error(min_required_counts_error_message); 236 | 237 | if (verbose) 238 | cerr << "[ESTIMATING YIELD CURVE]" << endl; 239 | vector yield_estimates; 240 | 241 | if (SINGLE_ESTIMATE) { 242 | const bool single_estimate_success = extrap_single_estimate( 243 | verbose, allow_defects, counts_hist, orig_max_terms, diagonal, 244 | step_size, max_extrap, yield_estimates); 245 | // exit on failure 246 | if (!single_estimate_success) 247 | throw runtime_error( 248 | "single estimate failed, run full mode for estimates"); 249 | 250 | std::ofstream of; 251 | if (!outfile.empty()) 252 | of.open(outfile); 253 | std::ostream out(outfile.empty() ? 
std::cout.rdbuf() : of.rdbuf()); 254 | 255 | out << "TOTAL_READS\tEXPECTED_DISTINCT" << endl; 256 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 257 | out.precision(1); 258 | 259 | out << 0 << '\t' << 0 << endl; 260 | for (size_t i = 0; i < yield_estimates.size(); ++i) 261 | out << (i + 1) * step_size << '\t' << yield_estimates[i] << endl; 262 | } 263 | else { 264 | if (verbose) 265 | cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl; 266 | 267 | const size_t max_iter = 100 * n_bootstraps; 268 | 269 | vector> bootstrap_estimates; 270 | extrap_bootstrap(verbose, allow_defects, seed, counts_hist, n_bootstraps, 271 | orig_max_terms, diagonal, step_size, max_extrap, 272 | max_iter, bootstrap_estimates); 273 | 274 | if (verbose) 275 | cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl; 276 | // yield ci 277 | vector yield_upper_ci_lognorm, yield_lower_ci_lognorm; 278 | vector_median_and_ci(bootstrap_estimates, c_level, yield_estimates, 279 | yield_lower_ci_lognorm, yield_upper_ci_lognorm); 280 | 281 | if (verbose) 282 | cerr << "[WRITING OUTPUT]" << endl; 283 | 284 | write_predicted_complexity_curve(outfile, c_level, step_size, 285 | yield_estimates, yield_lower_ci_lognorm, 286 | yield_upper_ci_lognorm); 287 | } 288 | } 289 | catch (const std::exception &e) { 290 | cerr << e.what() << endl; 291 | return EXIT_FAILURE; 292 | } 293 | return EXIT_SUCCESS; 294 | } 295 | -------------------------------------------------------------------------------- /src/lc_extrap.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #ifndef SRC_LC_EXTRAP_HPP_ 22 | #define SRC_LC_EXTRAP_HPP_ 23 | 24 | int 25 | lc_extrap_main(const int argc, const char *argv[]); 26 | 27 | #endif // SRC_LC_EXTRAP_HPP_ 28 | -------------------------------------------------------------------------------- /src/load_data_for_complexity.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2014 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Andrew D. Smith and Timothy Daley 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | */ 19 | 20 | #ifndef SRC_LOAD_DATA_FOR_COMPLEXITY_HPP_ 21 | #define SRC_LOAD_DATA_FOR_COMPLEXITY_HPP_ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | std::size_t 29 | load_coverage_counts_MR(const std::string &input_file_name, 30 | const std::uint32_t seed, const std::size_t bin_size, 31 | const std::size_t max_width, 32 | std::vector &coverage_hist); 33 | 34 | std::size_t 35 | load_coverage_counts_GR(const std::string &input_file_name, 36 | const std::uint32_t seed, const std::size_t bin_size, 37 | const std::size_t max_width, 38 | std::vector &coverage_hist); 39 | 40 | std::size_t 41 | load_histogram(const std::string &filename, std::vector &counts_hist); 42 | 43 | std::size_t 44 | load_counts(const std::string &input_file_name, 45 | std::vector &counts_hist); 46 | 47 | std::size_t 48 | load_counts_BED_pe(const std::string &input_file_name, 49 | std::vector &counts_hist); 50 | 51 | std::size_t 52 | load_counts_BED_se(const std::string &input_file_name, 53 | std::vector &counts_hist); 54 | 55 | #ifdef HAVE_HTSLIB 56 | std::size_t 57 | load_counts_BAM_pe(const std::uint32_t n_threads, 58 | const std::string &input_file_name, 59 | std::vector &counts_hist); 60 | 61 | std::size_t 62 | load_counts_BAM_se(const std::uint32_t n_threads, 63 | const std::string &input_file_name, 64 | std::vector &counts_hist); 65 | 66 | std::size_t 67 | load_coverage_counts_BAM(const std::uint32_t n_threads, 68 | const std::string &input_file_name, 69 | const std::uint32_t seed, const std::size_t bin_size, 70 | const std::size_t max_width, 71 | std::vector &coverage_hist); 72 | 73 | #endif // HAVE_HTSLIB 74 | 75 | #endif // SRC_LOAD_DATA_FOR_COMPLEXITY_HPP_ 76 | -------------------------------------------------------------------------------- /src/moment_sequence.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2015 2 | * University of Southern California and 3 | * Andrew D. Smith and Timothy Daley 4 | * 5 | * Authors: Andrew D. Smith and Timothy Daley 6 | * 7 | * This program is free software: you can redistribute it and/or 8 | * modify it under the terms of the GNU General Public License as 9 | * published by the Free Software Foundation, either version 3 of the 10 | * License, or (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, but 13 | * WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | * General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this program. If not, see 19 | * . 
20 | */ 21 | 22 | #include "moment_sequence.hpp" 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include // std::swap 33 | #include 34 | 35 | using std::begin; 36 | using std::cbegin; 37 | using std::cend; 38 | using std::cerr; 39 | using std::endl; 40 | using std::find_if; 41 | using std::isfinite; 42 | using std::isinf; 43 | using std::max; 44 | using std::setprecision; 45 | using std::string; 46 | using std::swap; 47 | using std::transform; 48 | using std::vector; 49 | 50 | void 51 | LU_decomp(vector> &A, vector &P) { 52 | const size_t N = A.size(); 53 | double absA{}; 54 | size_t i, k; 55 | 56 | P.clear(); 57 | for (size_t x = 0; x <= N; x++) 58 | P.push_back(x); 59 | 60 | for (i = 0; i < N; i++) { 61 | double maxA = 0.0; 62 | size_t imax = i; 63 | 64 | for (k = i; k < N; k++) 65 | if ((absA = fabs(A[k][i])) > maxA) { 66 | maxA = absA; 67 | imax = k; 68 | } 69 | 70 | if (imax != i) { 71 | // pivoting P 72 | size_t j = P[i]; 73 | P[i] = P[imax]; 74 | P[imax] = j; 75 | 76 | // pivoting rows of A 77 | vector ptr(A[i]); 78 | A[i] = A[imax]; 79 | A[imax] = ptr; 80 | 81 | // counting pivots starting from N (for determinant) 82 | P[N]++; 83 | } 84 | 85 | for (size_t j = i + 1; j < N; j++) { 86 | A[j][i] /= A[i][i]; 87 | 88 | for (k = i + 1; k < N; k++) 89 | A[j][k] -= A[j][i] * A[i][k]; 90 | } 91 | } 92 | } 93 | 94 | double 95 | LU_determinant(const vector> &A, const vector &P) { 96 | const size_t N = A.size(); 97 | 98 | double det = A[0][0]; 99 | for (size_t i = 1; i < N; ++i) 100 | det *= A[i][i]; 101 | 102 | if ((P[N] - N) % 2 == 0) 103 | return det; 104 | 105 | return -det; 106 | } 107 | 108 | ///////////////////////////////////////////////////// 109 | // test Hankel moment matrix 110 | // ensure moment sequence is positive definite 111 | // truncate moment sequence to ensure pos def 112 | size_t 113 | ensure_pos_def_mom_seq(vector &moments, const double tolerance, 114 | const bool VERBOSE) { 115 | const size_t min_hankel_dim = 1; 116 | size_t hankel_dim = 2; 117 | if (moments.size() < 2 * hankel_dim) { 118 | if (VERBOSE) 119 | cerr << "too few moments" << endl; 120 | return min_hankel_dim; 121 | } 122 | 123 | while (2 * hankel_dim - 1 < moments.size()) { 124 | vector> hankel_mat(hankel_dim, 125 | vector(hankel_dim, 0.0)); 126 | for (size_t c_idx = 0; c_idx < hankel_dim; c_idx++) 127 | for (size_t r_idx = 0; r_idx < hankel_dim; r_idx++) 128 | hankel_mat[c_idx][r_idx] = moments[c_idx + r_idx]; 129 | 130 | vector perm; 131 | LU_decomp(hankel_mat, perm); 132 | const double hankel_mat_det = LU_determinant(hankel_mat, perm); 133 | 134 | vector> shift_hankel_matrix(hankel_dim, 135 | vector(hankel_dim, 0.0)); 136 | for (size_t c_idx = 0; c_idx < hankel_dim; c_idx++) 137 | for (size_t r_idx = 0; r_idx < hankel_dim; r_idx++) 138 | shift_hankel_matrix[c_idx][r_idx] = moments[c_idx + r_idx + 1]; 139 | 140 | vector s_perm; 141 | LU_decomp(shift_hankel_matrix, s_perm); 142 | const double shift_hankel_mat_det = 143 | LU_determinant(shift_hankel_matrix, s_perm); 144 | 145 | if (VERBOSE) { 146 | cerr << "dim" << '\t' << "hankel_det" << '\t' << "shifted_hankel_det" 147 | << endl; 148 | cerr << hankel_dim << '\t' << hankel_mat_det << '\t' 149 | << shift_hankel_mat_det << endl; 150 | } 151 | 152 | if (hankel_mat_det > tolerance && shift_hankel_mat_det > tolerance) { 153 | hankel_dim++; 154 | } 155 | else { 156 | hankel_dim--; 157 | moments.resize(2 * hankel_dim); 158 | return hankel_dim; 159 | } 160 | } 161 | 162 | return 
max(hankel_dim - 1, min_hankel_dim); 163 | } 164 | 165 | ///////////////////////////////////////////////////// 166 | // 3 term relations 167 | 168 | // check 3 term recurrence to avoid non-positive elements 169 | // truncate if non-positive element found 170 | static void 171 | check_three_term_relation(vector &a, vector &b) { 172 | // abort if first entry is zero or smaller 173 | if (a[0] <= 0.0) { 174 | a.clear(); 175 | b.clear(); 176 | } 177 | 178 | for (size_t i = 0; i < b.size(); i++) 179 | // ADS: some strange logic here 180 | if (b[i] <= 0.0 || !isfinite(b[i]) || a[i + 1] <= 0.0 || 181 | !isfinite(a[i + 1])) { 182 | b.resize(i); 183 | a.resize(i + 1); 184 | break; 185 | } 186 | } 187 | 188 | // check the moment sequence to avoid non-positive elements and 189 | // truncate at first non-positive element if found 190 | static void 191 | check_moment_sequence(vector &obs_moms) { 192 | if (obs_moms[0] <= 0.0 || !isfinite(obs_moms[0])) 193 | obs_moms.clear(); 194 | 195 | for (size_t i = 1; i < obs_moms.size(); i++) { 196 | if (obs_moms[i] <= 0.0 || !isfinite(obs_moms[i])) { 197 | obs_moms.resize(i + 1); 198 | break; 199 | } 200 | } 201 | } 202 | 203 | void 204 | MomentSequence::unmodified_Chebyshev() { 205 | const size_t n_points = static_cast(floor(moments.size() / 2)); 206 | vector a(n_points, 0.0); 207 | vector b(n_points - 1, 0.0); 208 | 209 | vector> sigma(2 * n_points, vector(2 * n_points, 0.0)); 210 | // initialization 211 | a[0] = moments[1] / moments[0]; 212 | // sigma[-1][l] = 0 213 | for (size_t l = 0; l < 2 * n_points; l++) 214 | sigma[0][l] = moments[l]; 215 | 216 | for (size_t k = 1; k <= n_points; k++) { 217 | for (size_t l = k; l < 2 * n_points - k; l++) { 218 | sigma[k][l] = sigma[k - 1][l + 1] - a[k - 1] * sigma[k - 1][l]; 219 | if (k > 1) 220 | sigma[k][l] -= b[k - 2] * sigma[k - 2][l]; 221 | } 222 | if (k != n_points) { 223 | a[k] = 224 | sigma[k][k + 1] / sigma[k][k] - sigma[k - 1][k] / sigma[k - 1][k - 1]; 225 | b[k - 1] = sigma[k][k] / sigma[k - 1][k - 1]; 226 | } 227 | } 228 | 229 | alpha = a; 230 | beta = b; 231 | } 232 | 233 | // un-normalized 3 term recurrence 234 | void 235 | MomentSequence::full_3term_recurrence(vector &full_alpha, 236 | vector &full_beta) { 237 | const size_t n_points = std::size(moments) / 2; 238 | vector a(n_points, 0.0); 239 | vector b(n_points - 1, 0.0); 240 | 241 | vector> sigma(2 * n_points, vector(2 * n_points, 0.0)); 242 | // initialization 243 | a[0] = moments[1] / moments[0]; 244 | // sigma[-1][l] = 0 245 | for (size_t l = 0; l < 2 * n_points; l++) 246 | sigma[0][l] = moments[l]; 247 | 248 | for (size_t k = 1; k <= n_points; k++) { 249 | for (size_t l = k; l < 2 * n_points - k; l++) { 250 | sigma[k][l] = sigma[k - 1][l + 1] - a[k - 1] * sigma[k - 1][l]; 251 | if (k > 1) 252 | sigma[k][l] -= b[k - 2] * sigma[k - 2][l]; 253 | } 254 | if (k != n_points) { 255 | a[k] = 256 | sigma[k][k + 1] / sigma[k][k] - sigma[k - 1][k] / sigma[k - 1][k - 1]; 257 | b[k - 1] = sigma[k][k] / sigma[k - 1][k - 1]; 258 | } 259 | } 260 | 261 | full_alpha.swap(a); 262 | full_beta.swap(b); 263 | } 264 | 265 | //////////////////////////////////////////////////// 266 | // Constructor 267 | 268 | MomentSequence::MomentSequence(const vector &obs_moms) : 269 | moments(obs_moms) { 270 | vector holding_moms(moments); 271 | // make sure the moments are all positive 272 | check_moment_sequence(holding_moms); 273 | moments = holding_moms; 274 | 275 | // calculate 3-term recurrence 276 | unmodified_Chebyshev(); 277 | } 278 | 279 | 
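// For intuition about the moment checks in this file: a moment
// sequence coming from a distribution supported on only k points
// stops being strictly positive definite beyond dimension k, which
// is what ensure_pos_def_mom_seq() detects through the Hankel
// determinants. Worked example: equal mass on {1, 2} has moments
// m = (1, 1.5, 2.5, 4.5, 8.5, ...), so
//   det [[1, 1.5], [1.5, 2.5]] = 0.25 and the shifted
//   det [[1.5, 2.5], [2.5, 4.5]] = 0.5 are both positive, while
//   det [[1, 1.5, 2.5], [1.5, 2.5, 4.5], [2.5, 4.5, 8.5]] = 0;
// with a small positive tolerance the sequence would therefore be
// truncated at hankel_dim = 2 (moments resized to 4 entries).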
///////////////////////////////////////////////////// 280 | // Quadrature Methods 281 | 282 | // one iteration of QR: 283 | // following eq's 3.3 of Golub & Welsh 284 | // one iteration is Z_N-1*Z_N-2*...*Z_1*X*Z_1*...*Z_N-1 285 | // Z_j is givens matrix to zero out the j+1,j'th element of X 286 | static void 287 | QRiteration(vector &alpha, vector &beta, 288 | vector &weights) { 289 | // initialize variables 290 | vector sin_theta(alpha.size(), 0.0); 291 | vector cos_theta(alpha.size(), 0.0); 292 | 293 | vector a(alpha.size(), 0.0); 294 | vector a_bar(alpha.size(), 0.0); 295 | a_bar[0] = alpha[0]; 296 | 297 | vector b(beta); 298 | vector b_bar(alpha.size(), 0.0); 299 | b_bar[0] = alpha[0]; 300 | vector b_tilde(alpha.size(), 0.0); 301 | b_tilde[0] = beta[0]; 302 | 303 | vector d(alpha.size(), 0.0); 304 | d[0] = beta[0]; 305 | 306 | vector z(weights); 307 | vector z_bar(weights.size(), 0.0); 308 | z_bar[0] = z[0]; 309 | 310 | for (size_t j = 0; j < alpha.size() - 1; j++) { 311 | // for d and b_bar, j here is j-1 in G&W 312 | if (d[j] == 0.0 && b_bar[j] == 0.0) { 313 | sin_theta[j] = 0.0; 314 | cos_theta[j] = 1.0; 315 | } 316 | else { 317 | sin_theta[j] = d[j] / sqrt(d[j] * d[j] + b_bar[j] * b_bar[j]); 318 | cos_theta[j] = b_bar[j] / sqrt(d[j] * d[j] + b_bar[j] * b_bar[j]); 319 | } 320 | 321 | a[j] = (a_bar[j] * cos_theta[j] * cos_theta[j] + 322 | 2 * b_tilde[j] * cos_theta[j] * sin_theta[j] + 323 | alpha[j + 1] * sin_theta[j] * sin_theta[j]); 324 | 325 | a_bar[j + 1] = (a_bar[j] * sin_theta[j] * sin_theta[j] - 326 | 2 * b_tilde[j] * cos_theta[j] * sin_theta[j] + 327 | alpha[j + 1] * cos_theta[j] * cos_theta[j]); 328 | 329 | if (j != 0) 330 | b[j - 1] = sqrt(d[j] * d[j] + b_bar[j] * b_bar[j]); 331 | 332 | b_bar[j + 1] = ((a_bar[j] - alpha[j + 1]) * sin_theta[j] * cos_theta[j] + 333 | b_tilde[j] * (sin_theta[j] * sin_theta[j] - 334 | cos_theta[j] * cos_theta[j])); 335 | 336 | b_tilde[j + 1] = -beta[j + 1] * cos_theta[j]; 337 | 338 | d[j + 1] = beta[j + 1] * sin_theta[j]; 339 | 340 | z[j] = z_bar[j] * cos_theta[j] + weights[j + 1] * sin_theta[j]; 341 | 342 | z_bar[j + 1] = z_bar[j] * sin_theta[j] - weights[j + 1] * cos_theta[j]; 343 | } 344 | 345 | // last entries set equal to final "holding" values 346 | a.back() = a_bar.back(); 347 | b.back() = b_bar.back(); 348 | z.back() = z_bar.back(); 349 | 350 | swap(alpha, a); 351 | swap(beta, b); 352 | swap(weights, z); 353 | } 354 | 355 | static bool 356 | check_positivity(const vector &v) { 357 | const auto non_pos = [](const double x) { return x <= 0.0 || isinf(x); }; 358 | return find_if(cbegin(v), cend(v), non_pos) == cend(v); 359 | } 360 | 361 | bool 362 | MomentSequence::Lower_quadrature_rules(const size_t n_points, const double tol, 363 | const size_t max_iter, 364 | vector &points, 365 | vector &weights) { 366 | // make sure that points.size() will be less than n_points 367 | vector a(alpha); 368 | a.resize((n_points < alpha.size()) ? n_points : alpha.size()); 369 | vector b(beta); 370 | b.resize((n_points - 1 < beta.size()) ? 
n_points - 1 : beta.size()); 371 | 372 | check_three_term_relation(a, b); 373 | 374 | // See Gautschi pgs 10-13, 375 | // the nu here is the square of the off-diagonal 376 | // of the Jacobi matrix 377 | for (size_t i = 0; i < b.size(); i++) 378 | b[i] = sqrt(b[i]); 379 | 380 | vector eigenvec(a.size(), 0.0); 381 | eigenvec[0] = 1.0; 382 | vector eigenvals(a); 383 | vector qr_beta(b); 384 | 385 | // in QR, off-diagonals go to zero use off diags for convergence 386 | double error_sum = 0.0; 387 | for (size_t i = 0; i < qr_beta.size(); i++) 388 | error_sum += fabs(qr_beta[i]); 389 | 390 | size_t iter = 0; 391 | while (iter < max_iter && error_sum > tol) { 392 | QRiteration(eigenvals, qr_beta, eigenvec); 393 | 394 | error_sum = 0.0; 395 | for (size_t i = 0; i < qr_beta.size(); i++) 396 | error_sum += fabs(qr_beta[i]); 397 | iter++; 398 | } 399 | 400 | // eigenvalues are on diagonal of J 401 | const bool points_are_positive = check_positivity(eigenvals); 402 | if (points_are_positive) { 403 | swap(points, eigenvals); 404 | swap(weights, eigenvec); 405 | } 406 | 407 | // square entries in the weights vector 408 | transform(cbegin(weights), cend(weights), begin(weights), 409 | [](const double x) { return x * x; }); 410 | 411 | return points_are_positive; 412 | } 413 | -------------------------------------------------------------------------------- /src/moment_sequence.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Andrew D. Smith and Timothy Daley 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | */ 19 | 20 | #ifndef SRC_MOMENT_SEQUENCE_HPP_ 21 | #define SRC_MOMENT_SEQUENCE_HPP_ 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | // test Hankel moment matrix to ensure the moment sequence 28 | // is positive definite 29 | std::size_t 30 | ensure_pos_def_mom_seq(std::vector &moments, const double tolerance, 31 | const bool VERBOSE); 32 | 33 | struct MomentSequence { 34 | MomentSequence() {} 35 | explicit MomentSequence(const std::vector &obs_moms); 36 | 37 | MomentSequence(const std::vector &a, const std::vector &b) : 38 | alpha(a), beta(b) {} 39 | 40 | // Estimate 3-term recurrence 41 | // these will be removed from the header when they are tested 42 | void unmodified_Chebyshev(); 43 | 44 | void full_3term_recurrence(std::vector &full_alpha, 45 | std::vector &full_beta); 46 | 47 | // quadrature rules using QR on Jacobi matrix 48 | bool Lower_quadrature_rules(const std::size_t n_points, 49 | const double tolerance, 50 | const std::size_t max_iter, 51 | std::vector &points, 52 | std::vector &weights); 53 | 54 | std::vector moments; 55 | // 3-term recurrence 56 | std::vector alpha; 57 | std::vector beta; 58 | }; 59 | 60 | #endif // SRC_MOMENT_SEQUENCE_HPP_ 61 | -------------------------------------------------------------------------------- /src/pop_size.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 
19 | */ 20 | 21 | #include "pop_size.hpp" 22 | 23 | #include "common.hpp" 24 | #include "load_data_for_complexity.hpp" 25 | 26 | #include 27 | 28 | #include 29 | #include // std::size_t 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | using std::cbegin; 39 | using std::cend; 40 | using std::cerr; 41 | using std::count_if; 42 | using std::endl; 43 | using std::min; 44 | using std::runtime_error; 45 | using std::size_t; 46 | using std::string; 47 | using std::to_string; 48 | using std::uint32_t; 49 | using std::vector; 50 | 51 | int 52 | pop_size_main(const int argc, const char *argv[]) { 53 | try { 54 | static const size_t min_required_counts = 4; 55 | static const string min_required_counts_error_message = 56 | "max count before zero is less than min required count (" + 57 | to_string(min_required_counts) + ") duplicates removed"; 58 | 59 | string outfile; 60 | string histogram_outfile; 61 | 62 | size_t orig_max_terms = 100; 63 | double max_extrap = 0.0; 64 | double step_size = 0.0; 65 | size_t n_desired_steps = 50; 66 | size_t n_bootstraps = 100; 67 | int diagonal = 0; 68 | double c_level = 0.95; 69 | uint32_t seed = 408; 70 | 71 | /* FLAGS */ 72 | bool verbose = false; 73 | bool VALS_INPUT = false; 74 | bool PAIRED_END = false; 75 | bool HIST_INPUT = false; 76 | bool SINGLE_ESTIMATE = false; 77 | bool allow_defects = false; 78 | 79 | #ifdef HAVE_HTSLIB 80 | bool BAM_FORMAT_INPUT = false; 81 | size_t MAX_SEGMENT_LENGTH = 5000; 82 | uint32_t n_threads{1}; 83 | #endif 84 | 85 | const string description = R"( 86 | Estimate the total population size using the approach described in 87 | Daley & Smith (2013), extrapolating to very long range. Default 88 | parameters assume that the initial sample represents at least 1e-9 of 89 | the population, which is sufficient for every example application we 90 | have seen. 
91 | )"; 92 | string program_name = std::filesystem::path(argv[0]).filename(); 93 | program_name += " " + string(argv[1]); 94 | 95 | /********** GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/ 96 | 97 | OptionParser opt_parse(program_name, description, ""); 98 | opt_parse.add_opt("output", 'o', "yield output file (default: stdout)", 99 | false, outfile); 100 | opt_parse.add_opt("extrap", 'e', "maximum extrapolation", false, 101 | max_extrap); 102 | opt_parse.add_opt("steps", 's', "number of steps", false, n_desired_steps); 103 | opt_parse.add_opt("boots", 'n', "number of bootstraps", false, 104 | n_bootstraps); 105 | opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, 106 | c_level); 107 | opt_parse.add_opt("terms", 'x', "maximum terms in estimator", false, 108 | orig_max_terms); 109 | opt_parse.add_opt("verbose", 'v', "print more info", false, verbose); 110 | #ifdef HAVE_HTSLIB 111 | opt_parse.add_opt("bam", 'B', "input is in BAM format", false, 112 | BAM_FORMAT_INPUT); 113 | opt_parse.add_opt("seg_len", 'l', 114 | "maximum segment length when merging " 115 | "paired end bam reads", 116 | false, MAX_SEGMENT_LENGTH); 117 | opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", 118 | false, n_threads); 119 | #endif 120 | opt_parse.add_opt("pe", 'P', "input is paired end read file", false, 121 | PAIRED_END); 122 | opt_parse.add_opt( 123 | "vals", 'V', "input is a text file containing only the observed counts", 124 | false, VALS_INPUT); 125 | opt_parse.add_opt("hist", 'H', 126 | "input is a text file containing the observed histogram", 127 | false, HIST_INPUT); 128 | opt_parse.add_opt("hist-out", '\0', 129 | "output histogram to this file (for non-hist input)", 130 | false, histogram_outfile); 131 | opt_parse.add_opt("quick", 'Q', 132 | "quick mode (no bootstraps) for confidence intervals", 133 | false, SINGLE_ESTIMATE); 134 | opt_parse.add_opt("defects", 'D', "no testing for defects", false, 135 | allow_defects); 136 | opt_parse.add_opt("seed", 'r', "seed for random number generator", false, 137 | seed); 138 | opt_parse.set_show_defaults(); 139 | vector leftover_args; 140 | // ADS: suspect bug below; "-about" isn't working. 
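// Note on the parse call below: it is given argc - 1 and argv + 1 so
// that option parsing starts from the subcommand's own arguments
// rather than the leading program path. A typical invocation
// (hypothetical file names) would be
//   preseq pop_size -o pop_size_output.txt -H input.hist
// or, when built with HTSlib, reading directly from a BAM file:
//   preseq pop_size -B -P -o pop_size_output.txt input.bam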
141 | opt_parse.parse(argc - 1, argv + 1, leftover_args); 142 | if (argc == 2 || opt_parse.help_requested()) { 143 | cerr << opt_parse.help_message() << endl; 144 | cerr << opt_parse.about_message() << endl; 145 | return EXIT_SUCCESS; 146 | } 147 | if (opt_parse.option_missing()) { 148 | cerr << opt_parse.option_missing_message() << endl; 149 | return EXIT_SUCCESS; 150 | } 151 | if (leftover_args.empty()) { 152 | cerr << opt_parse.help_message() << endl; 153 | return EXIT_SUCCESS; 154 | } 155 | const string input_file_name = leftover_args.front(); 156 | /******************************************************************/ 157 | 158 | vector counts_hist; 159 | size_t n_reads = 0; 160 | 161 | /************ loading input ***************************************/ 162 | if (HIST_INPUT) { 163 | if (verbose) 164 | cerr << "HIST_INPUT" << endl; 165 | n_reads = load_histogram(input_file_name, counts_hist); 166 | } 167 | else if (VALS_INPUT) { 168 | if (verbose) 169 | cerr << "VALS_INPUT" << endl; 170 | n_reads = load_counts(input_file_name, counts_hist); 171 | } 172 | #ifdef HAVE_HTSLIB 173 | else if (BAM_FORMAT_INPUT && PAIRED_END) { 174 | if (verbose) 175 | cerr << "PAIRED_END_BAM_INPUT" << endl; 176 | n_reads = load_counts_BAM_pe(n_threads, input_file_name, counts_hist); 177 | } 178 | else if (BAM_FORMAT_INPUT) { 179 | if (verbose) 180 | cerr << "BAM_INPUT" << endl; 181 | n_reads = load_counts_BAM_se(n_threads, input_file_name, counts_hist); 182 | } 183 | #endif 184 | else if (PAIRED_END) { 185 | if (verbose) 186 | cerr << "PAIRED_END_BED_INPUT" << endl; 187 | n_reads = load_counts_BED_pe(input_file_name, counts_hist); 188 | } 189 | else { // default is single end bed file 190 | if (verbose) 191 | cerr << "BED_INPUT" << endl; 192 | n_reads = load_counts_BED_se(input_file_name, counts_hist); 193 | } 194 | /************ done loading input **********************************/ 195 | 196 | const size_t max_observed_count = counts_hist.size() - 1; 197 | const double distinct_reads = 198 | accumulate(cbegin(counts_hist), cend(counts_hist), 0.0); 199 | 200 | // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE 201 | size_t first_zero = 1; 202 | while (first_zero < counts_hist.size() && counts_hist[first_zero] > 0) 203 | ++first_zero; 204 | 205 | orig_max_terms = min(orig_max_terms, first_zero - 1); 206 | orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1); 207 | 208 | if (max_extrap < 1.0) 209 | max_extrap = 1000000000 * distinct_reads; 210 | if (step_size < 1.0) 211 | step_size = (max_extrap - distinct_reads) / n_desired_steps; 212 | 213 | const size_t distinct_counts = 214 | std::count_if(begin(counts_hist), end(counts_hist), 215 | [](const double x) { return x > 0.0; }); 216 | 217 | if (verbose) 218 | cerr << "TOTAL READS = " << n_reads << endl 219 | << "DISTINCT READS = " << distinct_reads << endl 220 | << "DISTINCT COUNTS = " << distinct_counts << endl 221 | << "MAX COUNT = " << max_observed_count << endl 222 | << "COUNTS OF 1 = " << counts_hist[1] << endl 223 | << "MAX TERMS = " << orig_max_terms << endl; 224 | 225 | if (!histogram_outfile.empty()) 226 | report_histogram(histogram_outfile, counts_hist); 227 | 228 | // check to make sure library is not overly saturated 229 | const double two_fold_extrap = GoodToulmin2xExtrap(counts_hist); 230 | if (two_fold_extrap < 0.0) 231 | throw runtime_error("Saturation expected at double initial sample size." 
232 | " Unable to extrapolate"); 233 | 234 | // const size_t total_reads = get_counts_from_hist(counts_hist); 235 | 236 | // assert(total_reads == n_reads); // ADS: why commented out? 237 | 238 | // check that min required count is satisfied 239 | if (orig_max_terms < min_required_counts) 240 | throw runtime_error(min_required_counts_error_message); 241 | 242 | if (verbose) 243 | cerr << "[ESTIMATING YIELD CURVE]" << endl; 244 | 245 | vector yield_estimates; 246 | 247 | if (SINGLE_ESTIMATE) { 248 | const bool single_estimate_success = extrap_single_estimate( 249 | verbose, allow_defects, counts_hist, orig_max_terms, diagonal, 250 | step_size, max_extrap, yield_estimates); 251 | // IF FAILURE, EXIT 252 | if (!single_estimate_success) 253 | throw runtime_error("single estimate failed, run " 254 | "full mode for estimates"); 255 | 256 | std::ofstream of; 257 | if (!outfile.empty()) 258 | of.open(outfile.c_str()); 259 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); 260 | 261 | out << "TOTAL_READS\tEXPECTED_DISTINCT" << endl; 262 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 263 | out.precision(1); 264 | 265 | out << 0 << '\t' << 0 << endl; 266 | for (size_t i = 0; i < yield_estimates.size(); ++i) 267 | out << (i + 1) * step_size << '\t' << yield_estimates[i] << endl; 268 | } 269 | else { 270 | if (verbose) 271 | cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl; 272 | 273 | const size_t max_iter = 100 * n_bootstraps; 274 | 275 | vector> bootstrap_estimates; 276 | extrap_bootstrap(verbose, allow_defects, seed, counts_hist, n_bootstraps, 277 | orig_max_terms, diagonal, step_size, max_extrap, 278 | max_iter, bootstrap_estimates); 279 | 280 | if (verbose) 281 | cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl; 282 | // yield ci 283 | vector yield_upper_ci_lognorm, yield_lower_ci_lognorm; 284 | 285 | vector_median_and_ci(bootstrap_estimates, c_level, yield_estimates, 286 | yield_lower_ci_lognorm, yield_upper_ci_lognorm); 287 | if (verbose) 288 | cerr << "[WRITING OUTPUT]" << endl; 289 | 290 | std::ofstream of; 291 | if (!outfile.empty()) 292 | of.open(outfile); 293 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); 294 | 295 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 296 | out.precision(1); 297 | 298 | const size_t n_ests = yield_estimates.size() - 1; 299 | if (n_ests < 2) 300 | throw runtime_error("problem with number of estimates in pop_size"); 301 | 302 | const bool converged = 303 | (yield_estimates[n_ests] - yield_estimates[n_ests - 1] < 1.0); 304 | 305 | out << "pop_size_estimate" << '\t' << "lower_ci" << '\t' << "upper_ci" 306 | << endl; 307 | out << yield_estimates.back() << '\t' << yield_lower_ci_lognorm.back() 308 | << '\t' << yield_upper_ci_lognorm.back(); 309 | if (!converged) 310 | out << "\tnot_converged"; 311 | out << endl; 312 | } 313 | } 314 | catch (const std::exception &e) { 315 | cerr << e.what() << endl; 316 | return EXIT_FAILURE; 317 | } 318 | return EXIT_SUCCESS; 319 | } 320 | -------------------------------------------------------------------------------- /src/pop_size.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. 
Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #ifndef SRC_POP_SIZE_HPP_ 22 | #define SRC_POP_SIZE_HPP_ 23 | 24 | int 25 | pop_size_main(const int argc, const char *argv[]); 26 | 27 | #endif // SRC_POP_SIZE_HPP_ 28 | -------------------------------------------------------------------------------- /src/preseq.cpp: -------------------------------------------------------------------------------- 1 | /* preseq: to predict properties of genomic sequencing libraries 2 | * 3 | * Copyright (C) 2013-2024 University of Southern California and 4 | * Andrew D. Smith and Timothy Daley 5 | * 6 | * Authors: Timothy Daley, Chao Deng, Victoria Helus, and Andrew Smith 7 | * 8 | * This program is free software: you can redistribute it and/or 9 | * modify it under the terms of the GNU General Public License as 10 | * published by the Free Software Foundation, either version 3 of the 11 | * License, or (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, but 14 | * WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 | * General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see 20 | * . 
21 | */ 22 | 23 | #include 24 | 25 | #include "common.hpp" 26 | 27 | // the preseq commands 28 | #include "bound_pop.hpp" 29 | #include "c_curve.hpp" 30 | #include "gc_extrap.hpp" 31 | #include "lc_extrap.hpp" 32 | #include "pop_size.hpp" 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | static std::string 40 | usage_message() { 41 | std::ostringstream oss; 42 | oss << "preseq: a program for analyzing library complexity\n" 43 | "Version: "; 44 | oss << VERSION; 45 | oss << "\n\n" 46 | "Usage: preseq [OPTIONS]\n\n" 47 | ": c_curve generate complexity curve for a library\n" 48 | " lc_extrap predict the yield for future experiments\n" 49 | " gc_extrap predict genome coverage low input\n" 50 | " sequencing experiments\n" 51 | " bound_pop lower bound on population size\n" 52 | " pop_size estimate number of unique species\n"; 53 | return oss.str(); 54 | } 55 | 56 | int 57 | main(const int argc, const char *argv[]) { 58 | if (argc < 2) { 59 | std::cerr << usage_message() << std::endl; 60 | return EXIT_SUCCESS; 61 | } 62 | 63 | static const std::string cmd = argv[1]; 64 | 65 | if (cmd == "lc_extrap") 66 | return lc_extrap_main(argc, argv); 67 | 68 | if (cmd == "c_curve") 69 | return c_curve_main(argc, argv); 70 | 71 | if (cmd == "gc_extrap") 72 | return gc_extrap_main(argc, argv); 73 | 74 | if (cmd == "bound_pop") 75 | return bound_pop_main(argc, argv); 76 | 77 | if (cmd == "pop_size") 78 | return pop_size_main(argc, argv); 79 | 80 | std::cerr << "Error: unrecognized command: " << argv[1] << std::endl 81 | << usage_message() << std::endl; 82 | 83 | return EXIT_FAILURE; 84 | } 85 | -------------------------------------------------------------------------------- /tests/data/c_curve_input.hist: -------------------------------------------------------------------------------- 1 | 1 982419 2 | 2 6060 3 | 3 214 4 | 4 63 5 | 5 32 6 | 6 21 7 | 7 14 8 | 8 9 9 | 9 6 10 | 10 3 11 | 11 6 12 | 12 2 13 | 13 2 14 | 14 2 15 | 15 3 16 | 16 2 17 | 24 2 18 | 31 1 19 | -------------------------------------------------------------------------------- /tests/md5sum.txt: -------------------------------------------------------------------------------- 1 | 91ef0368a7da1a55e3acad083485df8b tests/c_curve_output.txt 2 | ba02e52a5f3bc7646998e7ade1c7e35e tests/lc_extrap_output.txt 3 | c8895e94346231a5beb4d867df3bb480 tests/gc_extrap_output.txt 4 | -------------------------------------------------------------------------------- /tests/scripts/test_c_curve.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This file is part of preseq 4 | # 5 | # Copyright (C) 2024: Andrew D. Smith 6 | # 7 | # Authors: Andrew D. Smith 8 | # 9 | # This is free software: you can redistribute it and/or modify it 10 | # under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This software is distributed in the hope that it will be useful, but 15 | # WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 | # General Public License for more details. 
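# Each of the three test scripts follows the same pattern: run one
# preseq subcommand on a small input, then compare the output
# checksum against the entry recorded in tests/md5sum.txt. Exit
# status 1 signals a checksum mismatch, and exit status 77 is the
# conventional "skipped" code recognized by Automake-style test
# harnesses when the input data is not present.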
18 | 19 | infile=tests/c_curve_input.hist 20 | outfile=tests/c_curve_output.txt 21 | if [[ -e "${infile}" ]]; then 22 | ./preseq c_curve -o "${outfile}" -s 100000 -H "${infile}" 23 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 24 | if [[ "${x}" != "OK" ]]; then 25 | exit 1; 26 | fi 27 | else 28 | echo "${infile} not found"; 29 | exit 77; 30 | fi 31 | -------------------------------------------------------------------------------- /tests/scripts/test_gc_extrap.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This file is part of preseq 4 | # 5 | # Copyright (C) 2024: Andrew D. Smith 6 | # 7 | # Authors: Andrew D. Smith 8 | # 9 | # This is free software: you can redistribute it and/or modify it 10 | # under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This software is distributed in the hope that it will be useful, but 15 | # WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 | # General Public License for more details. 18 | 19 | infile=tests/gc_extrap_input.mr 20 | outfile=tests/gc_extrap_output.txt 21 | if [[ -e "${infile}" ]]; then 22 | ./preseq gc_extrap -o "${outfile}" "${infile}" 23 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 24 | if [[ "${x}" != "OK" ]]; then 25 | exit 1; 26 | fi 27 | else 28 | echo "${infile} not found"; 29 | exit 77; 30 | fi 31 | -------------------------------------------------------------------------------- /tests/scripts/test_lc_extrap.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This file is part of preseq 4 | # 5 | # Copyright (C) 2024: Andrew D. Smith 6 | # 7 | # Authors: Andrew D. Smith 8 | # 9 | # This is free software: you can redistribute it and/or modify it 10 | # under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This software is distributed in the hope that it will be useful, but 15 | # WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 | # General Public License for more details. 18 | 19 | infile=tests/lc_extrap_input.vals 20 | outfile=tests/lc_extrap_output.txt 21 | if [[ -e "${infile}" ]]; then 22 | ./preseq lc_extrap -o "${outfile}" -V "${infile}" 23 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 24 | if [[ "${x}" != "OK" ]]; then 25 | exit 1; 26 | fi 27 | else 28 | echo "${infile} not found"; 29 | exit 77; 30 | fi 31 | --------------------------------------------------------------------------------
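The test scripts above can also be run by hand from the directory containing the built preseq binary, since they use relative paths for both the program and its inputs. A minimal sketch, assuming the inputs have been staged where the scripts expect them:

  # run one shipped test and interpret its exit status
  bash tests/scripts/test_lc_extrap.test
  case $? in
    0)  echo "test passed" ;;
    77) echo "test skipped (input data not found)" ;;
    *)  echo "test failed" ;;
  esac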