├── .clang-format ├── .cppcheck_suppress ├── .github └── workflows │ ├── clang-format.yml │ ├── cppcheck.yml │ ├── cpplint.yml │ └── ubuntu-builds.yml ├── .gitmodules ├── .readthedocs.yaml ├── CPPLINT.cfg ├── LICENSE ├── Makefile ├── Makefile.am ├── README.md ├── autogen.sh ├── configure.ac ├── data ├── SRR1003759_5M_subset.mr ├── SRR1106616_5M_subset.bam ├── SRR1301329_1M_hist.txt ├── Shakespeare_hist.txt ├── Shakespeare_vals.txt └── additional_data.txt ├── docs ├── FullExperiment_copy.pdf ├── InitialExperimentComplexityCurves_copy.pdf ├── RELEASE_NOTES.txt ├── TCR_richness_vs_age_lm.pdf ├── biblio.bib ├── compare_RNA_Capture_junction_complexity.pdf ├── comparing_scWGA_coverage.pdf ├── manual.pdf └── manual.tex ├── documentation ├── README.md ├── docs │ ├── Makefile │ ├── index.md │ ├── quickstart.md │ └── requirements.txt └── mkdocs.yml ├── m4 ├── ax_cxx_check_lib.m4 ├── ax_cxx_compile_stdcxx.m4 └── ax_cxx_compile_stdcxx_17.m4 ├── src ├── Makefile ├── bam_record_utils.cpp ├── bam_record_utils.hpp ├── bound_pop.cpp ├── bound_pop.hpp ├── c_curve.cpp ├── c_curve.hpp ├── common.cpp ├── common.hpp ├── continued_fraction.cpp ├── continued_fraction.hpp ├── dnmt_error.hpp ├── gc_extrap.cpp ├── gc_extrap.hpp ├── lc_extrap.cpp ├── lc_extrap.hpp ├── load_data_for_complexity.cpp ├── load_data_for_complexity.hpp ├── moment_sequence.cpp ├── moment_sequence.hpp ├── pop_size.cpp ├── pop_size.hpp └── preseq.cpp └── tests ├── data ├── c_curve_input.hist ├── gc_extrap_input.mr └── lc_extrap_input.vals ├── md5sum.txt └── scripts ├── test_c_curve.test ├── test_gc_extrap.test └── test_lc_extrap.test /.clang-format: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | BasedOnStyle: LLVM 18 | ColumnLimit: 80 19 | IndentWidth: 2 20 | AlwaysBreakAfterReturnType: TopLevel 21 | ContinuationIndentWidth: 2 22 | ConstructorInitializerIndentWidth: 2 23 | BraceWrapping: 24 | BeforeElse: true 25 | BeforeCatch: true 26 | BreakBeforeBraces: Custom 27 | BreakConstructorInitializers: AfterColon 28 | SpacesBeforeTrailingComments: 2 29 | -------------------------------------------------------------------------------- /.cppcheck_suppress: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 15 | # General Public License for more details. 16 | 17 | missingIncludeSystem 18 | constVariablePointer 19 | checkersReport 20 | unusedFunction:src/bam_record_utils.hpp 21 | unusedFunction:src/bam_record_utils.cpp 22 | *:src/smithlab_cpp* 23 | unusedStructMember:src/*.hpp 24 | -------------------------------------------------------------------------------- /.github/workflows/clang-format.yml: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | name: Source formatting with clang-format 18 | 19 | on: 20 | push: 21 | branches: [ "master" ] 22 | pull_request: 23 | branches: [ "master" ] 24 | workflow_dispatch: 25 | 26 | jobs: 27 | clang-format: 28 | runs-on: ubuntu-24.04 29 | 30 | steps: 31 | - name: Checkout repository 32 | uses: actions/checkout@v4 33 | 34 | - name: Install dependencies 35 | run: sudo apt-get install -y clang-format 36 | 37 | - name: Run clang-format 38 | run: | 39 | clang-format --dry-run -Werror $(git ls-files '*.*pp') 40 | -------------------------------------------------------------------------------- /.github/workflows/cppcheck.yml: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 
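# Note: the cppcheck invocation in the step below can also be run
# locally to reproduce this check before pushing, assuming cppcheck is
# installed (for example from conda-forge, as in this workflow, or from
# a system package) and is run from the repository root so that
# .cppcheck_suppress and src/smithlab_cpp are found.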
16 | 17 | name: Linting with cppcheck 18 | 19 | on: 20 | push: 21 | branches: [ "master" ] 22 | pull_request: 23 | branches: [ "master" ] 24 | workflow_dispatch: 25 | 26 | jobs: 27 | cppcheck: 28 | runs-on: ubuntu-24.04 29 | strategy: 30 | matrix: 31 | python-version: ["3.12"] 32 | 33 | steps: 34 | - name: Checkout repository 35 | uses: actions/checkout@v4 36 | 37 | - name: Python setup ${{ matrix.python-version }} 38 | uses: actions/setup-python@v5 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | 42 | - name: Install cppcheck 43 | run: | 44 | conda install -c conda-forge cppcheck 45 | 46 | - name: Run cppcheck 47 | run: | 48 | ${CONDA}/bin/cppcheck \ 49 | --std=c++17 \ 50 | --enable=all \ 51 | --check-level=exhaustive \ 52 | --suppressions-list=.cppcheck_suppress \ 53 | -I src/smithlab_cpp \ 54 | $(git ls-files '*.*pp') 55 | -------------------------------------------------------------------------------- /.github/workflows/cpplint.yml: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | name: Linting with cpplint 18 | 19 | on: 20 | push: 21 | branches: [ "master" ] 22 | pull_request: 23 | branches: [ "master" ] 24 | workflow_dispatch: 25 | 26 | jobs: 27 | cpplint: 28 | runs-on: ubuntu-24.04 29 | strategy: 30 | matrix: 31 | python-version: ["3.12"] 32 | 33 | steps: 34 | - name: Checkout repository 35 | uses: actions/checkout@v4 36 | 37 | - name: Python setup ${{ matrix.python-version }} 38 | uses: actions/setup-python@v5 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | 42 | - name: Install cpplint 43 | run: | 44 | python -m pip install --upgrade pip 45 | pip install cpplint 46 | 47 | - name: Run cpplint 48 | run: | 49 | cpplint --quiet --repository=. $(git ls-files '*.*pp') 50 | -------------------------------------------------------------------------------- /.github/workflows/ubuntu-builds.yml: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 
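# Note: the steps below mirror a from-clone build that can be
# reproduced locally, assuming the HTSLib development files are
# available (here via libhts-dev): ./autogen.sh, ./configure, make,
# and make check. See README.md for the release-tarball variant,
# which ships a ready-made configure script.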
16 | 17 | name: Building preseq on Ubuntu 18 | 19 | on: 20 | workflow_dispatch: 21 | 22 | jobs: 23 | build: 24 | runs-on: ubuntu-24.04 25 | steps: 26 | - uses: actions/checkout@v4 27 | with: 28 | submodules: recursive 29 | - name: Install dependencies 30 | run: sudo apt-get install -y libhts-dev 31 | 32 | - name: Generate configure script 33 | run: ./autogen.sh 34 | 35 | - name: Configure for g++ 36 | run: ./configure CXX="g++" 37 | 38 | - name: Build with g++ 39 | run: make -j2 40 | 41 | - name: Test the g++ build 42 | run: make check 43 | 44 | - name: Cleanup after the g++ build 45 | run: make distclean 46 | 47 | - name: Configure for clang++ 48 | run: ./configure CXX="clang++" 49 | 50 | - name: Build with clang++ 51 | run: make -j2 52 | 53 | - name: Test the clang++ build 54 | run: make check 55 | 56 | - name: Cleanup after the clang++ build 57 | run: make distclean 58 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "preseqR"] 2 | path = preseqR 3 | url = ../preseqR.git 4 | [submodule "src/smithlab_cpp"] 5 | path = src/smithlab_cpp 6 | url = ../smithlab_cpp.git 7 | [submodule "src/bamxx"] 8 | path = src/bamxx 9 | url = ../bamxx.git 10 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | build: 3 | os: ubuntu-20.04 4 | tools: 5 | python: "3.9" 6 | 7 | mkdocs: 8 | configuration: documentation/mkdocs.yml 9 | fail_on_warning: false 10 | 11 | python: 12 | install: 13 | - requirements: documentation/docs/requirements.txt 14 | 15 | formats: 16 | - pdf 17 | - epub 18 | -------------------------------------------------------------------------------- /CPPLINT.cfg: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | set noparent 18 | filter=-runtime/references 19 | filter=-build/include_subdir 20 | filter=-build/include_order 21 | filter=-build/c++11 22 | filter=-build/c++17 23 | # Formatting below handled by clang-format 24 | filter=-whitespace/line_length 25 | filter=-whitespace/newline 26 | filter=-readability/braces 27 | filter=-whitespace/semicolon 28 | filter=-whitespace/indent 29 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2011-2020 University of Southern California and 2 | # Andrew D. Smith and Timothy Daley 3 | # 4 | # Authors: Timothy Daley and Andrew D. 
Smith 5 | # 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | 16 | ifndef install_dir 17 | install_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) 18 | endif 19 | 20 | all: 21 | @make -C src 22 | 23 | install: 24 | @make -C src install_dir=$(install_dir) install 25 | 26 | clean: 27 | @make -C src clean 28 | 29 | distclean: clean 30 | @rm -rf $(install_dir)/bin 31 | 32 | .PHONY: all distclean clean install 33 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | # This file is part of preseq 2 | # 3 | # Copyright (C) 2018-2024: Andrew D. Smith 4 | # 5 | # Authors: Andrew D. Smith 6 | # 7 | # This is free software: you can redistribute it and/or modify it 8 | # under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # This software is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | ACLOCAL_AMFLAGS = -I m4 18 | 19 | SUBDIRS := src/smithlab_cpp 20 | install installdirs: SUBDIRS := $(filter-out src/smithlab_cpp, $(SUBDIRS)) 21 | AM_CPPFLAGS = -I $(top_srcdir)/src/smithlab_cpp 22 | if ENABLE_HTS 23 | AM_CPPFLAGS += -I $(top_srcdir)/src/bamxx 24 | endif 25 | 26 | AM_CPPFLAGS += -Wall -Wextra -Wpedantic -Wno-unknown-attributes 27 | if ENABLE_HTS 28 | AM_CPPFLAGS += -DHAVE_HTSLIB 29 | endif 30 | 31 | EXTRA_DIST = \ 32 | README.md \ 33 | LICENSE \ 34 | preseqR \ 35 | data \ 36 | docs \ 37 | tests/md5sum.txt \ 38 | tests/data/lc_extrap_input.vals \ 39 | tests/data/gc_extrap_input.mr \ 40 | tests/data/c_curve_input.hist \ 41 | tests/scripts/test_c_curve.test \ 42 | tests/scripts/test_gc_extrap.test \ 43 | tests/scripts/test_lc_extrap.test 44 | 45 | TESTS = \ 46 | tests/scripts/test_c_curve.test \ 47 | tests/scripts/test_lc_extrap.test \ 48 | tests/scripts/test_gc_extrap.test 49 | 50 | TEST_EXTENSIONS = .test 51 | 52 | LDADD = src/smithlab_cpp/libsmithlab_cpp.a 53 | 54 | bin_PROGRAMS = preseq 55 | 56 | preseq_SOURCES = \ 57 | src/preseq.cpp \ 58 | src/common.hpp \ 59 | src/common.cpp \ 60 | src/c_curve.hpp \ 61 | src/c_curve.cpp \ 62 | src/gc_extrap.hpp \ 63 | src/gc_extrap.cpp \ 64 | src/lc_extrap.hpp \ 65 | src/lc_extrap.cpp \ 66 | src/bound_pop.hpp \ 67 | src/bound_pop.cpp \ 68 | src/pop_size.hpp \ 69 | src/pop_size.cpp \ 70 | src/continued_fraction.hpp \ 71 | src/continued_fraction.cpp \ 72 | src/load_data_for_complexity.hpp \ 73 | src/load_data_for_complexity.cpp \ 74 | src/moment_sequence.hpp \ 75 | src/moment_sequence.cpp 76 | 77 | if ENABLE_HTS 78 | preseq_SOURCES += \ 79 | src/bamxx/bamxx.hpp \ 80 | src/bam_record_utils.hpp \ 81 | src/bam_record_utils.cpp 82 | endif 83 | -------------------------------------------------------------------------------- 
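The `TESTS` and `TEST_EXTENSIONS` settings above wire the scripts in
`tests/scripts` into the Automake test harness, with their input files
shipped via `EXTRA_DIST`. As a sketch of how that harness is exercised
from a fresh clone (the same sequence used in the ubuntu-builds
workflow; `-j2` is just an example parallelism level):
```console
$ ./autogen.sh      # only needed in a clone; releases ship configure
$ ./configure       # optionally --enable-hts for BAM/SAM support
$ make -j2
$ make check        # runs tests/scripts/test_*.test via the harness
```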
/README.md: -------------------------------------------------------------------------------- 1 | [![GitHub all releases](https://img.shields.io/github/downloads/smithlabcode/preseq/total?label=GitHub%20downloads)](https://github.com/smithlabcode/preseq/releases) 2 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/preseq/badges/version.svg)](https://anaconda.org/bioconda/preseq) 3 | [![Install with Conda](https://anaconda.org/bioconda/preseq/badges/platforms.svg)](https://anaconda.org/bioconda/preseq) 4 | [![Install with Conda](https://img.shields.io/conda/dn/bioconda/preseq?label=Conda%20downloads)](https://anaconda.org/bioconda/preseq) 5 | [![Documentation Status](https://readthedocs.org/projects/preseq/badge/?version=latest)](https://preseq.readthedocs.io/en/latest/?badge=latest) 6 | [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) 7 | 8 | This is the README file for the preseq package. The preseq package is 9 | aimed at predicting the yield of distinct reads from a genomic library 10 | from an initial sequencing experiment. The estimates can then be used 11 | to examine the utility of further sequencing, optimize the sequencing 12 | depth, or to screen multiple libraries to avoid low complexity 13 | samples. 14 | 15 | SYSTEM REQUIREMENTS 16 | ======================================================================== 17 | The preseq software will only run on 64-bit UNIX-like operating 18 | systems and was developed on both Linux and Mac. The preseq software 19 | requires a C++ compiler that supports C++17. 20 | 21 | INSTALLATION 22 | ======================================================================== 23 | ### Installing from a release 24 | 25 | 1. Download `preseq-x.tar.gz` from the releases tab of this repository. 26 | 2. Unpack the archive: 27 | ```console 28 | $ tar -zxvf preseq-x.tar.gz 29 | ``` 30 | 3. Move into the preseq directory and create a build directory: 31 | ```console 32 | $ cd preseq-x 33 | $ mkdir build && cd build 34 | ``` 35 | 4. Run the configuration script: 36 | ```console 37 | $ ../configure 38 | ``` 39 | If you do not want to install preseq system-wide, or if you do 40 | not have admin privileges, specify a prefix directory: 41 | ```console 42 | $ ../configure --prefix=/some/reasonable/place 43 | ``` 44 | Finally, if you want to build with HTSlib support (for the `to-mr` 45 | program) then you need to specify the following: 46 | ```console 47 | $ ../configure --enable-hts 48 | ``` 49 | And if you installed HTSlib yourself in some non-standard directory, 50 | you must specify the location like this: 51 | ```console 52 | $ ../configure --enable-hts CPPFLAGS='-I /path/to/htslib/headers' \ 53 | LDFLAGS='-L/path/to/htslib/lib' 54 | ``` 55 | 5. Compile and install the tools: 56 | ```console 57 | $ make 58 | $ make install 59 | ``` 60 | 61 | ### Installing from source 62 | 63 | Developers looking to use the latest commits can compile the cloned 64 | repository using the `Makefile` within the `src` directory. The 65 | process is simple: 66 | ```console 67 | $ cd src/ 68 | $ make 69 | ``` 70 | If the desired input is in `.bam` format, `htslib` is required. Type 71 | ```console 72 | $ make HAVE_HTSLIB=1 all 73 | ``` 74 | The HTSLib library can be obtained here: 75 | http://www.htslib.org/download. 76 | 77 | INPUT FILE FORMATS 78 | ======================================================================== 79 | The input to preseq can be in 3 general formats: 80 | 1.
Mapped read locations in BED or BAM file format. The file should be 81 | sorted by chromosome, start position, end position, and finally 82 | strand if in BED format. If the file is in BAM format, then the 83 | file should be sorted using `bamtools` or `samtools sort`. 84 | 2. The "counts histogram" which will have, for each count 1,2,..., the 85 | number of unique "species" (e.g. reads, or anything else) that 86 | appear with that count. Examples can be found in the data directory 87 | within the preseqR subdirectory. Note these should not have a count 88 | for "0", and they should not have any header above the counts. Just 89 | two columns of numbers, with the first column sorted and unique. 90 | 3. The counts themselves, so just a file with one count on each 91 | line. These will be made into the "counts histogram" inside preseq 92 | right away. 93 | 94 | USAGE EXAMPLES 95 | ======================================================================== 96 | Each program included in this software package will print a list of 97 | options if executed without any command line arguments. Many of the 98 | programs use similar options (for example, output files are specified 99 | with '-o'). 100 | 101 | We have provided a data directory to test each of our programs. 102 | Change to the `data` directory and try some of our commands. 103 | To predict the yield of a future experiment, use `lc_extrap`. 104 | For the most basic usage of `lc_extrap` to compute the expected yield, 105 | use the command on the following data: 106 | ```console 107 | $ preseq lc_extrap -o yield_estimates.txt SRR1003759_5M_subset.mr 108 | ``` 109 | If the input file is in `.bam` format, use the `-B` flag: 110 | ```console 111 | $ preseq lc_extrap -B -o yield_estimates.txt SRR1106616_5M_subset.bam 112 | ``` 113 | For the counts histogram format, use the `-H` flag: 114 | ```console 115 | $ preseq lc_extrap -H -o yield_estimates.txt SRR1301329_1M_hist.txt 116 | ``` 117 | 118 | The yield estimates will appear in yield_estimates.txt, and will be a 119 | column of future experiment sizes in `TOTAL_READS`, a column of the 120 | corresponding expected distinct reads in `EXPECTED_DISTINCT`, followed 121 | by two columns giving the corresponding confidence intervals. 122 | 123 | To investigate the past yield of an experiment, use `c_curve`. 124 | `c_curve` can take in the same file formats as `lc_extrap` by using 125 | the same flags. The estimates will appear in estimates.txt with two 126 | columns. The first column gives the total number of reads in a 127 | theoretically smaller experiment and the second gives the 128 | corresponding number of distinct reads. 129 | 130 | `bound_pop` provides an estimate for the species richness of the 131 | sampled population. The input file formats and corresponding flags are 132 | identical to `c_curve` and `lc_extrap`. The output provides the median 133 | species richness in the first column and the confidence intervals in 134 | the next two columns. (Example commands for `c_curve` and `bound_pop` are shown below.) 135 | 136 | Finally, `gc_extrap` predicts the expected genomic coverage for a 137 | future experiment. It produces the coverage in an output format 138 | identical to `lc_extrap`. `gc_extrap` can only take in files in BED 139 | and mapped reads format (using the `-B` flag for BED): 140 | ```console 141 | $ preseq gc_extrap -B -o coverage_estimates.txt SRR1003759_5M_subset.mr 142 | ``` 143 | 144 | More data is available in the `additional_data.txt` file in the `data` 145 | directory.
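As noted above, `c_curve` and `bound_pop` accept the same input
formats and flags as `lc_extrap`. The commands below are a sketch of
typical usage with the bundled data; the output file names are only
placeholders:
```console
$ preseq c_curve -o estimates.txt SRR1003759_5M_subset.mr
$ preseq bound_pop -H -o species_richness.txt SRR1301329_1M_hist.txt
```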
For an extended write-up on our programs, please read the 146 | manual in the `docs` directory. 147 | 148 | HISTORY 149 | ======================================================================== 150 | Preseq was originally developed by Timothy Daley and Andrew D. Smith 151 | at University of Southern California. 152 | 153 | **v3.2.0:** 154 | Updates to the repo in preparation for putting preseq in conda 155 | 156 | **v3.1.2:** 157 | Two headers were added. 158 | 159 | **v3.1.0:** 160 | A mode `pop_size` has been added that uses the continued fraction 161 | approximation to the Good-Toulmin model and extrapolates as far as 162 | possible. Although `bound_pop` provides a good and reliable 163 | lower-bound, this new mode will give a more accurate estimate of the 164 | population size (e.g. total number of distinct molecules). It's not 165 | perfect yet, and in some cases if the population is more than a 166 | billion times larger than the sample, it will still only give a lower 167 | bound. But it works well on most data sets. 168 | 169 | **v3.0.2:** 170 | GSL has been completely removed, and a data directory has been added 171 | for users to test our programs. 172 | 173 | **v3.0.1:** 174 | We no longer require users to have GSL for all modules except for 175 | `bound_pop`. Users interested in using `bound_pop` can install GSL and 176 | follow the instructions above to configure with GSL. 177 | 178 | **v3.0.0:** 179 | The main change to this version is that if BAM/SAM format will be used 180 | as input, the HTSLib library must be installed on the system when 181 | preseq is built. Installation instructions above have been updated 182 | correspondingly. We also updated to use C++11, so a more recent 183 | compiler is required, but these days C++11 is usually supported. 184 | 185 | **v2.0.3:** 186 | A bug in defect mode was fixed and a rng seed was added to allow for 187 | reproducibility. 188 | 189 | **v2.0.0:** 190 | We have added a new module, `bound_pop`, to estimate a lower bound of 191 | the population sampled from. Interpolation is calculated by 192 | expectation rather than subsampling, dramatically improving the speed. 193 | 194 | **v1.0.2:** 195 | We have switched the dependency on the BamTools API to SAMTools, which 196 | we believe will be more convenient for most users of preseq. Minor 197 | bugs have been fixed, and algorithms have been refined to more 198 | accurately construct counts histograms and extrapolate the complexity 199 | curve. More options have been added to `lc_extrap`. `c_curve` and 200 | `lc_extrap` are now both under a single binary for easier use, and 201 | commands will now be written as `preseq lc_extrap [OPTIONS]` 202 | Furthermore, there are updates to the manual for any minor issues 203 | encountered when compiling the preseq binary. 204 | 205 | We released an R package called 206 | [preseqR](http://cran.r-project.org/web/packages/preseqR/index.html) 207 | along with preseq. This makes most of the preseq functionality 208 | available in the R statistical environment, and includes some new 209 | functionality. The preseqR directory contains all required source code 210 | to build this R package. 211 | 212 | CONTACT INFORMATION 213 | ======================================================================== 214 | Andrew D. 
Smith and Timothy Daley 215 | 216 | http://smithlabresearch.org 217 | 218 | LICENSE 219 | ======================================================================== 220 | ```txt 221 | The preseq software for estimating library complexity 222 | Copyright (C) 2014-2022 Timothy Daley, Andrew D Smith, Chao Deng 223 | University of Southern California 224 | 225 | This program is free software: you can redistribute it and/or modify 226 | it under the terms of the GNU General Public License as published by 227 | the Free Software Foundation, either version 3 of the License, or (at 228 | your option) any later version. 229 | 230 | This program is distributed in the hope that it will be useful, 231 | but WITHOUT ANY WARRANTY; without even the implied warranty of 232 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 233 | GNU General Public License for more details. 234 | 235 | You should have received a copy of the GNU General Public License 236 | along with this program. If not, see <https://www.gnu.org/licenses/>. 237 | ``` 238 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Run 'autoreconf -i' to generate 'configure', 'Makefile.in', etc. 4 | # 5 | # The first time this is run on a new cloned git repo the configure 6 | # script will not be present, only the configure.ac and 7 | # Makefile.am. The rest must be generated by `autoreconf -i`. 8 | # 9 | # If you are working with a distribution (file ending with ".tar.gz" 10 | # or similar) then this script should not be needed, and should not be 11 | # present, as all the files should already exist. You should only run 12 | # this script if you know what you are doing with autoreconf. 13 | # 14 | # This script will only run from inside a clone of the preseq git 15 | # repo (it checks for a .git directory in a directory named preseq). 16 | 17 | runautoreconf() { 18 | autoreconf -i; 19 | } 20 | 21 | if test -d .git && test "$(basename "${PWD}")" = "preseq" 22 | then 23 | runautoreconf 24 | exit 0 25 | else 26 | echo " It seems you are either attempting to run this script " 27 | echo " from the wrong directory, or in a source tree that was " 28 | echo " not obtained by cloning the preseq git repo. " 29 | echo " " 30 | echo " ./autogen.sh generates the configure script. " 31 | echo " " 32 | echo " Only run this if you know what you are doing with " 33 | echo " autoreconf and want a shortcut for running it. If you " 34 | echo " just want to use the software, download a release and " 35 | echo " this script will not be needed. " 36 | exit 1 37 | fi 38 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | dnl This file is part of preseq 2 | dnl 3 | dnl Copyright (C) 2018-2024: Andrew D. Smith 4 | dnl 5 | dnl Authors: Andrew D. Smith 6 | dnl 7 | dnl This is free software: you can redistribute it and/or modify it 8 | dnl under the terms of the GNU General Public License as published by 9 | dnl the Free Software Foundation, either version 3 of the License, or 10 | dnl (at your option) any later version. 11 | dnl 12 | dnl This software is distributed in the hope that it will be useful, 13 | dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | dnl General Public License for more details.
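dnl Note: a typical invocation with HTSLib support (see README.md) is
dnl   ./configure --enable-hts CPPFLAGS='-I /path/to/htslib/headers' \
dnl               LDFLAGS='-L/path/to/htslib/lib'
dnl where the two paths are placeholders, needed only when HTSLib is
dnl installed in a non-standard location.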
16 | 17 | AC_INIT([preseq], [3.2.0], [andrewds@usc.edu], 18 | [preseq], [https://github.com/smithlabcode/preseq]) 19 | dnl the config.h is not currently #included in the source, and only 20 | dnl used to keep command lines short. 21 | AC_CONFIG_HEADERS([config.h]) 22 | AM_INIT_AUTOMAKE([subdir-objects foreign]) 23 | 24 | AC_CONFIG_MACRO_DIR([m4]) 25 | AC_LANG(C++) 26 | AC_PROG_CXX 27 | AX_CXX_COMPILE_STDCXX_17([noext], [mandatory]) 28 | AC_PROG_RANLIB 29 | 30 | dnl recursively configure smithlab_cpp 31 | AC_CONFIG_SUBDIRS([src/smithlab_cpp]) 32 | 33 | dnl check for HTSLib if requested 34 | hts_fail_msg=" 35 | 36 | Failed to locate HTSLib on your system. Please use the LDFLAGS and 37 | CPPFLAGS variables to specify the directories where the HTSLib library 38 | and headers can be found. 39 | " 40 | AC_ARG_ENABLE([hts], 41 | [AS_HELP_STRING([--enable-hts], [Enable HTSLib @<:@yes@:>@])], 42 | [enable_hts=yes], [enable_hts=no]) 43 | AS_IF([test "x$enable_hts" = "xyes"], 44 | [AC_CHECK_LIB([hts], [hts_version], [], 45 | [AC_MSG_FAILURE([$hts_fail_msg])])] 46 | ) 47 | AM_CONDITIONAL([ENABLE_HTS], [test "x$enable_hts" = "xyes"]) 48 | 49 | AC_CONFIG_FILES([Makefile]) 50 | 51 | dnl make the test data files available in the build tree 52 | AC_CONFIG_LINKS([ 53 | tests/md5sum.txt:tests/md5sum.txt 54 | tests/c_curve_input.hist:tests/data/c_curve_input.hist 55 | tests/lc_extrap_input.vals:tests/data/lc_extrap_input.vals 56 | tests/gc_extrap_input.mr:tests/data/gc_extrap_input.mr 57 | ]) 58 | 59 | AC_OUTPUT 60 | -------------------------------------------------------------------------------- /data/SRR1106616_5M_subset.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/preseq/acc94f2957b15f51f917b7b8e0819c41c81c4949/data/SRR1106616_5M_subset.bam -------------------------------------------------------------------------------- /data/SRR1301329_1M_hist.txt: -------------------------------------------------------------------------------- 1 | 1 982419 2 | 2 6060 3 | 3 214 4 | 4 63 5 | 5 32 6 | 6 21 7 | 7 14 8 | 8 9 9 | 9 6 10 | 10 3 11 | 11 6 12 | 12 2 13 | 13 2 14 | 14 2 15 | 15 3 16 | 16 2 17 | 24 2 18 | 31 1 19 | -------------------------------------------------------------------------------- /data/Shakespeare_hist.txt: -------------------------------------------------------------------------------- 1 | 1 14376 2 | 2 4343 3 | 3 2292 4 | 4 1463 5 | 5 1043 6 | 6 837 7 | 7 638 8 | 8 519 9 | 9 430 10 | 10 364 11 | 11 305 12 | 12 259 13 | 13 242 14 | 14 223 15 | 15 187 16 | 16 181 17 | 17 179 18 | 18 130 19 | 19 127 20 | 20 128 21 | 21 104 22 | 22 105 23 | 23 99 24 | 24 112 25 | 25 93 26 | 26 74 27 | 27 83 28 | 28 76 29 | 29 72 30 | 30 63 31 | 31 73 32 | 32 47 33 | 33 56 34 | 34 59 35 | 35 53 36 | 36 45 37 | 37 34 38 | 38 49 39 | 39 45 40 | 40 52 41 | 41 49 42 | 42 41 43 | 43 30 44 | 44 35 45 | 45 37 46 | 46 21 47 | 47 41 48 | 48 30 49 | 49 28 50 | 50 19 51 | 51 25 52 | 52 19 53 | 53 28 54 | 54 27 55 | 55 31 56 | 56 19 57 | 57 19 58 | 58 22 59 | 59 23 60 | 60 14 61 | 61 30 62 | 62 19 63 | 63 21 64 | 64 18 65 | 65 15 66 | 66 10 67 | 67 15 68 | 68 14 69 | 69 11 70 | 70 16 71 | 71 13 72 | 72 12 73 | 73 10 74 | 74 16 75 | 75 18 76 | 76 11 77 | 77 8 78 | 78 15 79 | 79 12 80 | 80 7 81 | 81 13 82 | 82 12 83 | 83 11 84 | 84 8 85 | 85 10 86 | 86 11 87 | 87 7 88 | 88 12 89 | 89 9 90 | 90 8 91 | 91 4 92 | 92 7 93 | 93 6 94 | 94 7 95 | 95 10 96 | 96 10 97 | 97 15 98 | 98 7 99 | 99 7 100 | 100 5 101 | 815 845 102 | 1305 1 103 | 
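The file above (`data/SRR1301329_1M_hist.txt`) is a counts histogram in
the format described in the README: the first column is a duplicate
count and the second is the number of distinct reads observed with that
count. preseq builds this histogram internally when given raw counts
(one count per line), but an equivalent file can be produced with
standard shell tools; this is only a sketch, and `counts.txt` is a
hypothetical input file:
```console
$ sort -n counts.txt | uniq -c | awk '{print $2 "\t" $1}' > counts_hist.txt
```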
-------------------------------------------------------------------------------- /data/additional_data.txt: -------------------------------------------------------------------------------- 1 | If you would like additional data, please use the following links: 2 | 3 | http://smithlabresearch.org/downloads/preseq/library_complex_test_data/SRR1003759.tar.bz2 4 | http://smithlabresearch.org/downloads/preseq/library_complex_test_data/SRR1041830.tar.bz2 5 | http://smithlabresearch.org/downloads/preseq/library_complex_test_data/SRR1106616.tar.bz2 6 | http://smithlabresearch.org/downloads/preseq/library_complex_test_data/SRR975260.tar.bz2 7 | http://smithlabresearch.org/downloads/preseq/library_complex_test_data/SRX314956.tar.bz2 8 | 9 | -------------------------------------------------------------------------------- /docs/FullExperiment_copy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/preseq/acc94f2957b15f51f917b7b8e0819c41c81c4949/docs/FullExperiment_copy.pdf -------------------------------------------------------------------------------- /docs/InitialExperimentComplexityCurves_copy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/preseq/acc94f2957b15f51f917b7b8e0819c41c81c4949/docs/InitialExperimentComplexityCurves_copy.pdf -------------------------------------------------------------------------------- /docs/RELEASE_NOTES.txt: -------------------------------------------------------------------------------- 1 | preseq release 3.0 (July 22, 2020) 2 | ------------------------------------------------------------------ 3 | 4 | Notable changes in preseq software: 5 | 6 | * Added GNU autotools for building preseq 7 | 8 | * Updated to use HTSLib directly, rather than source files from 9 | HTSLib, when preseq is built with support for BAM/SAM input 10 | format. 11 | 12 | preseq beta release 2.0.2 (December 1, 2015) 13 | ------------------------------------------------------------------ 14 | 15 | Notable changes in preseq software: 16 | 17 | * Fix a bug in defect mode to allow for estimation without 18 | checking the curves for defects. In such case, more 19 | bootstraps will help to smooth the curve. 20 | 21 | 22 | preseq beta release 2.0.0 (October 29, 2015) 23 | 24 | ------------------------------------------------------------------ 25 | 26 | Notable changes in preseq software: 27 | 28 | * Include the module bound_pop. This module constructs a 29 | nonparametric moment-based estimator of species richness, 30 | the total number of species or classes in the population. 31 | 32 | 33 | preseq beta release 1.0.3 (December 15, 2014) 34 | 35 | ------------------------------------------------------------------ 36 | 37 | Notable changes in preseq software: 38 | 39 | * Include defect mode to extrapolate without testing for 40 | defects. 41 | 42 | 43 | preseq beta release 1.0.2 (Aug 25, 2014) 44 | 45 | ------------------------------------------------------------------ 46 | 47 | Notable changes in preseq software: 48 | 49 | * Included gc_extrap option to predict genomic coverage for 50 | single cell sequencing experiments. 51 | 52 | * Changed the method of finding optimal continued fraction 53 | to improve performance for high variable (e.g. RNAseq) 54 | libraries. 
55 | 56 | 57 | preseq beta release 0.0.3 (Aug 5, 2013) 58 | 59 | ------------------------------------------------------------------ 60 | 61 | Notable changes in preseq software: 62 | 63 | * Prediction of the complexity curve is done using the observed data 64 | when possible. Previous versions bootstrapped the histogram and 65 | used the median estimate. Bootstrapping is only done to compute 66 | confidence intervals. 67 | 68 | * Addition of quick mode option with flag -Q. The complexity is 69 | predicted with the observed data and bootstrapping is not done, 70 | speeding up the computation time tremendously. 71 | 72 | * Fixed a bug associated with the unistd.h header for GCC versions 73 | 4.7+. 74 | 75 | * Extensively updated the manual with examples and FAQ 76 | 77 | 78 | preseq beta release 1.0.2 (Aug 25, 2014) 79 | 80 | ------------------------------------------------------------------ 81 | 82 | * Functions for input in header file load_data_for_complexity 83 | 84 | * Fix samtools linking problem 85 | 86 | If you have any questions, comments or bugs, please contact us at 87 | tdaley@usc.edu. Thank you for using preseq. 88 | -------------------------------------------------------------------------------- /docs/TCR_richness_vs_age_lm.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/preseq/acc94f2957b15f51f917b7b8e0819c41c81c4949/docs/TCR_richness_vs_age_lm.pdf -------------------------------------------------------------------------------- /docs/biblio.bib: -------------------------------------------------------------------------------- 1 | @article{heck1975explicit, 2 | title={Explicit Calculation of the Rarefaction Diversity Measurement and the Determination of Sufficient Sample Size}, 3 | author={Heck, Jr, Kenneth L and van Belle, Gerald and Simberloff, Daniel}, 4 | journal={Ecology}, 5 | volume={56}, 6 | number={6}, 7 | pages={1459--1461}, 8 | year={1975}, 9 | publisher={JSTOR} 10 | } 11 | 12 | @article{willis2015inference, 13 | title={Inference for changes in biodiversity}, 14 | author={Willis, Amy and Bunge, John and Whitman, Thea}, 15 | journal={arXiv preprint arXiv:1506.05710}, 16 | year={2015} 17 | } 18 | 19 | @article{britanova2014age, 20 | title={Age-related decrease in TCR repertoire diversity measured with deep and normalized sequence profiling}, 21 | author={Britanova, Olga V and Putintseva, Ekaterina V and Shugay, Mikhail and Merzlyak, Ekaterina M and Turchaninova, Maria A and Staroverov, Dmitriy B and Bolotin, Dmitriy A and Lukyanov, Sergey and Bogdanova, Ekaterina A and Mamedov, Ilgar Z and others}, 22 | journal={The Journal of Immunology}, 23 | volume={192}, 24 | number={6}, 25 | pages={2689--2698}, 26 | year={2014}, 27 | publisher={Am Assoc Immnol} 28 | } 29 | 30 | @article{daley2014modeling, 31 | title={Modeling genome coverage in single-cell sequencing}, 32 | author={Daley, Timothy and Smith, Andrew D}, 33 | journal={Bioinformatics}, 34 | pages={btu540}, 35 | year={2014}, 36 | publisher={Oxford Univ Press} 37 | } 38 | 39 | 40 | 41 | @article{fu2015uniform, 42 | title={Uniform and accurate single-cell sequencing based on emulsion whole-genome amplification}, 43 | author={Fu, Yusi and Li, Chunmei and Lu, Sijia and Zhou, Wenxiong and Tang, Fuchou and Xie, X Sunney and Huang, Yanyi}, 44 | journal={Proceedings of the National Academy of Sciences}, 45 | volume={112}, 46 | number={38}, 47 | pages={11923--11928}, 48 | year={2015}, 49 | publisher={National Acad Sciences} 50 | } 51 | 52 | 
@article{chao1987estimating, 53 | title={Estimating the population size for capture-recapture data with unequal catchability}, 54 | author={Chao, Anne}, 55 | journal={Biometrics}, 56 | pages={783--791}, 57 | year={1987}, 58 | publisher={JSTOR} 59 | } 60 | 61 | @article{zelterman1988robust, 62 | title={Robust estimation in truncated discrete distributions with application to capture-recapture experiments}, 63 | author={Zelterman, Daniel}, 64 | journal={Journal of statistical planning and inference}, 65 | volume={18}, 66 | number={2}, 67 | pages={225--237}, 68 | year={1988}, 69 | publisher={Elsevier} 70 | } 71 | 72 | @article{good1956number, 73 | author = "Good, I. J. and Toulmin, G. H.", 74 | title = {The number of new species, and the increase in population coverage, when a sample is increased}, 75 | journal = {Biometrika}, 76 | volume = {43}, 77 | year = {1956}, 78 | pages = {45--63} 79 | } 80 | 81 | @article{kivioja2011counting, 82 | title={Counting absolute numbers of molecules using unique molecular identifiers}, 83 | author={Kivioja, T. and V{\"a}h{\"a}rautio, A. and Karlsson, K. and Bonke, M. and Enge, M. and Linnarsson, S. and Taipale, J.}, 84 | journal={Nature Methods}, 85 | year={2012}, 86 | volume={9}, 87 | pages={72--74}, 88 | publisher={Nature Publishing Group} 89 | } 90 | 91 | @article{lu2012probing, 92 | title={Probing meiotic recombination and aneuploidy of single sperm cells by whole-genome sequencing}, 93 | author={Lu, Sijia and Zong, Chenghang and Fan, Wei and Yang, Mingyu and Li, Jinsen and Chapman, Alec R and Zhu, Ping and Hu, Xuesong and Xu, Liya and Yan, Liying and others}, 94 | journal={Science}, 95 | volume={338}, 96 | number={6114}, 97 | pages={1627--1630}, 98 | year={2012}, 99 | publisher={American Association for the Advancement of Science} 100 | } 101 | 102 | @article{mercer2011targeted, 103 | title={Targeted {RNA} sequencing reveals the deep complexity of the human transcriptome}, 104 | author={Mercer, T.R. and Gerhardt, D.J. and Dinger, M.E. and Crawford, J. and Trapnell, C. and Jeddeloh, J.A. and Mattick, J.S. 
and Rinn, J.L.}, 105 | journal={Nature Biotechnology}, 106 | volume={30}, 107 | number={1}, 108 | pages={99--104}, 109 | year={2011}, 110 | publisher={Nature Publishing Group} 111 | } 112 | 113 | @article{van2010most, 114 | title={Most dark matter transcripts are associated with known genes}, 115 | author={van Bakel, Harm and Nislow, Corey and Blencowe, Benjamin J and Hughes, Timothy R}, 116 | journal={PLoS biology}, 117 | volume={8}, 118 | number={5}, 119 | pages={e1000371}, 120 | year={2010}, 121 | publisher={Public Library of Science} 122 | } 123 | 124 | @article{clark2011reality, 125 | title={The reality of pervasive transcription}, 126 | author={Clark, Michael B and Amaral, Paulo P and Schlesinger, Felix J and Dinger, Marcel E and Taft, Ryan J and Rinn, John L and Ponting, Chris P and Stadler, Peter F and Morris, Kevin V and Morillon, Antonin and others}, 127 | journal={PLoS biology}, 128 | volume={9}, 129 | number={7}, 130 | pages={e1000625}, 131 | year={2011}, 132 | publisher={Public Library of Science} 133 | } -------------------------------------------------------------------------------- /docs/compare_RNA_Capture_junction_complexity.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/preseq/acc94f2957b15f51f917b7b8e0819c41c81c4949/docs/compare_RNA_Capture_junction_complexity.pdf -------------------------------------------------------------------------------- /docs/comparing_scWGA_coverage.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/preseq/acc94f2957b15f51f917b7b8e0819c41c81c4949/docs/comparing_scWGA_coverage.pdf -------------------------------------------------------------------------------- /docs/manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smithlabcode/preseq/acc94f2957b15f51f917b7b8e0819c41c81c4949/docs/manual.pdf -------------------------------------------------------------------------------- /documentation/README.md: -------------------------------------------------------------------------------- 1 | # preseq documentation 2 | 3 | This is the (new) documentation for preseq that uses 4 | [mkdocs](https://mkdocs.readthedocs.io) to generate readthedocs pages. 5 | The public web version of this documentation is available at 6 | [preseq.readthedocs.io](https://preseq.readthedocs.io), but if you 7 | wish to view the documentation offline in a web browser, you can 8 | build the documentation locally as described below. 9 | 10 | ### Dependencies 11 | 12 | To build the documentation locally, install mkdocs 13 | ```console 14 | pip install -U mkdocs 15 | ``` 16 | 17 | ### Local compilation 18 | 19 | Build the HTML documentation by running 20 | ```console 21 | mkdocs build 22 | ``` 23 | which will create a `site` directory where markdown files are 24 | converted to HTML. 25 | 26 | Serve the HTML documentation locally by running 27 | ```console 28 | mkdocs serve 29 | ``` 30 | This will serve the documentation, usually at http://localhost:8000 . 31 | -------------------------------------------------------------------------------- /documentation/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two.
6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /documentation/docs/quickstart.md: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | ## Installation via conda 5 | 6 | If you know how to use conda then preseq is available among the 7 | bioconda recipes. You can install it as follows if you have a conda 8 | environment activated: 9 | 10 | ```console 11 | $ conda install -c bioconda preseq 12 | ``` 13 | 14 | The instructions for installing conda are 15 | [here](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html). 16 | -------------------------------------------------------------------------------- /documentation/docs/requirements.txt: -------------------------------------------------------------------------------- 1 | jinja2>=3.1.4 2 | mkdocs>=1.3.1 3 | babel>=2.9.0 4 | click>=7.0 5 | Jinja2>=3.1.4 6 | Markdown>=3.2.1,<3.4 7 | PyYAML>=5.2 8 | watchdog>=2.0.0 9 | mdx_gh_links>=0.2 10 | ghp-import>=1.0 11 | pyyaml_env_tag>=0.1 12 | mkdocs-redirects>=1.0.1 13 | importlib_metadata>=4.3 14 | packaging>=20.5 15 | mergedeep>=1.3.4 16 | pygments>=2.12 17 | pymdown-extensions 18 | mkdocs-material 19 | -------------------------------------------------------------------------------- /documentation/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: preseq 2 | strict: true 3 | 4 | theme: readthedocs 5 | nav: 6 | - Home: 'index.md' 7 | - 'Installation': 'quickstart.md' 8 | - 'preseq on GitHub' : https://github.com/smithlabcode/preseq 9 | -------------------------------------------------------------------------------- /m4/ax_cxx_check_lib.m4: -------------------------------------------------------------------------------- 1 | dnl @synopsis AX_CXX_CHECK_LIB(libname, functioname, action-if, action-if-not) 2 | dnl 3 | dnl The standard AC_CHECK_LIB can not test functions in namespaces. 4 | dnl Therefore AC_CHECK_LIB(cgicc, cgicc::Cgicc::getVersion) will always 5 | dnl fail. We need to decompose the functionname into a series of namespaces 6 | dnl where it gets declared so that it can be used for a link test. 7 | dnl 8 | dnl In the first version I did allow namespace::functionname to be a 9 | dnl reference to a void-argument global functionname (just wrapped in a 10 | dnl namespace) like its C counterparts would be - but in reality such 11 | dnl thing does not exist. The only global / static functions are always 12 | dnl made const-functions which is an attribute mangled along into the 13 | dnl library function export name. 
14 | dnl 15 | dnl The normal usage will ask for a test of a class-member function which 16 | dnl should be presented with a full function spec with arguments given in 17 | dnl parentheses following the function name - if the function to test for 18 | dnl does expect arguments then you should add default initial values in the 19 | dnl prototype (even if they do not exist originally, these are used only 20 | dnl locally to build a correct function call in the configure test script). 21 | dnl 22 | dnl In the current version if you do omit the parenthesis from the macro 23 | dnl argument then the macro will assume that you want to check for the 24 | dnl class name - which is really to check for default constructor being 25 | dnl exported from the given library name. 26 | dnl 27 | dnl EXAMPLE: 28 | dnl AX_CXX_CHECK_LIB(cgicc, [cgicc::HTTPCookie]) 29 | dnl AX_CXX_CHECK_LIB(cgicc, [cgicc::Cgicc::getVersion () const], 30 | dnl AX_CXX_CHECK_LIB(boost_regex, [boost::RegEx::Position (int i = 0) const]) 31 | dnl 32 | dnl Result: 33 | dnl Just as the usual AX_CXX_CHECK_LIB - defines HAVE_LIBCGICC 34 | dnl and adds the libraries to the default library path (and 35 | dnl uses internally the normal ac_check_lib cache symbol 36 | dnl like ac_cv_lib_cgicc_cgicc__Cgicc) 37 | dnl 38 | dnl Footnote: The C++ language is not good at creating stable library 39 | dnl interfaces at the binary level - a lot of functionality is usually being 40 | dnl given as inline functions plus there is hardly a chance to create opaque 41 | dnl types. Therefore most C++ library tests will only do compile tests using 42 | dnl the header files. Doing a check_lib is however good to check the link 43 | dnl dependency before hitting it as an error in the build later. 44 | dnl 45 | dnl @category C++ 46 | dnl @author Guido U. 
Draheim 47 | dnl @vesion 2006-12-18 48 | 49 | AC_DEFUN([AX_CXX_CHECK_LIB], 50 | [m4_ifval([$3], , [AH_CHECK_LIB([$1])])dnl 51 | AS_LITERAL_IF([$1], 52 | [AS_VAR_PUSHDEF([ac_Lib], [ac_cv_lib_$1_$2])], 53 | [AS_VAR_PUSHDEF([ac_Lib], [ac_cv_lib_$1''_$2])])dnl 54 | AC_CACHE_CHECK([for $2 in -l$1], ac_Lib, 55 | [ac_check_lib_save_LIBS=$LIBS 56 | LIBS="-l$1 $5 $LIBS" 57 | case "$2" 58 | in *::*::*\(*) 59 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 60 | namespace `echo "$2" | sed -e "s/::.*//"` 61 | { class `echo "$2" | sed -e "s/.*::\\(.*\\)::.*/\\1/" -e "s/(.*//"` 62 | { public: int `echo "$2" | sed -e "s/.*:://" -e "/(/!s/..*/&()/"`; 63 | }; 64 | } 65 | ],[`echo "$2" | sed -e "s/(.*//" -e "s/\\(.*\\)::\\(.*\\)/((\\1*)(0))->\\2/g"`()])], 66 | [AS_VAR_SET(ac_Lib, yes)], 67 | [AS_VAR_SET(ac_Lib, no)]) 68 | ;; *::*::*) 69 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 70 | namespace `echo "$2" | sed -e "s/::.*//"` 71 | { namespace `echo "$2" | sed -e "s/.*::\\(.*\\)::.*/\\1/"` 72 | { class `echo "$2" | sed -e "s/.*:://"` 73 | { public: `echo "$2" | sed -e "s/.*:://"` (); 74 | }; 75 | } 76 | } 77 | ],[new $2()])], 78 | [AS_VAR_SET(ac_Lib, yes)], 79 | [AS_VAR_SET(ac_Lib, no)]) 80 | ;; *::*\(*) 81 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 82 | class `echo "$2" | sed -e "s/\\(.*\\)::.*/\\1/" -e "s/(.*//"` 83 | { public: int `echo "$2" | sed -e "s/.*:://" -e "/(/!s/..*/&()/"`; 84 | }; 85 | ],[`echo "$2" | sed -e "s/(.*//" -e "s/\\(.*\\)::\\(.*\\)/((\\1*)(0))->\\2/g"`()])], 86 | [AS_VAR_SET(ac_Lib, yes)], 87 | [AS_VAR_SET(ac_Lib, no)]) 88 | ;; *::*) 89 | AC_LINK_IFELSE([AC_LANG_PROGRAM([ 90 | namespace `echo "$2" | sed -e "s/::.*//"` 91 | { class `echo "$2" | sed -e "s/.*:://"` 92 | { public: `echo "$2" | sed -e "s/.*:://"` (); 93 | }; 94 | } 95 | ],[new $2()])], 96 | [AS_VAR_SET(ac_Lib, yes)], 97 | [AS_VAR_SET(ac_Lib, no)]) 98 | ;; *) 99 | AC_LINK_IFELSE([AC_LANG_CALL([], [$2])], 100 | [AS_VAR_SET(ac_Lib, yes)], 101 | [AS_VAR_SET(ac_Lib, no)]) 102 | ;; esac 103 | LIBS=$ac_check_lib_save_LIBS]) 104 | AS_IF([test AS_VAR_GET(ac_Lib) = yes], 105 | [m4_default([$3], [AC_DEFINE_UNQUOTED(AS_TR_CPP(HAVE_LIB$1)) 106 | LIBS="-l$1 $LIBS" 107 | ])], 108 | [$4])dnl 109 | AS_VAR_POPDEF([ac_Lib])dnl 110 | ])# AC_CHECK_LIB 111 | -------------------------------------------------------------------------------- /m4/ax_cxx_compile_stdcxx.m4: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx.html 3 | # =========================================================================== 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_CXX_COMPILE_STDCXX(VERSION, [ext|noext], [mandatory|optional]) 8 | # 9 | # DESCRIPTION 10 | # 11 | # Check for baseline language coverage in the compiler for the specified 12 | # version of the C++ standard. If necessary, add switches to CXX and 13 | # CXXCPP to enable support. VERSION may be '11' (for the C++11 standard) 14 | # or '14' (for the C++14 standard). 15 | # 16 | # The second argument, if specified, indicates whether you insist on an 17 | # extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g. 18 | # -std=c++11). If neither is specified, you get whatever works, with 19 | # preference for an extended mode. 
20 | # 21 | # The third argument, if specified 'mandatory' or if left unspecified, 22 | # indicates that baseline support for the specified C++ standard is 23 | # required and that the macro should error out if no mode with that 24 | # support is found. If specified 'optional', then configuration proceeds 25 | # regardless, after defining HAVE_CXX${VERSION} if and only if a 26 | # supporting mode is found. 27 | # 28 | # LICENSE 29 | # 30 | # Copyright (c) 2008 Benjamin Kosnik 31 | # Copyright (c) 2012 Zack Weinberg 32 | # Copyright (c) 2013 Roy Stogner 33 | # Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov 34 | # Copyright (c) 2015 Paul Norman 35 | # Copyright (c) 2015 Moritz Klammler 36 | # Copyright (c) 2016, 2018 Krzesimir Nowak 37 | # Copyright (c) 2019 Enji Cooper 38 | # 39 | # Copying and distribution of this file, with or without modification, are 40 | # permitted in any medium without royalty provided the copyright notice 41 | # and this notice are preserved. This file is offered as-is, without any 42 | # warranty. 43 | 44 | #serial 11 45 | 46 | dnl This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro 47 | dnl (serial version number 13). 48 | 49 | AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl 50 | m4_if([$1], [11], [ax_cxx_compile_alternatives="11 0x"], 51 | [$1], [14], [ax_cxx_compile_alternatives="14 1y"], 52 | [$1], [17], [ax_cxx_compile_alternatives="17 1z"], 53 | [m4_fatal([invalid first argument `$1' to AX_CXX_COMPILE_STDCXX])])dnl 54 | m4_if([$2], [], [], 55 | [$2], [ext], [], 56 | [$2], [noext], [], 57 | [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX])])dnl 58 | m4_if([$3], [], [ax_cxx_compile_cxx$1_required=true], 59 | [$3], [mandatory], [ax_cxx_compile_cxx$1_required=true], 60 | [$3], [optional], [ax_cxx_compile_cxx$1_required=false], 61 | [m4_fatal([invalid third argument `$3' to AX_CXX_COMPILE_STDCXX])]) 62 | AC_LANG_PUSH([C++])dnl 63 | ac_success=no 64 | 65 | m4_if([$2], [noext], [], [dnl 66 | if test x$ac_success = xno; then 67 | for alternative in ${ax_cxx_compile_alternatives}; do 68 | switch="-std=gnu++${alternative}" 69 | cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch]) 70 | AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch, 71 | $cachevar, 72 | [ac_save_CXX="$CXX" 73 | CXX="$CXX $switch" 74 | AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])], 75 | [eval $cachevar=yes], 76 | [eval $cachevar=no]) 77 | CXX="$ac_save_CXX"]) 78 | if eval test x\$$cachevar = xyes; then 79 | CXX="$CXX $switch" 80 | if test -n "$CXXCPP" ; then 81 | CXXCPP="$CXXCPP $switch" 82 | fi 83 | ac_success=yes 84 | break 85 | fi 86 | done 87 | fi]) 88 | 89 | m4_if([$2], [ext], [], [dnl 90 | if test x$ac_success = xno; then 91 | dnl HP's aCC needs +std=c++11 according to: 92 | dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf 93 | dnl Cray's crayCC needs "-h std=c++11" 94 | for alternative in ${ax_cxx_compile_alternatives}; do 95 | for switch in -std=c++${alternative} +std=c++${alternative} "-h std=c++${alternative}"; do 96 | cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch]) 97 | AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch, 98 | $cachevar, 99 | [ac_save_CXX="$CXX" 100 | CXX="$CXX $switch" 101 | AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])], 102 | [eval $cachevar=yes], 103 | [eval $cachevar=no]) 104 | CXX="$ac_save_CXX"]) 105 | if eval test x\$$cachevar = xyes; then 106 | CXX="$CXX $switch" 107 | if test -n 
"$CXXCPP" ; then 108 | CXXCPP="$CXXCPP $switch" 109 | fi 110 | ac_success=yes 111 | break 112 | fi 113 | done 114 | if test x$ac_success = xyes; then 115 | break 116 | fi 117 | done 118 | fi]) 119 | AC_LANG_POP([C++]) 120 | if test x$ax_cxx_compile_cxx$1_required = xtrue; then 121 | if test x$ac_success = xno; then 122 | AC_MSG_ERROR([*** A compiler with support for C++$1 language features is required.]) 123 | fi 124 | fi 125 | if test x$ac_success = xno; then 126 | HAVE_CXX$1=0 127 | AC_MSG_NOTICE([No compiler with C++$1 support was found]) 128 | else 129 | HAVE_CXX$1=1 130 | AC_DEFINE(HAVE_CXX$1,1, 131 | [define if the compiler supports basic C++$1 syntax]) 132 | fi 133 | AC_SUBST(HAVE_CXX$1) 134 | ]) 135 | 136 | 137 | dnl Test body for checking C++11 support 138 | 139 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_11], 140 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 141 | ) 142 | 143 | 144 | dnl Test body for checking C++14 support 145 | 146 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14], 147 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 148 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 149 | ) 150 | 151 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_17], 152 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 153 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 154 | _AX_CXX_COMPILE_STDCXX_testbody_new_in_17 155 | ) 156 | 157 | dnl Tests for new features in C++11 158 | 159 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[ 160 | 161 | // If the compiler admits that it is not ready for C++11, why torture it? 162 | // Hopefully, this will speed up the test. 163 | 164 | #ifndef __cplusplus 165 | 166 | #error "This is not a C++ compiler" 167 | 168 | #elif __cplusplus < 201103L 169 | 170 | #error "This is not a C++11 compiler" 171 | 172 | #else 173 | 174 | namespace cxx11 175 | { 176 | 177 | namespace test_static_assert 178 | { 179 | 180 | template 181 | struct check 182 | { 183 | static_assert(sizeof(int) <= sizeof(T), "not big enough"); 184 | }; 185 | 186 | } 187 | 188 | namespace test_final_override 189 | { 190 | 191 | struct Base 192 | { 193 | virtual ~Base() {} 194 | virtual void f() {} 195 | }; 196 | 197 | struct Derived : public Base 198 | { 199 | virtual ~Derived() override {} 200 | virtual void f() override {} 201 | }; 202 | 203 | } 204 | 205 | namespace test_double_right_angle_brackets 206 | { 207 | 208 | template < typename T > 209 | struct check {}; 210 | 211 | typedef check single_type; 212 | typedef check> double_type; 213 | typedef check>> triple_type; 214 | typedef check>>> quadruple_type; 215 | 216 | } 217 | 218 | namespace test_decltype 219 | { 220 | 221 | int 222 | f() 223 | { 224 | int a = 1; 225 | decltype(a) b = 2; 226 | return a + b; 227 | } 228 | 229 | } 230 | 231 | namespace test_type_deduction 232 | { 233 | 234 | template < typename T1, typename T2 > 235 | struct is_same 236 | { 237 | static const bool value = false; 238 | }; 239 | 240 | template < typename T > 241 | struct is_same 242 | { 243 | static const bool value = true; 244 | }; 245 | 246 | template < typename T1, typename T2 > 247 | auto 248 | add(T1 a1, T2 a2) -> decltype(a1 + a2) 249 | { 250 | return a1 + a2; 251 | } 252 | 253 | int 254 | test(const int c, volatile int v) 255 | { 256 | static_assert(is_same::value == true, ""); 257 | static_assert(is_same::value == false, ""); 258 | static_assert(is_same::value == false, ""); 259 | auto ac = c; 260 | auto av = v; 261 | auto sumi = ac + av + 'x'; 262 | auto sumf = ac + av + 1.0; 263 | static_assert(is_same::value == true, ""); 264 | static_assert(is_same::value == 
true, ""); 265 | static_assert(is_same::value == true, ""); 266 | static_assert(is_same::value == false, ""); 267 | static_assert(is_same::value == true, ""); 268 | return (sumf > 0.0) ? sumi : add(c, v); 269 | } 270 | 271 | } 272 | 273 | namespace test_noexcept 274 | { 275 | 276 | int f() { return 0; } 277 | int g() noexcept { return 0; } 278 | 279 | static_assert(noexcept(f()) == false, ""); 280 | static_assert(noexcept(g()) == true, ""); 281 | 282 | } 283 | 284 | namespace test_constexpr 285 | { 286 | 287 | template < typename CharT > 288 | unsigned long constexpr 289 | strlen_c_r(const CharT *const s, const unsigned long acc) noexcept 290 | { 291 | return *s ? strlen_c_r(s + 1, acc + 1) : acc; 292 | } 293 | 294 | template < typename CharT > 295 | unsigned long constexpr 296 | strlen_c(const CharT *const s) noexcept 297 | { 298 | return strlen_c_r(s, 0UL); 299 | } 300 | 301 | static_assert(strlen_c("") == 0UL, ""); 302 | static_assert(strlen_c("1") == 1UL, ""); 303 | static_assert(strlen_c("example") == 7UL, ""); 304 | static_assert(strlen_c("another\0example") == 7UL, ""); 305 | 306 | } 307 | 308 | namespace test_rvalue_references 309 | { 310 | 311 | template < int N > 312 | struct answer 313 | { 314 | static constexpr int value = N; 315 | }; 316 | 317 | answer<1> f(int&) { return answer<1>(); } 318 | answer<2> f(const int&) { return answer<2>(); } 319 | answer<3> f(int&&) { return answer<3>(); } 320 | 321 | void 322 | test() 323 | { 324 | int i = 0; 325 | const int c = 0; 326 | static_assert(decltype(f(i))::value == 1, ""); 327 | static_assert(decltype(f(c))::value == 2, ""); 328 | static_assert(decltype(f(0))::value == 3, ""); 329 | } 330 | 331 | } 332 | 333 | namespace test_uniform_initialization 334 | { 335 | 336 | struct test 337 | { 338 | static const int zero {}; 339 | static const int one {1}; 340 | }; 341 | 342 | static_assert(test::zero == 0, ""); 343 | static_assert(test::one == 1, ""); 344 | 345 | } 346 | 347 | namespace test_lambdas 348 | { 349 | 350 | void 351 | test1() 352 | { 353 | auto lambda1 = [](){}; 354 | auto lambda2 = lambda1; 355 | lambda1(); 356 | lambda2(); 357 | } 358 | 359 | int 360 | test2() 361 | { 362 | auto a = [](int i, int j){ return i + j; }(1, 2); 363 | auto b = []() -> int { return '0'; }(); 364 | auto c = [=](){ return a + b; }(); 365 | auto d = [&](){ return c; }(); 366 | auto e = [a, &b](int x) mutable { 367 | const auto identity = [](int y){ return y; }; 368 | for (auto i = 0; i < a; ++i) 369 | a += b--; 370 | return x + identity(a + b); 371 | }(0); 372 | return a + b + c + d + e; 373 | } 374 | 375 | int 376 | test3() 377 | { 378 | const auto nullary = [](){ return 0; }; 379 | const auto unary = [](int x){ return x; }; 380 | using nullary_t = decltype(nullary); 381 | using unary_t = decltype(unary); 382 | const auto higher1st = [](nullary_t f){ return f(); }; 383 | const auto higher2nd = [unary](nullary_t f1){ 384 | return [unary, f1](unary_t f2){ return f2(unary(f1())); }; 385 | }; 386 | return higher1st(nullary) + higher2nd(nullary)(unary); 387 | } 388 | 389 | } 390 | 391 | namespace test_variadic_templates 392 | { 393 | 394 | template 395 | struct sum; 396 | 397 | template 398 | struct sum 399 | { 400 | static constexpr auto value = N0 + sum::value; 401 | }; 402 | 403 | template <> 404 | struct sum<> 405 | { 406 | static constexpr auto value = 0; 407 | }; 408 | 409 | static_assert(sum<>::value == 0, ""); 410 | static_assert(sum<1>::value == 1, ""); 411 | static_assert(sum<23>::value == 23, ""); 412 | static_assert(sum<1, 2>::value == 3, ""); 
413 | static_assert(sum<5, 5, 11>::value == 21, ""); 414 | static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, ""); 415 | 416 | } 417 | 418 | // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae 419 | // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function 420 | // because of this. 421 | namespace test_template_alias_sfinae 422 | { 423 | 424 | struct foo {}; 425 | 426 | template 427 | using member = typename T::member_type; 428 | 429 | template 430 | void func(...) {} 431 | 432 | template 433 | void func(member*) {} 434 | 435 | void test(); 436 | 437 | void test() { func(0); } 438 | 439 | } 440 | 441 | } // namespace cxx11 442 | 443 | #endif // __cplusplus >= 201103L 444 | 445 | ]]) 446 | 447 | 448 | dnl Tests for new features in C++14 449 | 450 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[ 451 | 452 | // If the compiler admits that it is not ready for C++14, why torture it? 453 | // Hopefully, this will speed up the test. 454 | 455 | #ifndef __cplusplus 456 | 457 | #error "This is not a C++ compiler" 458 | 459 | #elif __cplusplus < 201402L 460 | 461 | #error "This is not a C++14 compiler" 462 | 463 | #else 464 | 465 | namespace cxx14 466 | { 467 | 468 | namespace test_polymorphic_lambdas 469 | { 470 | 471 | int 472 | test() 473 | { 474 | const auto lambda = [](auto&&... args){ 475 | const auto istiny = [](auto x){ 476 | return (sizeof(x) == 1UL) ? 1 : 0; 477 | }; 478 | const int aretiny[] = { istiny(args)... }; 479 | return aretiny[0]; 480 | }; 481 | return lambda(1, 1L, 1.0f, '1'); 482 | } 483 | 484 | } 485 | 486 | namespace test_binary_literals 487 | { 488 | 489 | constexpr auto ivii = 0b0000000000101010; 490 | static_assert(ivii == 42, "wrong value"); 491 | 492 | } 493 | 494 | namespace test_generalized_constexpr 495 | { 496 | 497 | template < typename CharT > 498 | constexpr unsigned long 499 | strlen_c(const CharT *const s) noexcept 500 | { 501 | auto length = 0UL; 502 | for (auto p = s; *p; ++p) 503 | ++length; 504 | return length; 505 | } 506 | 507 | static_assert(strlen_c("") == 0UL, ""); 508 | static_assert(strlen_c("x") == 1UL, ""); 509 | static_assert(strlen_c("test") == 4UL, ""); 510 | static_assert(strlen_c("another\0test") == 7UL, ""); 511 | 512 | } 513 | 514 | namespace test_lambda_init_capture 515 | { 516 | 517 | int 518 | test() 519 | { 520 | auto x = 0; 521 | const auto lambda1 = [a = x](int b){ return a + b; }; 522 | const auto lambda2 = [a = lambda1(x)](){ return a; }; 523 | return lambda2(); 524 | } 525 | 526 | } 527 | 528 | namespace test_digit_separators 529 | { 530 | 531 | constexpr auto ten_million = 100'000'000; 532 | static_assert(ten_million == 100000000, ""); 533 | 534 | } 535 | 536 | namespace test_return_type_deduction 537 | { 538 | 539 | auto f(int& x) { return x; } 540 | decltype(auto) g(int& x) { return x; } 541 | 542 | template < typename T1, typename T2 > 543 | struct is_same 544 | { 545 | static constexpr auto value = false; 546 | }; 547 | 548 | template < typename T > 549 | struct is_same 550 | { 551 | static constexpr auto value = true; 552 | }; 553 | 554 | int 555 | test() 556 | { 557 | auto x = 0; 558 | static_assert(is_same::value, ""); 559 | static_assert(is_same::value, ""); 560 | return x; 561 | } 562 | 563 | } 564 | 565 | } // namespace cxx14 566 | 567 | #endif // __cplusplus >= 201402L 568 | 569 | ]]) 570 | 571 | 572 | dnl Tests for new features in C++17 573 | 574 | m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_17], [[ 575 | 576 | // If the compiler admits that it is not ready for 
C++17, why torture it? 577 | // Hopefully, this will speed up the test. 578 | 579 | #ifndef __cplusplus 580 | 581 | #error "This is not a C++ compiler" 582 | 583 | #elif __cplusplus < 201703L 584 | 585 | #error "This is not a C++17 compiler" 586 | 587 | #else 588 | 589 | #include 590 | #include 591 | #include 592 | 593 | namespace cxx17 594 | { 595 | 596 | namespace test_constexpr_lambdas 597 | { 598 | 599 | constexpr int foo = [](){return 42;}(); 600 | 601 | } 602 | 603 | namespace test::nested_namespace::definitions 604 | { 605 | 606 | } 607 | 608 | namespace test_fold_expression 609 | { 610 | 611 | template 612 | int multiply(Args... args) 613 | { 614 | return (args * ... * 1); 615 | } 616 | 617 | template 618 | bool all(Args... args) 619 | { 620 | return (args && ...); 621 | } 622 | 623 | } 624 | 625 | namespace test_extended_static_assert 626 | { 627 | 628 | static_assert (true); 629 | 630 | } 631 | 632 | namespace test_auto_brace_init_list 633 | { 634 | 635 | auto foo = {5}; 636 | auto bar {5}; 637 | 638 | static_assert(std::is_same, decltype(foo)>::value); 639 | static_assert(std::is_same::value); 640 | } 641 | 642 | namespace test_typename_in_template_template_parameter 643 | { 644 | 645 | template typename X> struct D; 646 | 647 | } 648 | 649 | namespace test_fallthrough_nodiscard_maybe_unused_attributes 650 | { 651 | 652 | int f1() 653 | { 654 | return 42; 655 | } 656 | 657 | [[nodiscard]] int f2() 658 | { 659 | [[maybe_unused]] auto unused = f1(); 660 | 661 | switch (f1()) 662 | { 663 | case 17: 664 | f1(); 665 | [[fallthrough]]; 666 | case 42: 667 | f1(); 668 | } 669 | return f1(); 670 | } 671 | 672 | } 673 | 674 | namespace test_extended_aggregate_initialization 675 | { 676 | 677 | struct base1 678 | { 679 | int b1, b2 = 42; 680 | }; 681 | 682 | struct base2 683 | { 684 | base2() { 685 | b3 = 42; 686 | } 687 | int b3; 688 | }; 689 | 690 | struct derived : base1, base2 691 | { 692 | int d; 693 | }; 694 | 695 | derived d1 {{1, 2}, {}, 4}; // full initialization 696 | derived d2 {{}, {}, 4}; // value-initialized bases 697 | 698 | } 699 | 700 | namespace test_general_range_based_for_loop 701 | { 702 | 703 | struct iter 704 | { 705 | int i; 706 | 707 | int& operator* () 708 | { 709 | return i; 710 | } 711 | 712 | const int& operator* () const 713 | { 714 | return i; 715 | } 716 | 717 | iter& operator++() 718 | { 719 | ++i; 720 | return *this; 721 | } 722 | }; 723 | 724 | struct sentinel 725 | { 726 | int i; 727 | }; 728 | 729 | bool operator== (const iter& i, const sentinel& s) 730 | { 731 | return i.i == s.i; 732 | } 733 | 734 | bool operator!= (const iter& i, const sentinel& s) 735 | { 736 | return !(i == s); 737 | } 738 | 739 | struct range 740 | { 741 | iter begin() const 742 | { 743 | return {0}; 744 | } 745 | 746 | sentinel end() const 747 | { 748 | return {5}; 749 | } 750 | }; 751 | 752 | void f() 753 | { 754 | range r {}; 755 | 756 | for (auto i : r) 757 | { 758 | [[maybe_unused]] auto v = i; 759 | } 760 | } 761 | 762 | } 763 | 764 | namespace test_lambda_capture_asterisk_this_by_value 765 | { 766 | 767 | struct t 768 | { 769 | int i; 770 | int foo() 771 | { 772 | return [*this]() 773 | { 774 | return i; 775 | }(); 776 | } 777 | }; 778 | 779 | } 780 | 781 | namespace test_enum_class_construction 782 | { 783 | 784 | enum class byte : unsigned char 785 | {}; 786 | 787 | byte foo {42}; 788 | 789 | } 790 | 791 | namespace test_constexpr_if 792 | { 793 | 794 | template 795 | int f () 796 | { 797 | if constexpr(cond) 798 | { 799 | return 13; 800 | } 801 | else 802 | { 803 | 
return 42; 804 | } 805 | } 806 | 807 | } 808 | 809 | namespace test_selection_statement_with_initializer 810 | { 811 | 812 | int f() 813 | { 814 | return 13; 815 | } 816 | 817 | int f2() 818 | { 819 | if (auto i = f(); i > 0) 820 | { 821 | return 3; 822 | } 823 | 824 | switch (auto i = f(); i + 4) 825 | { 826 | case 17: 827 | return 2; 828 | 829 | default: 830 | return 1; 831 | } 832 | } 833 | 834 | } 835 | 836 | namespace test_template_argument_deduction_for_class_templates 837 | { 838 | 839 | template 840 | struct pair 841 | { 842 | pair (T1 p1, T2 p2) 843 | : m1 {p1}, 844 | m2 {p2} 845 | {} 846 | 847 | T1 m1; 848 | T2 m2; 849 | }; 850 | 851 | void f() 852 | { 853 | [[maybe_unused]] auto p = pair{13, 42u}; 854 | } 855 | 856 | } 857 | 858 | namespace test_non_type_auto_template_parameters 859 | { 860 | 861 | template 862 | struct B 863 | {}; 864 | 865 | B<5> b1; 866 | B<'a'> b2; 867 | 868 | } 869 | 870 | namespace test_structured_bindings 871 | { 872 | 873 | int arr[2] = { 1, 2 }; 874 | std::pair pr = { 1, 2 }; 875 | 876 | auto f1() -> int(&)[2] 877 | { 878 | return arr; 879 | } 880 | 881 | auto f2() -> std::pair& 882 | { 883 | return pr; 884 | } 885 | 886 | struct S 887 | { 888 | int x1 : 2; 889 | volatile double y1; 890 | }; 891 | 892 | S f3() 893 | { 894 | return {}; 895 | } 896 | 897 | auto [ x1, y1 ] = f1(); 898 | auto& [ xr1, yr1 ] = f1(); 899 | auto [ x2, y2 ] = f2(); 900 | auto& [ xr2, yr2 ] = f2(); 901 | const auto [ x3, y3 ] = f3(); 902 | 903 | } 904 | 905 | namespace test_exception_spec_type_system 906 | { 907 | 908 | struct Good {}; 909 | struct Bad {}; 910 | 911 | void g1() noexcept; 912 | void g2(); 913 | 914 | template 915 | Bad 916 | f(T*, T*); 917 | 918 | template 919 | Good 920 | f(T1*, T2*); 921 | 922 | static_assert (std::is_same_v); 923 | 924 | } 925 | 926 | namespace test_inline_variables 927 | { 928 | 929 | template void f(T) 930 | {} 931 | 932 | template inline T g(T) 933 | { 934 | return T{}; 935 | } 936 | 937 | template<> inline void f<>(int) 938 | {} 939 | 940 | template<> int g<>(int) 941 | { 942 | return 5; 943 | } 944 | 945 | } 946 | 947 | } // namespace cxx17 948 | 949 | #endif // __cplusplus < 201703L 950 | 951 | ]]) 952 | -------------------------------------------------------------------------------- /m4/ax_cxx_compile_stdcxx_17.m4: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_17.html 3 | # ============================================================================= 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_CXX_COMPILE_STDCXX_17([ext|noext], [mandatory|optional]) 8 | # 9 | # DESCRIPTION 10 | # 11 | # Check for baseline language coverage in the compiler for the C++17 12 | # standard; if necessary, add switches to CXX and CXXCPP to enable 13 | # support. 14 | # 15 | # This macro is a convenience alias for calling the AX_CXX_COMPILE_STDCXX 16 | # macro with the version set to C++17. The two optional arguments are 17 | # forwarded literally as the second and third argument respectively. 18 | # Please see the documentation for the AX_CXX_COMPILE_STDCXX macro for 19 | # more information. If you want to use this macro, you also need to 20 | # download the ax_cxx_compile_stdcxx.m4 file. 
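#
#   A minimal sketch of a typical invocation from a configure.ac
#   (illustrative only, following the SYNOPSIS above; it assumes both
#   .m4 files are on the macro search path, e.g. under m4/):
#
#     AX_CXX_COMPILE_STDCXX_17([noext], [mandatory])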
21 | # 22 | # LICENSE 23 | # 24 | # Copyright (c) 2015 Moritz Klammler 25 | # Copyright (c) 2016 Krzesimir Nowak 26 | # 27 | # Copying and distribution of this file, with or without modification, are 28 | # permitted in any medium without royalty provided the copyright notice 29 | # and this notice are preserved. This file is offered as-is, without any 30 | # warranty. 31 | 32 | #serial 2 33 | 34 | AX_REQUIRE_DEFINED([AX_CXX_COMPILE_STDCXX]) 35 | AC_DEFUN([AX_CXX_COMPILE_STDCXX_17], [AX_CXX_COMPILE_STDCXX([17], [$1], [$2])]) 36 | 37 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2011-2020 University of Southern California and 2 | # Andrew D. Smith and Timothy Daley 3 | # 4 | # Authors: Timothy Daley and Andrew D. Smith 5 | # 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | 16 | PROGS = preseq 17 | ifdef HAVE_HTSLIB 18 | PROGS += to-mr 19 | endif 20 | 21 | INCLUDEDIRS = smithlab_cpp 22 | INCLUDEARGS = $(addprefix -I, $(INCLUDEDIRS)) 23 | LIBS = -lz 24 | CXX = g++ 25 | CXXFLAGS = -std=c++11 -Wall 26 | 27 | ifdef DEBUG 28 | CXXFLAGS += -g 29 | else 30 | CXXFLAGS += -O2 31 | endif 32 | 33 | ifdef HAVE_HTSLIB 34 | CXXFLAGS += -DHAVE_HTSLIB 35 | LIBS += -lhts 36 | endif 37 | 38 | all: $(PROGS) 39 | 40 | $(PROGS): $(addprefix smithlab_cpp/, \ 41 | smithlab_os.o smithlab_utils.o GenomicRegion.o \ 42 | OptionParser.o MappedRead.o) 43 | 44 | ifdef HAVE_HTSLIB 45 | preseq to-mr: $(addprefix smithlab_cpp/, \ 46 | htslib_wrapper_deprecated.o cigar_utils.o) 47 | endif 48 | 49 | preseq: continued_fraction.o load_data_for_complexity.o moment_sequence.o 50 | 51 | %.o: %.cpp %.hpp 52 | $(CXX) $(CXXFLAGS) -c -o $@ $< $(INCLUDEARGS) 53 | 54 | %: %.cpp 55 | $(CXX) $(CXXFLAGS) -o $@ $^ $(INCLUDEARGS) $(LIBS) 56 | 57 | install: $(PROGS) 58 | @mkdir -p $(install_dir)/bin 59 | @install $(PROGS) $(install_dir)/bin 60 | 61 | clean: 62 | @-make -C smithlab_cpp clean 63 | @-rm -f $(PROGS) *.o 64 | 65 | .PHONY: install clean 66 | -------------------------------------------------------------------------------- /src/bam_record_utils.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2020-2023 Masaru Nakajima and Andrew D. Smith 2 | * 3 | * Authors: Masaru Nakajima and Andrew D. Smith 4 | * 5 | * This program is free software: you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation, either version 3 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License for more details. 
14 | */ 15 | 16 | #ifndef SRC_BAM_RECORD_UTILS_HPP_ 17 | #define SRC_BAM_RECORD_UTILS_HPP_ 18 | 19 | /* ADS: need to control all the macros from HTSlib pollution. For 20 | functions maybe: 21 | 22 | $ gcc -dM -E sam.h | grep "define [a-z]" | awk '{print $2}' |\ 23 | grep "[(]" | awk -v FS="(" '{print "#undef",$1}' 24 | 25 | This gives about 65 symbols that need to be deleted. For the others 26 | I don't know what to do because some of them have "#define _" which 27 | means they should be system symbols. 28 | */ 29 | 30 | #include 31 | 32 | #include 33 | 34 | #ifdef bam_is_rev 35 | #undef bam_is_rev 36 | #endif 37 | 38 | inline bool 39 | bam_is_rev(const bamxx::bam_rec &b) { 40 | return (b.b->core.flag & BAM_FREVERSE) != 0; 41 | } 42 | 43 | #ifdef bam_is_mrev 44 | #undef bam_is_mrev 45 | #endif 46 | 47 | inline bool 48 | bam_is_mrev(const bamxx::bam_rec &b) { 49 | return (b.b->core.flag & BAM_FMREVERSE) != 0; 50 | } 51 | 52 | #ifdef bam_get_qname 53 | #undef bam_get_qname 54 | #endif 55 | 56 | inline char * 57 | bam_get_qname(const bamxx::bam_rec &b) { 58 | return reinterpret_cast(b.b->data); 59 | } 60 | 61 | #ifdef bam_get_cigar 62 | #undef bam_get_cigar 63 | #endif 64 | 65 | inline uint32_t * 66 | bam_get_cigar(const bamxx::bam_rec &b) { 67 | // start of data + bytes for query/read name 68 | return reinterpret_cast(b.b->data + b.b->core.l_qname); 69 | } 70 | 71 | #ifdef bam_get_seq 72 | #undef bam_get_seq 73 | #endif 74 | 75 | inline uint8_t * 76 | bam_get_seq(const bamxx::bam_rec &b) { 77 | // start of data + bytes for cigar + bytes for query/read name 78 | return b.b->data + b.b->core.l_qname + (b.b->core.n_cigar << 2); 79 | } 80 | 81 | #ifdef bam_get_qual 82 | #undef bam_get_qual 83 | #endif 84 | 85 | inline uint8_t * 86 | bam_get_qual(const bamxx::bam_rec &b) { 87 | return b.b->data + // start of data 88 | b.b->core.l_qname + // bytes for query name 89 | (b.b->core.n_cigar << 2) + // bytes for cigar 90 | ((b.b->core.l_qseq + 1) >> 1); // bytes for packed query/read 91 | } 92 | 93 | #ifdef bam_get_aux 94 | #undef bam_get_aux 95 | #endif 96 | 97 | inline uint8_t * 98 | bam_get_aux(const bamxx::bam_rec &b) { 99 | return b.b->data + b.b->core.l_qname + (b.b->core.n_cigar << 2) + 100 | ((b.b->core.l_qseq + 1) >> 1) + b.b->core.l_qseq; 101 | } 102 | 103 | #ifdef bam_get_l_aux 104 | #undef bam_get_l_aux 105 | #endif 106 | 107 | inline int 108 | bam_get_l_aux(const bamxx::bam_rec &b) { 109 | return b.b->l_data - (b.b->core.l_qname + (b.b->core.n_cigar << 2) + 110 | ((b.b->core.l_qseq + 1) >> 1) + b.b->core.l_qseq); 111 | } 112 | 113 | #ifdef bam_cigar_op 114 | #undef bam_cigar_op 115 | #endif 116 | 117 | inline uint32_t 118 | bam_cigar_op(const uint32_t c) { 119 | return c & BAM_CIGAR_MASK; 120 | } 121 | 122 | #ifdef bam_cigar_oplen 123 | #undef bam_cigar_oplen 124 | #endif 125 | 126 | inline uint32_t 127 | bam_cigar_oplen(const uint32_t c) { 128 | return c >> BAM_CIGAR_SHIFT; 129 | } 130 | 131 | inline bool 132 | bam_same_orientation(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { 133 | return ((a.b->core.flag ^ b.b->core.flag) & BAM_FREVERSE) != 0; 134 | } 135 | 136 | int 137 | truncate_overlap(const bamxx::bam_rec &a, const uint32_t overlap, 138 | bamxx::bam_rec &c); 139 | 140 | int 141 | merge_overlap(const bamxx::bam_rec &a, const bamxx::bam_rec &b, 142 | const uint32_t head, bamxx::bam_rec &c); 143 | 144 | int 145 | merge_non_overlap(const bamxx::bam_rec &a, const bamxx::bam_rec &b, 146 | const uint32_t spacer, bamxx::bam_rec &c); 147 | 148 | int 149 | keep_better_end(const 
bamxx::bam_rec &a, const bamxx::bam_rec &b, 150 | bamxx::bam_rec &c); 151 | 152 | size_t 153 | correct_cigar(bamxx::bam_rec &b); 154 | 155 | void 156 | flip_conversion(bamxx::bam_rec &aln); 157 | 158 | inline bool 159 | is_a_rich(const bamxx::bam_rec &b) { 160 | return bam_aux2A(bam_aux_get(b.b, "CV")) == 'A'; 161 | } 162 | 163 | void 164 | standardize_format(const std::string &input_format, bamxx::bam_rec &aln); 165 | 166 | void 167 | apply_cigar(const bamxx::bam_rec &aln, std::string &to_inflate, 168 | const char inflation_symbol); 169 | 170 | void 171 | get_seq_str(const bamxx::bam_rec &aln, std::string &seq_str); 172 | 173 | inline bool 174 | are_mates(const bamxx::bam_rec &one, const bamxx::bam_rec &two) { 175 | return one.b->core.mtid == two.b->core.tid && 176 | one.b->core.mpos == two.b->core.pos && bam_same_orientation(one, two); 177 | // below is a consistency check and should not be necessary 178 | /* && 179 | two->core.mtid == one->core.tid && 180 | two->core.mpos == one->core.pos; */ 181 | } 182 | 183 | inline int32_t 184 | get_l_qseq(const bamxx::bam_rec &b) { 185 | return b.b->core.l_qseq; 186 | } 187 | 188 | inline size_t 189 | get_n_targets(const bamxx::bam_header &bh) { 190 | return bh.h->n_targets; 191 | } 192 | 193 | inline std::string 194 | get_qname(const bamxx::bam_rec &b) { 195 | return bam_get_qname(b); 196 | } 197 | 198 | inline int32_t 199 | get_tid(const bamxx::bam_rec &b) { 200 | return b.b->core.tid; 201 | } 202 | 203 | inline hts_pos_t 204 | get_pos(const bamxx::bam_rec &b) { 205 | return b.b->core.pos; 206 | } 207 | 208 | inline int32_t 209 | get_mtid(const bamxx::bam_rec &b) { 210 | return b.b->core.mtid; 211 | } 212 | 213 | inline hts_pos_t 214 | get_mpos(const bamxx::bam_rec &b) { 215 | return b.b->core.mpos; 216 | } 217 | 218 | inline uint32_t 219 | get_n_cigar(const bamxx::bam_rec &b) { 220 | return b.b->core.n_cigar; 221 | } 222 | 223 | inline hts_pos_t 224 | get_endpos(const bamxx::bam_rec &b) { 225 | return bam_endpos(b.b); 226 | } 227 | 228 | inline bool 229 | cigar_eats_ref(const uint32_t c) { 230 | return bam_cigar_type(bam_cigar_op(c)) & 2; 231 | } 232 | 233 | inline bool 234 | cigar_eats_query(const uint32_t c) { 235 | return bam_cigar_type(bam_cigar_op(c)) & 1; 236 | } 237 | 238 | inline bool 239 | cigar_eats_frag(const uint32_t c) { 240 | return bam_cigar_op(c) == BAM_CREF_SKIP; 241 | } 242 | 243 | inline bool 244 | precedes_by_start(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { 245 | // assumes a.get_tid() <= b.get_tid() 246 | return get_tid(a) == get_tid(b) && get_pos(a) < get_pos(b); 247 | } 248 | 249 | inline bool 250 | precedes_by_end_and_strand(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { 251 | const auto end_a = bam_endpos(a.b); 252 | const auto end_b = bam_endpos(b.b); 253 | return end_a < end_b || 254 | (end_a == end_b && bam_is_rev(a) == false && bam_is_rev(b) == true); 255 | } 256 | 257 | inline bool 258 | equivalent_chrom_and_start(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { 259 | return a.b->core.pos == b.b->core.pos && a.b->core.tid == b.b->core.tid; 260 | } 261 | 262 | inline bool 263 | equivalent_end_and_strand(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { 264 | return bam_endpos(a.b) == bam_endpos(b.b) && bam_is_rev(a) == bam_is_rev(b); 265 | } 266 | 267 | template 268 | int 269 | bam_aux_update_int(bamxx::bam_rec &b, const char tag[2], T val) { 270 | return bam_aux_update_int(b.b, tag, val); 271 | } 272 | 273 | inline std::string 274 | sam_hdr_tid2name(const bamxx::bam_header &hdr, const int32_t tid) 
{ 275 | return std::string(sam_hdr_tid2name(hdr.h, tid)); 276 | } 277 | 278 | inline uint32_t 279 | sam_hdr_tid2len(const bamxx::bam_header &hdr, const int32_t tid) { 280 | return sam_hdr_tid2len(hdr.h, tid); 281 | } 282 | 283 | inline std::string 284 | sam_hdr_tid2name(const bamxx::bam_header &hdr, const bamxx::bam_rec &aln) { 285 | return std::string(sam_hdr_tid2name(hdr.h, aln.b->core.tid)); 286 | } 287 | 288 | std::string 289 | to_string(const bamxx::bam_header &hdr, const bamxx::bam_rec &aln); 290 | 291 | inline size_t 292 | rlen_from_cigar(const bamxx::bam_rec &aln) { 293 | return bam_cigar2rlen(get_n_cigar(aln), bam_get_cigar(aln)); 294 | } 295 | 296 | #endif // SRC_BAM_RECORD_UTILS_HPP_ 297 | -------------------------------------------------------------------------------- /src/bound_pop.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #include "bound_pop.hpp" 22 | 23 | #include "common.hpp" 24 | #include "load_data_for_complexity.hpp" 25 | #include "moment_sequence.hpp" 26 | 27 | #include 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include // std::mt19937 35 | #include 36 | #include 37 | #include 38 | 39 | using std::cerr; 40 | using std::endl; 41 | using std::isfinite; 42 | using std::min; 43 | using std::mt19937; 44 | using std::runtime_error; 45 | using std::string; 46 | using std::uint32_t; 47 | using std::vector; 48 | 49 | static void 50 | report_bootstrapped_moments(const vector &bootstrap_moments, 51 | const MomentSequence &bootstrap_mom_seq, 52 | const vector &points, 53 | const vector &weights, 54 | const double estimated_unobs) { 55 | cerr << "bootstrapped_moments=" << endl; 56 | for (size_t i = 0; i < bootstrap_moments.size(); i++) 57 | cerr << bootstrap_moments[i] << endl; 58 | for (size_t k = 0; k < bootstrap_mom_seq.alpha.size(); k++) 59 | cerr << "alpha_" << k << '\t'; 60 | cerr << endl; 61 | for (size_t k = 0; k < bootstrap_mom_seq.alpha.size(); k++) 62 | cerr << bootstrap_mom_seq.alpha[k] << '\t'; 63 | cerr << endl; 64 | 65 | for (size_t k = 0; k < bootstrap_mom_seq.beta.size(); k++) 66 | cerr << "beta_" << k << '\t'; 67 | cerr << endl; 68 | for (size_t k = 0; k < bootstrap_mom_seq.beta.size(); k++) 69 | cerr << bootstrap_mom_seq.beta[k] << '\t'; 70 | cerr << endl; 71 | cerr << "points=" << "\t"; 72 | for (size_t i = 0; i < points.size(); i++) 73 | cerr << points[i] << "\t"; 74 | cerr << endl; 75 | cerr << "weights=" << "\t"; 76 | for (size_t i = 0; i < weights.size(); i++) 77 | cerr << weights[i] << "\t"; 78 | cerr << endl; 79 | cerr << "estimated_unobs=" << "\t" << estimated_unobs << endl; 80 | } 81 | 82 | // BOUND_UNOBS: bounding n_0 83 | int 84 | 
bound_pop_main(const int argc, const char *argv[]) { 85 | try { 86 | bool verbose = false; 87 | bool PAIRED_END = false; 88 | bool HIST_INPUT = false; 89 | bool VALS_INPUT = false; 90 | bool QUICK_MODE = false; 91 | 92 | string outfile; 93 | string histogram_outfile; 94 | 95 | #ifdef HAVE_HTSLIB 96 | bool BAM_FORMAT_INPUT = false; 97 | size_t MAX_SEGMENT_LENGTH = 5000; 98 | uint32_t n_threads{1}; 99 | #endif 100 | 101 | size_t max_num_points = 10; 102 | double tolerance = 1e-20; 103 | size_t n_bootstraps = 500; 104 | double c_level = 0.95; 105 | size_t max_iter = 100; 106 | uint32_t seed = 408; 107 | 108 | const string description = R"( 109 | Estimate a bound on the size of the underlying population based on 110 | counts of observed species in an initial sample. 111 | )"; 112 | string program_name = std::filesystem::path(argv[0]).filename(); 113 | program_name += " " + string(argv[1]); 114 | 115 | /********** GET COMMAND LINE ARGUMENTS FOR BOUND_POP ***********/ 116 | OptionParser opt_parse(program_name, description, ""); 117 | opt_parse.add_opt("output", 'o', 118 | "species richness output file " 119 | "(default: stdout)", 120 | false, outfile); 121 | opt_parse.add_opt("max_num_points", 'p', 122 | "maximum number of points in " 123 | "quadrature estimates", 124 | false, max_num_points); 125 | opt_parse.add_opt("tolerance", 't', "numerical tolerance", false, 126 | tolerance); 127 | opt_parse.add_opt("bootstraps", 'n', "number of bootstraps", false, 128 | n_bootstraps); 129 | opt_parse.add_opt("clevel", 'c', "level for confidence intervals", false, 130 | c_level); 131 | opt_parse.add_opt("verbose", 'v', "print more information", false, verbose); 132 | opt_parse.add_opt("pe", 'P', "input is paired end read file", false, 133 | PAIRED_END); 134 | opt_parse.add_opt("hist", 'H', 135 | "input is a text file containing the " 136 | "observed histogram", 137 | false, HIST_INPUT); 138 | opt_parse.add_opt("hist-out", '\0', 139 | "output histogram to this file (for non-hist input)", 140 | false, histogram_outfile); 141 | opt_parse.add_opt("vals", 'V', 142 | "input is a text file containing only the " 143 | "observed duplicate counts", 144 | false, VALS_INPUT); 145 | #ifdef HAVE_HTSLIB 146 | opt_parse.add_opt("bam", 'B', "input is in BAM format", false, 147 | BAM_FORMAT_INPUT); 148 | opt_parse.add_opt("seg_len", 'l', 149 | "maximum segment length when merging " 150 | "paired end bam reads", 151 | false, MAX_SEGMENT_LENGTH); 152 | opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", 153 | false, n_threads); 154 | #endif 155 | opt_parse.add_opt("quick", 'Q', 156 | "quick mode, estimate without bootstrapping", false, 157 | QUICK_MODE); 158 | opt_parse.add_opt("seed", 'r', "seed for random number generator", false, 159 | seed); 160 | opt_parse.set_show_defaults(); 161 | 162 | vector leftover_args; 163 | opt_parse.parse(argc - 1, argv + 1, leftover_args); 164 | if (argc == 2 || opt_parse.help_requested()) { 165 | cerr << opt_parse.help_message() << endl; 166 | cerr << opt_parse.about_message() << endl; 167 | return EXIT_SUCCESS; 168 | } 169 | if (opt_parse.option_missing()) { 170 | cerr << opt_parse.option_missing_message() << endl; 171 | return EXIT_SUCCESS; 172 | } 173 | if (leftover_args.empty()) { 174 | cerr << opt_parse.help_message() << endl; 175 | return EXIT_SUCCESS; 176 | } 177 | const string input_file_name = leftover_args.front(); 178 | // **************************************************************** 179 | 180 | vector counts_hist; 181 | size_t n_obs = 0; 182 | 183 | // 
LOAD VALUES 184 | if (HIST_INPUT) { 185 | if (verbose) 186 | cerr << "HIST_INPUT" << endl; 187 | n_obs = load_histogram(input_file_name, counts_hist); 188 | } 189 | else if (VALS_INPUT) { 190 | if (verbose) 191 | cerr << "VALS_INPUT" << endl; 192 | n_obs = load_counts(input_file_name, counts_hist); 193 | } 194 | #ifdef HAVE_HTSLIB 195 | else if (BAM_FORMAT_INPUT && PAIRED_END) { 196 | if (verbose) 197 | cerr << "PAIRED_END_BAM_INPUT" << endl; 198 | n_obs = load_counts_BAM_pe(n_threads, input_file_name, counts_hist); 199 | } 200 | else if (BAM_FORMAT_INPUT) { 201 | if (verbose) 202 | cerr << "BAM_INPUT" << endl; 203 | n_obs = load_counts_BAM_se(n_threads, input_file_name, counts_hist); 204 | } 205 | #endif 206 | else if (PAIRED_END) { 207 | if (verbose) 208 | cerr << "PAIRED_END_BED_INPUT" << endl; 209 | n_obs = load_counts_BED_pe(input_file_name, counts_hist); 210 | } 211 | else { // default is single end bed file 212 | if (verbose) 213 | cerr << "BED_INPUT" << endl; 214 | n_obs = load_counts_BED_se(input_file_name, counts_hist); 215 | } 216 | 217 | const double distinct_obs = 218 | accumulate(begin(counts_hist), end(counts_hist), 0.0); 219 | 220 | vector measure_moments; 221 | // mu_r = (r + 1)! n_{r+1} / n_1 222 | size_t idx = 1; 223 | while (idx < counts_hist.size() && counts_hist[idx]) { 224 | // idx + 1 because function calculates (x-1)! 225 | measure_moments.push_back( 226 | exp(factorial(idx + 1) + log(counts_hist[idx]) - log(counts_hist[1]))); 227 | if (!isfinite(measure_moments.back())) { 228 | measure_moments.pop_back(); 229 | break; 230 | } 231 | ++idx; 232 | } 233 | 234 | if (verbose) { 235 | cerr << "TOTAL OBSERVATIONS = " << n_obs << endl 236 | << "DISTINCT OBSERVATIONS = " << distinct_obs << endl 237 | << "MAX COUNT = " << counts_hist.size() - 1 << endl; 238 | 239 | cerr << "OBSERVED MOMENTS" << endl; 240 | for (size_t i = 0; i < measure_moments.size(); i++) 241 | cerr << std::setprecision(16) << measure_moments[i] << endl; 242 | } 243 | 244 | if (!histogram_outfile.empty()) 245 | report_histogram(histogram_outfile, counts_hist); 246 | 247 | if (QUICK_MODE) { 248 | if (measure_moments.size() < 2 * max_num_points) 249 | max_num_points = static_cast(floor(measure_moments.size() / 2)); 250 | else 251 | measure_moments.resize(2 * max_num_points); 252 | size_t n_points = 0; 253 | n_points = ensure_pos_def_mom_seq(measure_moments, tolerance, verbose); 254 | if (verbose) 255 | cerr << "n_points = " << n_points << endl; 256 | 257 | MomentSequence obs_mom_seq(measure_moments); 258 | 259 | if (verbose) { 260 | for (size_t k = 0; k < obs_mom_seq.alpha.size(); k++) 261 | cerr << "alpha_" << k << '\t'; 262 | cerr << endl; 263 | for (size_t k = 0; k < obs_mom_seq.alpha.size(); k++) 264 | cerr << obs_mom_seq.alpha[k] << '\t'; 265 | cerr << endl; 266 | 267 | for (size_t k = 0; k < obs_mom_seq.beta.size(); k++) 268 | cerr << "beta_" << k << '\t'; 269 | cerr << endl; 270 | for (size_t k = 0; k < obs_mom_seq.beta.size(); k++) 271 | cerr << obs_mom_seq.beta[k] << '\t'; 272 | cerr << endl; 273 | } 274 | 275 | vector points, weights; 276 | obs_mom_seq.Lower_quadrature_rules(n_points, tolerance, max_iter, points, 277 | weights); 278 | 279 | // renormalize if needed 280 | const double weights_sum = accumulate(begin(weights), end(weights), 0.0); 281 | if (weights_sum != 1.0) 282 | for (size_t i = 0; i < weights.size(); i++) 283 | weights[i] = weights[i] / weights_sum; 284 | 285 | if (verbose) { 286 | cerr << "points = " << endl; 287 | for (size_t i = 0; i < points.size(); i++) 288 | cerr << 
points[i] << '\t'; 289 | cerr << endl; 290 | 291 | cerr << "weights = " << endl; 292 | for (size_t i = 0; i < weights.size(); i++) 293 | cerr << weights[i] << '\t'; 294 | cerr << endl; 295 | } 296 | 297 | double estimated_unobs = 0.0; 298 | 299 | for (size_t i = 0; i < weights.size(); i++) 300 | estimated_unobs += counts_hist[1] * weights[i] / points[i]; 301 | 302 | if (estimated_unobs > 0.0) 303 | estimated_unobs += distinct_obs; 304 | else { 305 | estimated_unobs = distinct_obs; 306 | n_points = 0; 307 | } 308 | 309 | std::ofstream of; 310 | if (!outfile.empty()) 311 | of.open(outfile); 312 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); 313 | 314 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 315 | out.precision(1); 316 | 317 | out << "quadrature_estimated_unobs" << '\t' << "n_points" << endl; 318 | out << estimated_unobs << '\t' << n_points << endl; 319 | } 320 | // NOT QUICK MODE, BOOTSTRAP 321 | else { 322 | vector quad_estimates; 323 | 324 | // setup rng 325 | mt19937 rng(seed); 326 | 327 | // hist may be sparse, to speed up bootstrapping 328 | // sample only from positive entries 329 | vector counts_hist_distinct_counts; 330 | vector distinct_counts_hist; 331 | for (size_t i = 0; i < counts_hist.size(); i++) 332 | if (counts_hist[i] > 0) { 333 | counts_hist_distinct_counts.push_back(i); 334 | distinct_counts_hist.push_back(counts_hist[i]); 335 | } 336 | 337 | for (size_t iter = 0; 338 | iter < max_iter && quad_estimates.size() < n_bootstraps; ++iter) { 339 | if (verbose) 340 | cerr << "iter=" << "\t" << iter << endl; 341 | 342 | vector sample_hist; 343 | resample_hist(rng, counts_hist_distinct_counts, distinct_counts_hist, 344 | sample_hist); 345 | 346 | const double sampled_distinct = 347 | accumulate(begin(sample_hist), end(sample_hist), 0.0); 348 | 349 | // initialize moments, 0th moment is 1 350 | vector bootstrap_moments(1, 1.0); 351 | // moments[r] = (r + 1)! 
n_{r+1} / n_1 352 | for (size_t i = 0; i < 2 * max_num_points; i++) { 353 | bootstrap_moments.push_back(exp( 354 | factorial(i + 3) + log(sample_hist[i + 2]) - log(sample_hist[1]))); 355 | } 356 | 357 | size_t n_points = 0; 358 | n_points = 359 | ensure_pos_def_mom_seq(bootstrap_moments, tolerance, verbose); 360 | n_points = min(n_points, max_num_points); 361 | if (verbose) 362 | cerr << "n_points = " << n_points << endl; 363 | 364 | MomentSequence bootstrap_mom_seq(bootstrap_moments); 365 | 366 | vector points; 367 | vector weights; 368 | bootstrap_mom_seq.Lower_quadrature_rules(n_points, tolerance, max_iter, 369 | points, weights); 370 | 371 | // renormalize if needed 372 | const double weights_sum = 373 | accumulate(begin(weights), end(weights), 0.0); 374 | if (weights_sum != 1.0) 375 | for (size_t i = 0; i < weights.size(); i++) 376 | weights[i] = weights[i] / weights_sum; 377 | 378 | double estimated_unobs = 0.0; 379 | 380 | for (size_t i = 0; i < weights.size(); i++) 381 | estimated_unobs += counts_hist[1] * weights[i] / points[i]; 382 | 383 | if (estimated_unobs > 0.0) 384 | estimated_unobs += sampled_distinct; 385 | else { 386 | estimated_unobs = sampled_distinct; 387 | n_points = 0; 388 | } 389 | 390 | if (verbose) 391 | report_bootstrapped_moments(bootstrap_moments, bootstrap_mom_seq, 392 | points, weights, estimated_unobs); 393 | 394 | quad_estimates.push_back(estimated_unobs); 395 | } 396 | 397 | double median_estimate, lower_ci, upper_ci; 398 | median_and_ci(quad_estimates, c_level, median_estimate, lower_ci, 399 | upper_ci); 400 | 401 | std::ofstream of; 402 | if (!outfile.empty()) 403 | of.open(outfile); 404 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); 405 | 406 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 407 | out.precision(1); 408 | 409 | out << "median_estimated_unobs" << '\t' << "lower_ci" << '\t' 410 | << "upper_ci" << endl; 411 | out << median_estimate << '\t' << lower_ci << '\t' << upper_ci << endl; 412 | } 413 | } 414 | catch (const std::exception &e) { 415 | cerr << e.what() << endl; 416 | return EXIT_FAILURE; 417 | } 418 | return EXIT_SUCCESS; 419 | } 420 | -------------------------------------------------------------------------------- /src/bound_pop.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 
19 | */ 20 | 21 | #ifndef SRC_BOUND_POP_HPP_ 22 | #define SRC_BOUND_POP_HPP_ 23 | 24 | int 25 | bound_pop_main(const int argc, const char *argv[]); 26 | 27 | #endif // SRC_BOUND_POP_HPP_ 28 | -------------------------------------------------------------------------------- /src/c_curve.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #include "c_curve.hpp" 22 | 23 | #include "common.hpp" 24 | #include "continued_fraction.hpp" 25 | #include "load_data_for_complexity.hpp" 26 | #include "moment_sequence.hpp" 27 | 28 | #include 29 | #include 30 | #include 31 | 32 | #include 33 | #include // std::size_t 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | using std::accumulate; 44 | using std::cbegin; 45 | using std::cend; 46 | using std::cerr; 47 | using std::endl; 48 | using std::mt19937; 49 | using std::size; 50 | using std::size_t; 51 | using std::string; 52 | using std::uint32_t; 53 | using std::vector; 54 | 55 | template 56 | T 57 | median_from_sorted_vector(const vector &sorted_data, const size_t stride, 58 | const size_t n) { 59 | if (n == 0 || sorted_data.empty()) 60 | return 0.0; 61 | const size_t lhs = (n - 1) / 2; 62 | const size_t rhs = n / 2; 63 | if (lhs == rhs) 64 | return sorted_data[lhs * stride]; 65 | return (sorted_data[lhs * stride] + sorted_data[rhs * stride]) / 2.0; 66 | } 67 | 68 | int 69 | c_curve_main(const int argc, const char *argv[]) { 70 | try { 71 | bool verbose = false; 72 | bool PAIRED_END = false; 73 | bool HIST_INPUT = false; 74 | bool VALS_INPUT = false; 75 | uint32_t seed = 408; 76 | 77 | string outfile; 78 | string histogram_outfile; 79 | 80 | double step_size = 1e6; 81 | #ifdef HAVE_HTSLIB 82 | bool BAM_FORMAT_INPUT = false; 83 | size_t MAX_SEGMENT_LENGTH = 5000; 84 | uint32_t n_threads{1}; 85 | #endif 86 | 87 | const string description = 88 | R"( 89 | Generate the complexity curve for data. This does not extrapolate, but 90 | instead resamples from the given data. 
91 | )"; 92 | string program_name = std::filesystem::path(argv[0]).filename(); 93 | program_name += " " + string(argv[1]); 94 | 95 | /********** GET COMMAND LINE ARGUMENTS FOR C_CURVE ***********/ 96 | OptionParser opt_parse(program_name, description, ""); 97 | opt_parse.add_opt("output", 'o', "yield output file (default: stdout)", 98 | false, outfile); 99 | opt_parse.add_opt("step", 's', "step size in extrapolations", false, 100 | step_size); 101 | opt_parse.add_opt("verbose", 'v', "print more information", false, verbose); 102 | opt_parse.add_opt("pe", 'P', "input paired end read file", false, 103 | PAIRED_END); 104 | opt_parse.add_opt("hist", 'H', 105 | "input is text file containing observed histogram", false, 106 | HIST_INPUT); 107 | opt_parse.add_opt("hist-out", '\0', 108 | "output histogram to this file (for non-hist input)", 109 | false, histogram_outfile); 110 | opt_parse.add_opt("vals", 'V', 111 | "input is text file containing only observed counts", 112 | false, VALS_INPUT); 113 | #ifdef HAVE_HTSLIB 114 | opt_parse.add_opt("bam", 'B', "input is in BAM format", false, 115 | BAM_FORMAT_INPUT); 116 | opt_parse.add_opt("seg_len", 'l', 117 | "maximum segment length when merging " 118 | "paired end bam reads", 119 | false, MAX_SEGMENT_LENGTH); 120 | opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", 121 | false, n_threads); 122 | #endif 123 | opt_parse.add_opt("seed", 'r', "seed for random number generator", false, 124 | seed); 125 | opt_parse.set_show_defaults(); 126 | 127 | vector leftover_args; 128 | opt_parse.parse(argc - 1, argv + 1, leftover_args); 129 | if (argc == 2 || opt_parse.help_requested()) { 130 | cerr << opt_parse.help_message() << endl; 131 | cerr << opt_parse.about_message() << endl; 132 | return EXIT_SUCCESS; 133 | } 134 | if (opt_parse.about_requested()) { 135 | cerr << opt_parse.about_message() << endl; 136 | return EXIT_SUCCESS; 137 | } 138 | if (opt_parse.option_missing()) { 139 | cerr << opt_parse.option_missing_message() << endl; 140 | return EXIT_SUCCESS; 141 | } 142 | if (leftover_args.empty()) { 143 | cerr << opt_parse.help_message() << endl; 144 | return EXIT_SUCCESS; 145 | } 146 | const string input_file_name = leftover_args.front(); 147 | /******************************************************************/ 148 | 149 | // Setup the random number generator 150 | mt19937 rng(seed); 151 | 152 | vector counts_hist; 153 | size_t n_reads = 0; 154 | 155 | // LOAD VALUES 156 | if (HIST_INPUT) { 157 | if (verbose) 158 | cerr << "INPUT_HIST" << endl; 159 | n_reads = load_histogram(input_file_name, counts_hist); 160 | } 161 | else if (VALS_INPUT) { 162 | if (verbose) 163 | cerr << "VALS_INPUT" << endl; 164 | n_reads = load_counts(input_file_name, counts_hist); 165 | } 166 | #ifdef HAVE_HTSLIB 167 | else if (BAM_FORMAT_INPUT && PAIRED_END) { 168 | if (verbose) 169 | cerr << "PAIRED_END_BAM_INPUT" << endl; 170 | n_reads = load_counts_BAM_pe(n_threads, input_file_name, counts_hist); 171 | } 172 | else if (BAM_FORMAT_INPUT) { 173 | if (verbose) 174 | cerr << "BAM_INPUT" << endl; 175 | n_reads = load_counts_BAM_se(n_threads, input_file_name, counts_hist); 176 | } 177 | #endif 178 | else if (PAIRED_END) { 179 | if (verbose) 180 | cerr << "PAIRED_END_BED_INPUT" << endl; 181 | n_reads = load_counts_BED_pe(input_file_name, counts_hist); 182 | } 183 | else { // default is single end bed file 184 | if (verbose) 185 | cerr << "BED_INPUT" << endl; 186 | n_reads = load_counts_BED_se(input_file_name, counts_hist); 187 | } 188 | 189 | const size_t 
max_observed_count = size(counts_hist) - 1; 190 | const double distinct_reads = 191 | accumulate(cbegin(counts_hist), cend(counts_hist), 0.0); 192 | 193 | const size_t total_reads = get_counts_from_hist(counts_hist); 194 | 195 | const size_t distinct_counts = 196 | std::count_if(cbegin(counts_hist), cend(counts_hist), 197 | [](const double x) { return x > 0.0; }); 198 | 199 | if (verbose) 200 | cerr << "TOTAL READS = " << n_reads << endl 201 | << "COUNTS_SUM = " << total_reads << endl 202 | << "DISTINCT READS = " << distinct_reads << endl 203 | << "DISTINCT COUNTS = " << distinct_counts << endl 204 | << "MAX COUNT = " << max_observed_count << endl 205 | << "COUNTS OF 1 = " << counts_hist[1] << endl; 206 | 207 | if (!histogram_outfile.empty()) 208 | report_histogram(histogram_outfile, counts_hist); 209 | 210 | const size_t upper_limit = n_reads; // set upper limit equal to number of 211 | // molecules 212 | 213 | // setup for output of the complexity curve 214 | std::ofstream of; 215 | if (!outfile.empty()) 216 | of.open(outfile); 217 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); 218 | 219 | // prints the complexity curve 220 | out << "total_reads" << "\t" << "distinct_reads" << endl; 221 | out << 0 << '\t' << 0 << endl; 222 | for (size_t i = step_size; i <= upper_limit; i += step_size) { 223 | if (verbose) 224 | cerr << "sample size: " << i << endl; 225 | out << i << "\t" 226 | << interpolate_distinct(counts_hist, total_reads, distinct_reads, i) 227 | << endl; 228 | } 229 | } 230 | catch (const std::exception &e) { 231 | cerr << "ERROR:\t" << e.what() << endl; 232 | return EXIT_FAILURE; 233 | } 234 | return EXIT_SUCCESS; 235 | } 236 | -------------------------------------------------------------------------------- /src/c_curve.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #ifndef SRC_C_CURVE_HPP_ 22 | #define SRC_C_CURVE_HPP_ 23 | 24 | int 25 | c_curve_main(const int argc, const char *argv[]); 26 | 27 | #endif // SRC_C_CURVE_HPP_ 28 | -------------------------------------------------------------------------------- /src/common.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 
10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #include "common.hpp" 22 | 23 | #include "continued_fraction.hpp" 24 | 25 | #include 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | using std::array; 40 | using std::begin; 41 | using std::cbegin; 42 | using std::cend; 43 | using std::cerr; 44 | using std::end; 45 | using std::endl; 46 | using std::min; 47 | using std::mt19937; 48 | using std::runtime_error; 49 | using std::size_t; 50 | using std::string; 51 | using std::uint32_t; 52 | using std::vector; 53 | 54 | double 55 | GoodToulmin2xExtrap(const vector &counts_hist) { 56 | double two_fold_extrap = 0.0; 57 | for (size_t i = 0; i < counts_hist.size(); i++) 58 | two_fold_extrap += pow(-1.0, i + 1) * counts_hist[i]; 59 | return two_fold_extrap; 60 | } 61 | 62 | // Lanczos approximation for gamma function for x >= 0.5 - essentially an 63 | // approximation for (x-1)! 64 | double 65 | factorial(double x) { 66 | // constants 67 | static constexpr double LogRootTwoPi = 0.9189385332046727; 68 | static constexpr double Euler = 2.71828182845904523536028747135; 69 | array Lanczos{0.99999999999980993227684700473478, 70 | 676.520368121885098567009190444019, 71 | -1259.13921672240287047156078755283, 72 | 771.3234287776530788486528258894, 73 | -176.61502916214059906584551354, 74 | 12.507343278686904814458936853, 75 | -0.13857109526572011689554707, 76 | 9.984369578019570859563e-6, 77 | 1.50563273514931155834e-7}; 78 | 79 | // Approximation for factorial is actually x-1 80 | x -= 1.0; 81 | 82 | double Ag = Lanczos[0]; 83 | for (auto k = 1u; k < size(Lanczos); k++) 84 | Ag += Lanczos[k] / (x + k); 85 | 86 | const double term1 = (x + 0.5) * log((x + 7.5) / Euler); 87 | const double term2 = LogRootTwoPi + log(Ag); 88 | 89 | return term1 + (term2 - 7.0); 90 | } 91 | 92 | // interpolate by explicit calculating the expectation 93 | // for sampling without replacement; 94 | // see K.L Heck 1975 95 | // N total sample size; S the total number of distincts 96 | // n sub sample size 97 | double 98 | interpolate_distinct(const vector &hist, const size_t N, const size_t S, 99 | const size_t n) { 100 | const double denom = 101 | factorial(N + 1) - factorial(n + 1) - factorial(N - n + 1); 102 | 103 | vector numer(hist.size(), 0); 104 | for (size_t i = 1; i < hist.size(); i++) { 105 | // N - i -n + 1 should be greater than 0 106 | if (N < i + n) { 107 | numer[i] = 0; 108 | } 109 | else { 110 | const double x = 111 | (factorial(N - i + 1) - factorial(n + 1) - factorial(N - i - n + 1)); 112 | numer[i] = exp(x - denom) * hist[i]; 113 | } 114 | } 115 | return S - accumulate(cbegin(numer), cend(numer), 0); 116 | } 117 | 118 | static void 119 | extrapolate_curve(const ContinuedFraction &the_cf, 120 | const double initial_distinct, const double vals_sum, 121 | const double initial_sample_size, const double step_size, 122 | const double max_sample_size, vector &estimates) { 123 | double curr_samp_sz = initial_sample_size; 124 | while (curr_samp_sz < max_sample_size) { 125 | const double fold = (curr_samp_sz - vals_sum) / vals_sum; 126 | 
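    // fold is the extrapolation factor: how many additional multiples of
    // the observed read count (vals_sum) the current sample size
    // represents; below, fold * the_cf(fold) approximates the number of
    // distinct observations gained beyond the initial sample.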
assert(fold >= 0.0); 127 | estimates.push_back(initial_distinct + fold * the_cf(fold)); 128 | curr_samp_sz += step_size; 129 | } 130 | } 131 | 132 | bool 133 | extrap_single_estimate(const bool VERBOSE, const bool allow_defects, 134 | const vector &hist, size_t max_terms, 135 | const int diagonal, const double step_size, 136 | const double max_extrap, 137 | vector &yield_estimate) { 138 | yield_estimate.clear(); 139 | 140 | const double vals_sum = get_counts_from_hist(hist); 141 | const double initial_distinct = accumulate(cbegin(hist), cend(hist), 0.0); 142 | 143 | // interpolate complexity curve by random sampling w/out replacement 144 | const size_t upper_limit = vals_sum; 145 | const size_t step = step_size; 146 | size_t sample = static_cast(step_size); 147 | for (; sample < upper_limit; sample += step) 148 | yield_estimate.push_back( 149 | interpolate_distinct(hist, upper_limit, initial_distinct, sample)); 150 | 151 | // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE 152 | size_t first_zero = 1; 153 | while (first_zero < hist.size() && hist[first_zero] > 0) 154 | ++first_zero; 155 | 156 | // Ensure we are not using a zero term 157 | max_terms = min(max_terms, first_zero - 1); 158 | 159 | // refit curve for lower bound (degree of approx is 1 less than 160 | // max_terms) 161 | max_terms = max_terms - (max_terms % 2 == 1); 162 | 163 | if (allow_defects) { 164 | vector ps_coeffs; 165 | for (size_t j = 1; j <= max_terms; j++) 166 | ps_coeffs.push_back(hist[j] * std::pow(-1.0, j + 1)); 167 | 168 | const ContinuedFraction defect_cf(ps_coeffs, diagonal, max_terms); 169 | 170 | extrapolate_curve(defect_cf, initial_distinct, vals_sum, sample, step_size, 171 | max_extrap, yield_estimate); 172 | 173 | if (VERBOSE) 174 | cerr << defect_cf << endl; 175 | // NO FAIL! defect mode doesn't care about failure 176 | } 177 | else { 178 | const ContinuedFractionApproximation lower_cfa(diagonal, max_terms); 179 | const ContinuedFraction lower_cf( 180 | lower_cfa.optimal_cont_frac_distinct(hist)); 181 | 182 | // extrapolate curve 183 | if (lower_cf.is_valid()) { 184 | extrapolate_curve(lower_cf, initial_distinct, vals_sum, sample, step_size, 185 | max_extrap, yield_estimate); 186 | } 187 | else { 188 | // FAIL! lower_cf unacceptable, need to bootstrap to obtain 189 | // estimates 190 | return false; 191 | } 192 | 193 | if (VERBOSE) 194 | cerr << lower_cf << endl; 195 | } 196 | // SUCCESS!! 
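  // Illustrative usage sketch (hypothetical histogram values): hist[j] is
  // the number of reads seen exactly j times, and on success
  // yield_estimate[i] approximates the expected number of distinct reads
  // after (i + 1) * step_size total reads.
  //
  //   std::vector<double> hist{0.0, 9000.0, 800.0, 90.0, 10.0};
  //   std::vector<double> yields;
  //   const bool ok = extrap_single_estimate(false, false, hist, 100, 0,
  //                                          1.0e6, 1.0e10, yields);
  //   // ok == false means the fitted continued fraction was rejected and
  //   // the bootstrap path (extrap_bootstrap) is needed instead.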
197 | return true; 198 | } 199 | 200 | void 201 | extrap_bootstrap(const bool VERBOSE, const bool allow_defects, 202 | const uint32_t seed, const vector &orig_hist, 203 | const size_t n_bootstraps, const size_t orig_max_terms, 204 | const int diagonal, const double bin_step_size, 205 | const double max_extrap, const size_t max_iter, 206 | vector> &bootstrap_estimates) { 207 | // clear returning vectors 208 | bootstrap_estimates.clear(); 209 | 210 | // setup rng 211 | mt19937 rng(seed); 212 | 213 | const double initial_distinct = 214 | std::accumulate(cbegin(orig_hist), cend(orig_hist), 0.0); 215 | 216 | vector orig_hist_distinct_counts; 217 | vector distinct_orig_hist; 218 | for (size_t i = 0; i < orig_hist.size(); i++) 219 | if (orig_hist[i] > 0) { 220 | orig_hist_distinct_counts.push_back(i); 221 | distinct_orig_hist.push_back(orig_hist[i]); 222 | } 223 | 224 | for (size_t iter = 0; 225 | (iter < max_iter && bootstrap_estimates.size() < n_bootstraps); ++iter) { 226 | if (VERBOSE && iter > 0 && iter % 72 == 0) 227 | cerr << endl; // bootstrap success progress only 72 char wide 228 | 229 | vector yield_vector; 230 | vector hist; 231 | resample_hist(rng, orig_hist_distinct_counts, distinct_orig_hist, hist); 232 | 233 | const double sample_vals_sum = get_counts_from_hist(hist); 234 | 235 | // resize boot_hist to remove excess zeros 236 | while (hist.back() == 0) 237 | hist.pop_back(); 238 | 239 | // compute complexity curve by random sampling w/out replacement 240 | const size_t distinct = accumulate(cbegin(hist), cend(hist), 0.0); 241 | size_t curr_sample_sz = bin_step_size; 242 | while (curr_sample_sz < sample_vals_sum) { 243 | yield_vector.push_back( 244 | interpolate_distinct(hist, sample_vals_sum, distinct, curr_sample_sz)); 245 | curr_sample_sz += bin_step_size; 246 | } 247 | 248 | // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE 249 | size_t first_zero = 1; 250 | while (first_zero < hist.size() && hist[first_zero] > 0) 251 | ++first_zero; 252 | 253 | size_t max_terms = min(orig_max_terms, first_zero - 1); 254 | // refit curve for lower bound (degree of approx is 1 less than 255 | // max_terms) 256 | max_terms = max_terms - (max_terms % 2 == 1); 257 | 258 | bool successful_bootstrap = false; 259 | // defect mode, simple extrapolation 260 | if (allow_defects) { 261 | vector ps_coeffs; 262 | for (size_t j = 1; j <= max_terms; j++) 263 | ps_coeffs.push_back(hist[j] * std::pow(-1.0, j + 1)); 264 | 265 | const ContinuedFraction defect_cf(ps_coeffs, diagonal, max_terms); 266 | 267 | extrapolate_curve(defect_cf, initial_distinct, sample_vals_sum, 268 | curr_sample_sz, bin_step_size, max_extrap, 269 | yield_vector); 270 | // no checking of curve in defect mode 271 | bootstrap_estimates.push_back(yield_vector); 272 | successful_bootstrap = true; 273 | } 274 | else { 275 | // refit curve for lower bound 276 | const ContinuedFractionApproximation lower_cfa(diagonal, max_terms); 277 | const ContinuedFraction lower_cf( 278 | lower_cfa.optimal_cont_frac_distinct(hist)); 279 | 280 | // extrapolate the curve start 281 | if (lower_cf.is_valid()) { 282 | extrapolate_curve(lower_cf, initial_distinct, sample_vals_sum, 283 | curr_sample_sz, bin_step_size, max_extrap, 284 | yield_vector); 285 | // sanity check 286 | if (check_yield_estimates_stability(yield_vector)) { 287 | bootstrap_estimates.push_back(yield_vector); 288 | successful_bootstrap = true; 289 | } 290 | } 291 | } 292 | if (VERBOSE) 293 | cerr << (successful_bootstrap ? '.' 
: '_'); 294 | } 295 | if (VERBOSE) 296 | cerr << endl; 297 | if (bootstrap_estimates.size() < n_bootstraps) 298 | throw runtime_error("too many defects in the approximation, " 299 | "consider running in defect mode"); 300 | } 301 | 302 | void 303 | vector_median_and_ci(const vector> &bootstrap_estimates, 304 | const double ci_level, vector &yield_estimates, 305 | vector &lower_ci_lognorm, 306 | vector &upper_ci_lognorm) { 307 | yield_estimates.clear(); 308 | lower_ci_lognorm.clear(); 309 | upper_ci_lognorm.clear(); 310 | assert(!bootstrap_estimates.empty()); 311 | 312 | const size_t n_est = bootstrap_estimates.size(); 313 | vector estimates_row(n_est, 0.0); 314 | for (size_t i = 0; i < bootstrap_estimates[0].size(); i++) { 315 | // estimates is in wrong order, work locally on const val 316 | for (size_t k = 0; k < n_est; ++k) 317 | estimates_row[k] = bootstrap_estimates[k][i]; 318 | 319 | double median_estimate, lower_ci_estimate, upper_ci_estimate; 320 | median_and_ci(estimates_row, ci_level, median_estimate, lower_ci_estimate, 321 | upper_ci_estimate); 322 | std::sort(begin(estimates_row), end(estimates_row)); 323 | 324 | yield_estimates.push_back(median_estimate); 325 | lower_ci_lognorm.push_back(lower_ci_estimate); 326 | upper_ci_lognorm.push_back(upper_ci_estimate); 327 | } 328 | } 329 | 330 | void 331 | write_predicted_complexity_curve(const string &outfile, const double c_level, 332 | const double step_size, 333 | const vector &yield_estimates, 334 | const vector &yield_lower_ci_lognorm, 335 | const vector &yield_upper_ci_lognorm) { 336 | std::ofstream of; 337 | if (!outfile.empty()) 338 | of.open(outfile); 339 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); 340 | 341 | // clang-format off 342 | out << "TOTAL_READS" << '\t' 343 | << "EXPECTED_DISTINCT" << '\t' 344 | << "LOWER_" << c_level << "CI" << '\t' 345 | << "UPPER_" << c_level << "CI" << '\n'; 346 | // clang-format on 347 | 348 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 349 | out.precision(1); 350 | 351 | out << 0 << '\t' << 0 << '\t' << 0 << '\t' << 0 << endl; 352 | for (size_t i = 0; i < yield_estimates.size(); ++i) 353 | out << (i + 1) * step_size << '\t' << yield_estimates[i] << '\t' 354 | << yield_lower_ci_lognorm[i] << '\t' << yield_upper_ci_lognorm[i] 355 | << endl; 356 | } 357 | 358 | // vals_hist[j] = n_{j} = # (counts = j) 359 | // vals_hist_distinct_counts[k] = kth index j s.t. 
vals_hist[j] > 0 360 | // stores kth index of vals_hist that is positive 361 | // distinct_counts_hist[k] = vals_hist[vals_hist_distinct_counts[k]] 362 | // stores the kth positive value of vals_hist 363 | void 364 | resample_hist(mt19937 &gen, const vector &vals_hist_distinct_counts, 365 | const vector &distinct_counts_hist, 366 | vector &out_hist) { 367 | const size_t hist_size = distinct_counts_hist.size(); 368 | vector sample_distinct_counts_hist(hist_size, 0); 369 | 370 | const uint32_t distinct = 371 | accumulate(cbegin(distinct_counts_hist), cend(distinct_counts_hist), 0.0); 372 | 373 | multinomial(gen, distinct_counts_hist, distinct, sample_distinct_counts_hist); 374 | 375 | out_hist.clear(); 376 | out_hist.resize(vals_hist_distinct_counts.back() + 1, 0.0); 377 | for (size_t i = 0; i < hist_size; i++) 378 | out_hist[vals_hist_distinct_counts[i]] = sample_distinct_counts_hist[i]; 379 | } 380 | 381 | template 382 | T 383 | median_from_sorted_vector(const vector sorted_data, const size_t stride, 384 | const size_t n) { 385 | if (n == 0 || sorted_data.empty()) 386 | return 0.0; 387 | 388 | const size_t lhs = (n - 1) / 2; 389 | const size_t rhs = n / 2; 390 | 391 | if (lhs == rhs) 392 | return sorted_data[lhs * stride]; 393 | 394 | return (sorted_data[lhs * stride] + sorted_data[rhs * stride]) / 2.0; 395 | } 396 | 397 | template 398 | T 399 | quantile_from_sorted_vector(const vector &sorted_data, const size_t stride, 400 | const size_t n, const double f) { 401 | const double index = f * (n - 1); 402 | const size_t lhs = static_cast(index); 403 | const double delta = index - lhs; 404 | 405 | if (n == 0 || sorted_data.empty()) 406 | return 0.0; 407 | 408 | if (lhs == n - 1) 409 | return sorted_data[lhs * stride]; 410 | 411 | return (1 - delta) * sorted_data[lhs * stride] + 412 | delta * sorted_data[(lhs + 1) * stride]; 413 | } 414 | 415 | // Confidence interval stuff 416 | void 417 | median_and_ci(vector estimates, // by val so we can sort them 418 | const double ci_level, double &median_estimate, 419 | double &lower_ci_estimate, double &upper_ci_estimate) { 420 | assert(!estimates.empty()); 421 | 422 | std::sort(begin(estimates), end(estimates)); 423 | 424 | const double alpha = 1.0 - ci_level; 425 | const size_t N = estimates.size(); 426 | 427 | median_estimate = median_from_sorted_vector(estimates, 1, N); 428 | lower_ci_estimate = quantile_from_sorted_vector(estimates, 1, N, alpha / 2); 429 | upper_ci_estimate = 430 | quantile_from_sorted_vector(estimates, 1, N, 1.0 - alpha / 2); 431 | } 432 | -------------------------------------------------------------------------------- /src/common.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 
19 | */ 20 | 21 | #ifndef SRC_COMMON_HPP_ 22 | #define SRC_COMMON_HPP_ 23 | 24 | #include // std::size_t 25 | #include // std::uint64_t 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include // has std::size 32 | 33 | double 34 | GoodToulmin2xExtrap(const std::vector &counts_hist); 35 | 36 | double 37 | interpolate_distinct(const std::vector &hist, const std::size_t N, 38 | const std::size_t S, const std::size_t n); 39 | 40 | bool 41 | extrap_single_estimate(const bool VERBOSE, const bool allow_defects, 42 | const std::vector &hist, std::size_t max_terms, 43 | const int diagonal, const double step_size, 44 | const double max_extrap, 45 | std::vector &yield_estimate); 46 | 47 | void 48 | extrap_bootstrap(const bool VERBOSE, const bool allow_defects, 49 | const std::uint32_t seed, const std::vector &orig_hist, 50 | const std::size_t n_bootstraps, 51 | const std::size_t orig_max_terms, const int diagonal, 52 | const double bin_step_size, const double max_extrap, 53 | const std::size_t max_iter, 54 | std::vector> &bootstrap_estimates); 55 | 56 | void 57 | vector_median_and_ci( 58 | const std::vector> &bootstrap_estimates, 59 | const double ci_level, std::vector &yield_estimates, 60 | std::vector &lower_ci_lognorm, std::vector &upper_ci_lognorm); 61 | 62 | void 63 | write_predicted_complexity_curve( 64 | const std::string &outfile, const double c_level, const double step_size, 65 | const std::vector &yield_estimates, 66 | const std::vector &yield_lower_ci_lognorm, 67 | const std::vector &yield_upper_ci_lognorm); 68 | 69 | template 70 | T 71 | get_counts_from_hist(const std::vector &h) { 72 | T c = 0.0; 73 | for (auto i = 0u; i < std::size(h); ++i) 74 | c += i * h[i]; 75 | return c; 76 | } 77 | 78 | double 79 | factorial(double x); 80 | 81 | void 82 | resample_hist(std::mt19937 &gen, 83 | const std::vector &vals_hist_distinct_counts, 84 | const std::vector &distinct_counts_hist, 85 | std::vector &out_hist); 86 | 87 | void 88 | median_and_ci(std::vector estimates, // by val so we can sort them 89 | const double ci_level, double &median_estimate, 90 | double &lower_ci_estimate, double &upper_ci_estimate); 91 | 92 | template 93 | void 94 | multinomial(std::mt19937 &gen, const std::vector &mult_probs, 95 | uint_type trials, std::vector &result) { 96 | typedef std::binomial_distribution binom_dist; 97 | 98 | result.clear(); 99 | result.resize(std::size(mult_probs)); 100 | 101 | double remaining_prob = 102 | std::accumulate(std::begin(mult_probs), std::end(mult_probs), 0.0); 103 | 104 | auto r = std::begin(result); 105 | auto p = std::begin(mult_probs); 106 | 107 | while (p != std::end(mult_probs)) { // iterate to sample for each category 108 | *r = binom_dist(trials, (*p) / remaining_prob)(gen); // take the sample 109 | 110 | remaining_prob -= *p++; // update remaining probability mass 111 | trials -= *r++; // update remaining trials needed 112 | } 113 | 114 | if (trials > 0) 115 | throw std::runtime_error("multinomial sampling failed"); 116 | } 117 | 118 | template 119 | void 120 | report_histogram(const std::string &outfile, const H &h) { 121 | std::ofstream out(outfile); 122 | if (!out) 123 | throw std::runtime_error("failed to open output file: " + outfile); 124 | for (auto i = 0u; i < std::size(h); ++i) 125 | if (h[i] > 0) 126 | out << i << '\t' << static_cast(h[i]) << '\n'; 127 | } 128 | 129 | #endif // SRC_COMMON_HPP_ 130 | -------------------------------------------------------------------------------- /src/continued_fraction.cpp: 
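// A minimal usage sketch for the ContinuedFraction type implemented in this
// file, mirroring how extrap_single_estimate and extrapolate_curve (in
// common.cpp) drive it; the helper below and its histogram values are
// illustrative assumptions, not part of the preseq sources.

#include "continued_fraction.hpp"

#include <cmath>
#include <cstddef>
#include <vector>

// hist[j] = number of reads observed exactly j times (hypothetical values)
static double
sketch_extrapolation(const std::vector<double> &hist, const double t) {
  // alternating power-series coefficients, as built in extrap_single_estimate
  std::vector<double> ps_coeffs;
  for (std::size_t j = 1; j < hist.size(); ++j)
    ps_coeffs.push_back(hist[j] * std::pow(-1.0, j + 1));

  // diagonal 0; degree equal to the number of coefficients
  const ContinuedFraction cf(ps_coeffs, 0, ps_coeffs.size());

  // t is the relative growth of the sample: t * cf(t) approximates the
  // additional distinct reads gained by sequencing (1 + t) times as much,
  // which the callers add to the number of distinct reads already observed
  return cf.is_valid() ? t * cf(t) : 0.0;
}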
-------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Andrew D. Smith and Timothy Daley 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | */ 19 | 20 | #include "continued_fraction.hpp" 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | using std::fabs; 28 | using std::isfinite; 29 | using std::min; 30 | using std::pow; 31 | using std::vector; 32 | 33 | // ADS: the std::pow function is used frequently to get (-1)^x for 34 | // integer x. This doesn't make sense, and should be replaced at some 35 | // point. 36 | 37 | /* QUOTIENT DIFFERENCE ALGORITHM: compute continued fraction 38 | * coefficients vector for power series coefficients & vector for 39 | * continued fraction coefficients 40 | * 41 | * The negative sign for coefficients in the final loop is because we 42 | * evaluate a0/(1 + a1x/(1 + a2x/... while the algorithm is designed 43 | * for the a0/(1 - a1x/(1 - a2x/... see https://dlmf.nist.gov/3.10 44 | */ 45 | static void 46 | quotdiff_algorithm(const vector &ps_coeffs, vector &cf_coeffs) { 47 | const size_t depth = ps_coeffs.size(); // degree of power series 48 | 49 | // q_table[0] never used, and undefined 50 | vector> q_table(depth, vector(depth + 1, 0.0)); 51 | // q_table[1][j] = ratio of ps coefficients 52 | for (size_t j = 0; j < depth - 1; j++) 53 | q_table[1][j] = ps_coeffs[j + 1] / ps_coeffs[j]; 54 | 55 | // e_table[0] is always 0 56 | vector> e_table(depth, vector(depth + 1, 0.0)); 57 | // e_table[1] follows the general recurrence (same as in loop below) 58 | for (size_t j = 0; j < depth - 1; j++) 59 | e_table[1][j] = q_table[1][j + 1] - q_table[1][j] + e_table[0][j + 1]; 60 | 61 | // using intial values of E(i)(j)'s and Q(i)(j)'s, fill rest of the 62 | // q table and e table 63 | for (size_t i = 2; i < depth; i++) { 64 | for (size_t j = 0; j < depth; j++) 65 | q_table[i][j] = 66 | q_table[i - 1][j + 1] * e_table[i - 1][j + 1] / e_table[i - 1][j]; 67 | 68 | for (size_t j = 0; j < depth; j++) 69 | e_table[i][j] = q_table[i][j + 1] - q_table[i][j] + e_table[i - 1][j + 1]; 70 | } 71 | 72 | cf_coeffs.resize(depth); 73 | // first CT coefficient is first PS coefficient 74 | cf_coeffs[0] = ps_coeffs[0]; 75 | // set remaining CF coefficients from e and q table values 76 | for (size_t i = 1; i < depth; ++i) 77 | cf_coeffs[i] = (i % 2 == 0) ? 
-e_table[i / 2][0] : -q_table[(i + 1) / 2][0]; 78 | } 79 | 80 | /* compute CF coeffs when upper_offset > 0 above the diagonal; this 81 | * means degree of polynomial in numerator of Pade approximant is 82 | * greater than degree of polynomial in the denominator 83 | */ 84 | static void 85 | quotdiff_above_diagonal(const vector &ps_coeffs, const size_t offset, 86 | vector &cf_coeffs, 87 | vector &offset_coeffs) { 88 | // get the high order PS coeffs for approximation by CF 89 | vector high_ps_coeffs(begin(ps_coeffs) + offset, end(ps_coeffs)); 90 | 91 | // use QD algorithm to determine CF coefficients 92 | quotdiff_algorithm(high_ps_coeffs, cf_coeffs); 93 | 94 | // first "offset" coeffs are equal to PS coeffs 95 | offset_coeffs = ps_coeffs; 96 | offset_coeffs.resize(offset); 97 | } 98 | 99 | // calculate CF coeffs when lower_offset > 0 100 | static void 101 | quotdiff_below_diagonal(const vector &ps_coeffs, const size_t offset, 102 | vector &cf_coeffs, 103 | vector &offset_coeffs) { 104 | // need to work with reciprocal series g = 1/f, then invert 105 | vector recip_ps_coeffs(ps_coeffs.size()); 106 | recip_ps_coeffs[0] = 1.0 / ps_coeffs[0]; 107 | for (size_t i = 1; i < ps_coeffs.size(); ++i) { 108 | double x = 0.0; 109 | for (size_t j = 0; j < i; ++j) 110 | x += ps_coeffs[i - j] * recip_ps_coeffs[j]; 111 | 112 | recip_ps_coeffs[i] = -x / ps_coeffs[0]; 113 | } 114 | 115 | // qd to compute cf_coeffs using remaining coeffs 116 | vector high_recip_ps_coeffs(begin(recip_ps_coeffs) + offset, 117 | end(recip_ps_coeffs)); 118 | quotdiff_algorithm(high_recip_ps_coeffs, cf_coeffs); 119 | 120 | // set offset coeffs to 1st "offset" PS coeffs of 1/f (reciprocal) 121 | offset_coeffs = recip_ps_coeffs; 122 | offset_coeffs.resize(offset); 123 | } 124 | 125 | void 126 | truncate_degree(const size_t n_terms, ContinuedFraction &the_cf) { 127 | if (the_cf.degree < n_terms) { 128 | the_cf = ContinuedFraction(); 129 | } 130 | else { 131 | the_cf.ps_coeffs.resize(n_terms); 132 | the_cf.cf_coeffs.resize(n_terms - the_cf.offset_coeffs.size()); 133 | the_cf.degree = n_terms; 134 | } 135 | } 136 | 137 | ContinuedFraction::ContinuedFraction(const vector &ps_cf, const int di, 138 | const size_t dg) : 139 | ps_coeffs(ps_cf), diagonal_idx(di), degree(dg) { 140 | if (diagonal_idx == 0) 141 | quotdiff_algorithm(ps_coeffs, cf_coeffs); 142 | else if (diagonal_idx > 0) 143 | quotdiff_above_diagonal(ps_coeffs, diagonal_idx, cf_coeffs, offset_coeffs); 144 | else // if (cont_frac_estimate.lower_offset > 0) { 145 | quotdiff_below_diagonal(ps_coeffs, -diagonal_idx, cf_coeffs, offset_coeffs); 146 | // NOTE: negative sign "-" (-diagonal_idx > 0) for below diagonal 147 | } 148 | 149 | //////////////////////////////////////////////////////////////////////// 150 | //// FUNCTIONS TO EVALUATE CONTINUED FRACTIONS AT A POINT 151 | 152 | static double 153 | get_rescale_value(const double numerator, const double denominator) { 154 | static const double tolerance = 1e-20; // magic 155 | const double rescale_val = fabs(numerator) + fabs(denominator); 156 | if (rescale_val > 1.0 / tolerance) 157 | return 1.0 / rescale_val; 158 | else if (rescale_val < tolerance) 159 | return 1.0 / rescale_val; 160 | return 1.0; 161 | } 162 | 163 | /* calculate ContinuedFraction approx when there is no offset uses euler's 164 | * recursion 165 | */ 166 | static double 167 | evaluate_on_diagonal(const vector &cf_coeffs, const double val, 168 | const size_t depth) { 169 | // initialize 170 | double current_num = 0.0; 171 | double prev_num1 = cf_coeffs[0]; 172 | 
double prev_num2 = 0.0; 173 | 174 | double current_denom = 0.0; 175 | double prev_denom1 = 1.0; 176 | double prev_denom2 = 1.0; 177 | 178 | for (size_t i = 1; i < min(cf_coeffs.size(), depth); i++) { 179 | // calculate current values 180 | current_num = prev_num1 + cf_coeffs[i] * val * prev_num2; 181 | current_denom = prev_denom1 + cf_coeffs[i] * val * prev_denom2; 182 | 183 | // update previous values 184 | prev_num2 = prev_num1; 185 | prev_num1 = current_num; 186 | 187 | prev_denom2 = prev_denom1; 188 | prev_denom1 = current_denom; 189 | 190 | // now rescale all values 191 | const double rescale_val = get_rescale_value(current_num, current_denom); 192 | 193 | current_num *= rescale_val; 194 | current_denom *= rescale_val; 195 | 196 | prev_num1 *= rescale_val; 197 | prev_num2 *= rescale_val; 198 | 199 | prev_denom1 *= rescale_val; 200 | prev_denom2 *= rescale_val; 201 | } 202 | return current_num / current_denom; 203 | } 204 | 205 | static double 206 | evaluate_power_series(const vector &ps_coeffs, const double val) { 207 | double x = 0.0; 208 | for (size_t i = 0; i < ps_coeffs.size(); i++) 209 | x += ps_coeffs[i] * pow(val, i); 210 | return x; 211 | } 212 | 213 | /* evaluate CF when upper_offset > 0 using Euler's recursion */ 214 | static double 215 | evaluate_above_diagonal(const vector &cf_coeffs, 216 | const vector &offset_coeffs, const double val, 217 | const size_t depth) { 218 | const double cf_part = 219 | evaluate_on_diagonal(cf_coeffs, val, depth - offset_coeffs.size()); 220 | 221 | const double ps_part = evaluate_power_series(offset_coeffs, val); 222 | 223 | return ps_part + pow(val, offset_coeffs.size()) * cf_part; 224 | } 225 | 226 | // calculate ContinuedFraction approx when lower_offdiag > 0 227 | static double 228 | evaluate_below_diagonal(const vector &cf_coeffs, 229 | const vector &offset_coeffs, const double val, 230 | const size_t depth) { 231 | const double cf_part = 232 | evaluate_on_diagonal(cf_coeffs, val, depth - offset_coeffs.size()); 233 | 234 | const double ps_part = evaluate_power_series(offset_coeffs, val); 235 | 236 | // recall that if lower_offset > 0, we are working with 1/f, invert approx 237 | return 1.0 / (ps_part + pow(val, offset_coeffs.size()) * cf_part); 238 | } 239 | 240 | // evaluate CF at a given point 241 | double 242 | ContinuedFraction::operator()(const double val) const { 243 | if (diagonal_idx > 0) 244 | return evaluate_above_diagonal(cf_coeffs, offset_coeffs, val, degree); 245 | else if (diagonal_idx < 0) 246 | return evaluate_below_diagonal(cf_coeffs, offset_coeffs, val, degree); 247 | else 248 | return evaluate_on_diagonal(cf_coeffs, val, degree); 249 | } 250 | 251 | std::ostream & 252 | operator<<(std::ostream &the_stream, const ContinuedFraction &cf) { 253 | using std::ios_base; 254 | using std::setw; 255 | 256 | ios_base::fmtflags orig_flags = the_stream.flags(); 257 | the_stream.setf(ios_base::fixed, ios_base::floatfield); 258 | the_stream.precision(2); 259 | the_stream << "OFFSET_COEFFS" << '\t' << "PS_COEFFS" << '\n'; 260 | const size_t offset = cf.offset_coeffs.size(); 261 | for (size_t i = 0; i < offset; ++i) 262 | the_stream << setw(12) << cf.offset_coeffs[i] << '\t' << setw(12) 263 | << cf.ps_coeffs[i] << '\n'; 264 | the_stream << "CF_COEFFS" << '\n'; 265 | for (size_t i = 0; i < cf.cf_coeffs.size(); ++i) 266 | the_stream << setw(12) << cf.cf_coeffs[i] << '\t' << setw(12) 267 | << cf.ps_coeffs[i + offset] << '\n'; 268 | the_stream.flags(orig_flags); 269 | return the_stream; 270 | } 271 | 272 | // estimate yields by evaluating 
the CF at given points 273 | void 274 | ContinuedFraction::extrapolate_distinct(const double max_value, 275 | const double step_size, 276 | vector &estimates) const { 277 | estimates.clear(); 278 | estimates.push_back(0); 279 | for (double t = step_size; t <= max_value; t += step_size) 280 | estimates.push_back(t * operator()(t)); 281 | } 282 | 283 | //////////////////////////////////////////////////////////////////////// 284 | //////////////// CONTINUED FRACTION APPROXIMATION CLASS BELOW 285 | 286 | typedef ContinuedFractionApproximation CFA; 287 | 288 | const size_t CFA::min_allowed_degree = 4; 289 | const double CFA::search_max_val = 100; 290 | const double CFA::search_step_size = 0.05; 291 | 292 | /* check if a sequence of estimates are "stable": in [0, infty, 293 | * increasing, negative 2nd deriv 294 | */ 295 | bool 296 | check_yield_estimates_stability(const vector &estimates) { 297 | // require estimates are non-negative and finite 298 | for (size_t i = 0; i < estimates.size(); ++i) 299 | if (!std::isfinite(estimates[i]) || estimates[i] < 0.0) 300 | return false; 301 | 302 | // require estimate to be increasing 303 | for (size_t i = 1; i < estimates.size(); ++i) 304 | if (estimates[i] < estimates[i - 1]) 305 | return false; 306 | 307 | // require negative second derivative 308 | for (size_t i = 2; i < estimates.size(); ++i) 309 | if (estimates[i - 1] - estimates[i - 2] < estimates[i] - estimates[i - 1]) 310 | return false; 311 | 312 | return !estimates.empty(); 313 | } 314 | 315 | /* Finds the optimal number of terms (i.e. degree, depth, etc.) of the 316 | * continued fraction by checking for stability of estimates at 317 | * specific points for yield. New way for searching for optimal CF 318 | */ 319 | ContinuedFraction 320 | CFA::optimal_cont_frac_distinct(const vector &counts_hist) const { 321 | // we expect to use an underestimate, but this is dealt with outside 322 | // by ensuring we have an even number of max terms 323 | 324 | if (max_terms >= counts_hist.size()) 325 | return ContinuedFraction(); 326 | 327 | vector ps_coeffs; 328 | for (size_t j = 1; j <= max_terms; j++) 329 | ps_coeffs.push_back(counts_hist[j] * pow(-1.0, j + 1)); 330 | 331 | ContinuedFraction full_cf(ps_coeffs, diagonal_idx, max_terms); 332 | 333 | // if max terms in {3,4,5,6}, check only that degree 334 | if (max_terms >= 3 && max_terms <= 6) { 335 | vector estimates; 336 | full_cf.extrapolate_distinct(search_max_val, search_step_size, estimates); 337 | if (check_yield_estimates_stability(estimates)) 338 | return full_cf; 339 | } 340 | else { 341 | // if max terms >= 7, start at 7 and check increasing cont frac's 342 | for (size_t i = 7 + (max_terms % 2 == 0); i <= max_terms; i += 2) { 343 | ContinuedFraction trunc_cf(full_cf); 344 | truncate_degree(i, trunc_cf); 345 | vector estimates; 346 | trunc_cf.extrapolate_distinct(search_max_val, search_step_size, 347 | estimates); 348 | if (check_yield_estimates_stability(estimates)) 349 | return trunc_cf; 350 | } 351 | } 352 | // no stable continued fraction: return null 353 | return ContinuedFraction(); 354 | } 355 | -------------------------------------------------------------------------------- /src/continued_fraction.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Andrew D. 
Smith and Timothy Daley 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | */ 19 | 20 | #ifndef SRC_CONTINUED_FRACTION_HPP_ 21 | #define SRC_CONTINUED_FRACTION_HPP_ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | struct ContinuedFraction { 30 | // Constructors 31 | ContinuedFraction() : diagonal_idx(0), degree(0ul) {} 32 | ContinuedFraction(const std::vector &ps_cf, const int di, 33 | const size_t dg); 34 | 35 | // Evaluate the continued fraction 36 | double operator()(const double val) const; 37 | 38 | ////////////////////////////////////////// 39 | // Extrapolation functions 40 | 41 | // Evaluate the continued fraction estimating distinct 42 | // along a curve from 0 to max_value 43 | void extrapolate_distinct(const double max_value, const double step_size, 44 | std::vector &estimates) const; 45 | 46 | bool is_valid() const { return !cf_coeffs.empty(); } 47 | 48 | std::vector ps_coeffs; 49 | std::vector cf_coeffs; 50 | std::vector offset_coeffs; 51 | int diagonal_idx; 52 | size_t degree; 53 | }; 54 | 55 | // get continued fraction with lower degree 56 | void 57 | truncate_degree(const size_t truncated_degree, ContinuedFraction &cf); 58 | 59 | std::ostream & 60 | operator<<(std::ostream &out, const ContinuedFraction &cf); 61 | 62 | class ContinuedFractionApproximation { 63 | public: 64 | ContinuedFractionApproximation(const int di, const size_t mt) : 65 | diagonal_idx(di), max_terms(mt) {} 66 | 67 | // find best cont frac approx for estimating distinct 68 | ContinuedFraction 69 | optimal_cont_frac_distinct(const std::vector &counts_hist) const; 70 | 71 | private: 72 | int diagonal_idx; // the diagonal to work with for estimates 73 | size_t max_terms; // the maximum number of terms to try for a CF 74 | 75 | /* note: these never change */ 76 | static const size_t min_allowed_degree; 77 | 78 | // largest value to search for lowerbound and stability 79 | static const double search_max_val; 80 | 81 | // step size for search of lowerbound and stability 82 | static const double search_step_size; 83 | }; 84 | 85 | bool 86 | check_yield_estimates_stability(const std::vector &estimates); 87 | 88 | #endif // SRC_CONTINUED_FRACTION_HPP_ 89 | -------------------------------------------------------------------------------- /src/dnmt_error.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2023 Andrew D. Smith 2 | * 3 | * Authors: Andrew Smith 4 | * 5 | * This program is free software: you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation, either version 3 of the 8 | * License, or (at your option) any later version. 
9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | * General Public License for more details. 14 | */ 15 | 16 | #ifndef SRC_DNMT_ERROR_HPP_ 17 | #define SRC_DNMT_ERROR_HPP_ 18 | 19 | #include // for int64_t 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | struct dnmt_error : public std::exception { 26 | std::int64_t err{}; // error possibly from HTSlib 27 | int the_errno{}; // ERRNO at time of construction 28 | std::string msg; // the message 29 | std::string the_what; // to report 30 | dnmt_error(const std::int64_t err, const std::string &msg) : 31 | err{err}, the_errno{errno}, msg{msg} { 32 | std::ostringstream oss; 33 | // clang-format off 34 | oss << "[error: " << err << "][" << "ERRNO: " << the_errno << "]" 35 | << "[" << strerror(the_errno) << "][" << msg << "]"; 36 | // clang-format on 37 | the_what = oss.str(); 38 | } 39 | explicit dnmt_error(const std::string &_msg) : dnmt_error(0, _msg) {} 40 | const char *what() const noexcept override { return the_what.data(); } 41 | }; 42 | 43 | #endif // SRC_DNMT_ERROR_HPP_ 44 | -------------------------------------------------------------------------------- /src/gc_extrap.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #include "gc_extrap.hpp" 22 | 23 | #include "common.hpp" 24 | #include "load_data_for_complexity.hpp" 25 | 26 | #include 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | using std::cbegin; 39 | using std::cend; 40 | using std::cerr; 41 | using std::endl; 42 | using std::min; 43 | using std::runtime_error; 44 | using std::size_t; 45 | using std::string; 46 | using std::uint32_t; 47 | using std::vector; 48 | 49 | // ADS: functions same, header different (above and this one) 50 | static void 51 | write_predicted_coverage_curve(const string &outfile, const double c_level, 52 | const double base_step_size, 53 | const size_t bin_size, 54 | const vector &cvrg_estimates, 55 | const vector &cvrg_lower_ci_lognorm, 56 | const vector &cvrg_upper_ci_lognorm) { 57 | static constexpr double one_hundred = 100.0; 58 | std::ofstream of; 59 | if (!outfile.empty()) 60 | of.open(outfile); 61 | std::ostream out(outfile.empty() ? 
std::cout.rdbuf() : of.rdbuf()); 62 | 63 | const double percentile = one_hundred * c_level; 64 | // clang-format off 65 | out << "TOTAL_BASES" << '\t' 66 | << "EXPECTED_COVERED_BASES" << '\t' 67 | << "LOWER_" << percentile << "%CI" << '\t' 68 | << "UPPER_" << percentile << "%CI" 69 | << endl; 70 | // clang-format on 71 | 72 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 73 | out.precision(1); 74 | 75 | out << 0 << '\t' << 0 << '\t' << 0 << '\t' << 0 << endl; 76 | for (size_t i = 0; i < cvrg_estimates.size(); ++i) 77 | out << (i + 1) * base_step_size << '\t' << cvrg_estimates[i] * bin_size 78 | << '\t' << cvrg_lower_ci_lognorm[i] * bin_size << '\t' 79 | << cvrg_upper_ci_lognorm[i] * bin_size << endl; 80 | } 81 | 82 | int 83 | gc_extrap_main(const int argc, const char *argv[]) { 84 | try { 85 | const size_t MIN_REQUIRED_COUNTS = 4; 86 | 87 | string outfile; 88 | string histogram_outfile; 89 | 90 | int diagonal = 0; 91 | size_t orig_max_terms = 100; 92 | size_t bin_size = 10; 93 | bool verbose = false; 94 | double base_step_size = 1.0e8; 95 | size_t max_width = 10000; 96 | bool SINGLE_ESTIMATE = false; 97 | double max_extrap = 1.0e12; 98 | size_t n_bootstraps = 100; 99 | uint32_t seed = 408; 100 | bool allow_defects = false; 101 | 102 | bool NO_SEQUENCE = false; 103 | double c_level = 0.95; 104 | #ifdef HAVE_HTSLIB 105 | bool BAM_FORMAT_INPUT = false; 106 | uint32_t n_threads{1}; 107 | #endif 108 | 109 | const string description = R"( 110 | Extrapolate the size of the covered genome by mapped reads. This 111 | approach is described in Daley & Smith (2014). The method is the same 112 | as for lc_extrap: using rational function approximation to a 113 | power-series expansion for the number of "unobserved" bases in the 114 | initial sample. The gc_extrap method is adapted to deal with 115 | individual nucleotides rather than distinct reads. 
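A typical invocation (illustrative; the file names are placeholders)
might look like:

  preseq gc_extrap -o coverage_yield.txt input.mr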
116 | )"; 117 | string program_name = std::filesystem::path(argv[0]).filename(); 118 | program_name += " " + string(argv[1]); 119 | 120 | // ********* GET COMMAND LINE ARGUMENTS FOR GC EXTRAP ********** 121 | OptionParser opt_parse(program_name, description, ""); 122 | opt_parse.add_opt("output", 'o', 123 | "coverage yield output file (default: stdout)", false, 124 | outfile); 125 | opt_parse.add_opt("max_width", 'w', 126 | "max fragment length, " 127 | "set equal to read length for single end reads", 128 | false, max_width); 129 | opt_parse.add_opt("bin_size", 'b', "bin size", false, bin_size); 130 | opt_parse.add_opt("extrap", 'e', "maximum extrapolation in base pairs", 131 | false, max_extrap); 132 | opt_parse.add_opt("step", 's', "step size in bases between extrapolations", 133 | false, base_step_size); 134 | opt_parse.add_opt("bootstraps", 'n', "number of bootstraps", false, 135 | n_bootstraps); 136 | opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, 137 | c_level); 138 | opt_parse.add_opt("terms", 'x', "maximum number of terms", false, 139 | orig_max_terms); 140 | opt_parse.add_opt("verbose", 'v', "print more information", false, verbose); 141 | opt_parse.add_opt("hist-out", '\0', "output histogram to this file", false, 142 | histogram_outfile); 143 | opt_parse.add_opt("bed", 'B', 144 | "input is in bed format without sequence information", 145 | false, NO_SEQUENCE); 146 | opt_parse.add_opt("quick", 'Q', 147 | "quick mode: run gc_extrap without " 148 | "bootstrapping for confidence intervals", 149 | false, SINGLE_ESTIMATE); 150 | opt_parse.add_opt("defects", 'D', 151 | "defects mode to extrapolate without testing for defects", 152 | false, allow_defects); 153 | #ifdef HAVE_HTSLIB 154 | opt_parse.add_opt("bam", '\0', "input is in BAM format", false, 155 | BAM_FORMAT_INPUT); 156 | opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", 157 | false, n_threads); 158 | #endif 159 | opt_parse.add_opt("seed", 'r', "seed for random number generator", false, 160 | seed); 161 | opt_parse.set_show_defaults(); 162 | 163 | vector leftover_args; 164 | opt_parse.parse(argc - 1, argv + 1, leftover_args); 165 | if (argc == 2 || opt_parse.help_requested()) { 166 | cerr << opt_parse.help_message() << endl; 167 | cerr << opt_parse.about_message() << endl; 168 | return EXIT_SUCCESS; 169 | } 170 | if (opt_parse.option_missing()) { 171 | cerr << opt_parse.option_missing_message() << endl; 172 | return EXIT_SUCCESS; 173 | } 174 | if (leftover_args.empty()) { 175 | cerr << opt_parse.help_message() << endl; 176 | return EXIT_SUCCESS; 177 | } 178 | const string infile = leftover_args.front(); 179 | // **************************************************************** 180 | 181 | vector coverage_hist; 182 | size_t n_reads = 0; 183 | if (verbose) 184 | cerr << "LOADING READS" << endl; 185 | 186 | if (NO_SEQUENCE) { 187 | if (verbose) 188 | cerr << "BED FORMAT" << endl; 189 | n_reads = load_coverage_counts_GR(infile, seed, bin_size, max_width, 190 | coverage_hist); 191 | } 192 | #ifdef HAVE_HTSLIB 193 | else if (BAM_FORMAT_INPUT) { 194 | if (verbose) 195 | cerr << "BAM_INPUT" << endl; 196 | n_reads = load_coverage_counts_BAM(n_threads, infile, seed, bin_size, 197 | max_width, coverage_hist); 198 | } 199 | #endif 200 | else { 201 | if (verbose) 202 | cerr << "MAPPED READ FORMAT" << endl; 203 | n_reads = load_coverage_counts_MR(infile, seed, bin_size, max_width, 204 | coverage_hist); 205 | } 206 | 207 | const double total_bins = get_counts_from_hist(coverage_hist); 208 | 209 
| const double distinct_bins = 210 | accumulate(cbegin(coverage_hist), cend(coverage_hist), 0.0); 211 | 212 | const double avg_bins_per_read = total_bins / n_reads; 213 | const double bin_step_size = base_step_size / bin_size; 214 | 215 | const size_t max_observed_count = coverage_hist.size() - 1; 216 | 217 | // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE 218 | size_t first_zero = 1; 219 | while (first_zero < coverage_hist.size() && coverage_hist[first_zero] > 0) 220 | ++first_zero; 221 | 222 | orig_max_terms = min(orig_max_terms, first_zero - 1); 223 | 224 | if (verbose) 225 | cerr << "TOTAL READS = " << n_reads << endl 226 | << "BASE STEP SIZE = " << base_step_size << endl 227 | << "BIN STEP SIZE = " << bin_step_size << endl 228 | << "TOTAL BINS = " << total_bins << endl 229 | << "BINS PER READ = " << avg_bins_per_read << endl 230 | << "DISTINCT BINS = " << distinct_bins << endl 231 | << "TOTAL BASES = " << total_bins * bin_size << endl 232 | << "TOTAL COVERED BASES = " << distinct_bins * bin_size << endl 233 | << "MAX COVERAGE COUNT = " << max_observed_count << endl 234 | << "COUNTS OF 1 = " << coverage_hist[1] << endl; 235 | 236 | if (!histogram_outfile.empty()) 237 | report_histogram(histogram_outfile, coverage_hist); 238 | 239 | // catch if all reads are distinct 240 | if (orig_max_terms < MIN_REQUIRED_COUNTS) 241 | throw runtime_error("max count before zero is les than min required " 242 | "count (4), sample not sufficiently deep or " 243 | "duplicates removed"); 244 | 245 | // check to make sure library is not overly saturated 246 | const double two_fold_extrap = GoodToulmin2xExtrap(coverage_hist); 247 | if (two_fold_extrap < 0.0) 248 | throw runtime_error("Library expected to saturate in doubling of " 249 | "experiment size, unable to extrapolate"); 250 | 251 | if (verbose) 252 | cerr << "[ESTIMATING COVERAGE CURVE]" << endl; 253 | 254 | vector coverage_estimates; 255 | 256 | if (SINGLE_ESTIMATE) { 257 | bool SINGLE_ESTIMATE_SUCCESS = extrap_single_estimate( 258 | verbose, allow_defects, coverage_hist, orig_max_terms, diagonal, 259 | bin_step_size, max_extrap / bin_size, coverage_estimates); 260 | // IF FAILURE, EXIT 261 | if (!SINGLE_ESTIMATE_SUCCESS) 262 | throw runtime_error("SINGLE ESTIMATE FAILED, NEED TO RUN IN " 263 | "FULL MODE FOR ESTIMATES"); 264 | 265 | std::ofstream of; 266 | if (!outfile.empty()) 267 | of.open(outfile); 268 | std::ostream out(outfile.empty() ? 
std::cout.rdbuf() : of.rdbuf()); 269 | 270 | out << "TOTAL_BASES\tEXPECTED_DISTINCT" << endl; 271 | 272 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 273 | out.precision(1); 274 | 275 | out << 0 << '\t' << 0 << endl; 276 | for (size_t i = 0; i < coverage_estimates.size(); ++i) 277 | out << (i + 1) * base_step_size << '\t' 278 | << coverage_estimates[i] * bin_size << endl; 279 | } 280 | else { 281 | if (verbose) 282 | cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl; 283 | 284 | const size_t max_iter = 10 * n_bootstraps; 285 | 286 | vector> bootstrap_estimates; 287 | extrap_bootstrap(verbose, allow_defects, seed, coverage_hist, 288 | n_bootstraps, orig_max_terms, diagonal, bin_step_size, 289 | max_extrap / bin_size, max_iter, bootstrap_estimates); 290 | 291 | if (verbose) 292 | cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl; 293 | vector coverage_upper_ci_lognorm, coverage_lower_ci_lognorm; 294 | vector_median_and_ci(bootstrap_estimates, c_level, coverage_estimates, 295 | coverage_lower_ci_lognorm, 296 | coverage_upper_ci_lognorm); 297 | 298 | if (verbose) 299 | cerr << "[WRITING OUTPUT]" << endl; 300 | 301 | write_predicted_coverage_curve( 302 | outfile, c_level, base_step_size, bin_size, coverage_estimates, 303 | coverage_lower_ci_lognorm, coverage_upper_ci_lognorm); 304 | } 305 | } 306 | catch (const std::exception &e) { 307 | cerr << e.what() << endl; 308 | return EXIT_FAILURE; 309 | } 310 | return EXIT_SUCCESS; 311 | } 312 | -------------------------------------------------------------------------------- /src/gc_extrap.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #ifndef SRC_GC_EXTRAP_HPP_ 22 | #define SRC_GC_EXTRAP_HPP_ 23 | 24 | int 25 | gc_extrap_main(const int argc, const char *argv[]); 26 | 27 | #endif // SRC_GC_EXTRAP_HPP_ 28 | -------------------------------------------------------------------------------- /src/lc_extrap.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #include "lc_extrap.hpp" 22 | 23 | #include "common.hpp" 24 | #include "load_data_for_complexity.hpp" 25 | 26 | #include 27 | 28 | #include 29 | #include // std::size_t 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | using std::begin; 40 | using std::cbegin; 41 | using std::cend; 42 | using std::cerr; 43 | using std::end; 44 | using std::endl; 45 | using std::runtime_error; 46 | using std::size_t; 47 | using std::string; 48 | using std::to_string; 49 | using std::uint32_t; 50 | using std::vector; 51 | 52 | int 53 | lc_extrap_main(const int argc, const char **argv) { 54 | try { 55 | static const size_t min_required_counts = 4; 56 | static const string min_required_counts_error_message = 57 | "max count before zero is less than min required count (" + 58 | to_string(min_required_counts) + ") duplicates removed"; 59 | 60 | string outfile; 61 | string histogram_outfile; 62 | 63 | size_t orig_max_terms = 100; 64 | double max_extrap = 1.0e10; 65 | double step_size = 1e6; 66 | size_t n_bootstraps = 100; 67 | int diagonal = 0; 68 | double c_level = 0.95; 69 | uint32_t seed = 408; 70 | 71 | /* FLAGS */ 72 | bool verbose = false; 73 | bool VALS_INPUT = false; 74 | bool PAIRED_END = false; 75 | bool HIST_INPUT = false; 76 | bool SINGLE_ESTIMATE = false; 77 | bool allow_defects = false; 78 | 79 | #ifdef HAVE_HTSLIB 80 | bool BAM_FORMAT_INPUT = false; 81 | size_t MAX_SEGMENT_LENGTH = 5000; 82 | uint32_t n_threads{1}; 83 | #endif 84 | 85 | const string description = 86 | R"( 87 | Extrapolate the complexity of a library. This is the approach 88 | described in Daley & Smith (2013). The method applies rational 89 | function approximation via continued fractions with the 90 | original goal of estimating the number of distinct reads that a 91 | sequencing library would yield upon deeper sequencing. This 92 | method has been used for many different purposes since then. 
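A typical invocation (illustrative; the file name is a placeholder)
might look like:

  preseq lc_extrap -o future_yield.txt input.bed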
93 | )"; 94 | string program_name = std::filesystem::path(argv[0]).filename(); 95 | program_name += " " + string(argv[1]); 96 | 97 | /********** GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/ 98 | 99 | OptionParser opt_parse(program_name, description, ""); 100 | opt_parse.add_opt("output", 'o', "yield output file (default: stdout)", 101 | false, outfile); 102 | opt_parse.add_opt("extrap", 'e', "maximum extrapolation", false, 103 | max_extrap); 104 | opt_parse.add_opt("step", 's', "extrapolation step size", false, step_size); 105 | opt_parse.add_opt("boots", 'n', "number of bootstraps", false, 106 | n_bootstraps); 107 | opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, 108 | c_level); 109 | opt_parse.add_opt("terms", 'x', "maximum terms in estimator", false, 110 | orig_max_terms); 111 | opt_parse.add_opt("verbose", 'v', "print more info", false, verbose); 112 | #ifdef HAVE_HTSLIB 113 | opt_parse.add_opt("bam", 'B', "input is in BAM format", false, 114 | BAM_FORMAT_INPUT); 115 | opt_parse.add_opt("seg_len", 'l', 116 | "maximum segment length when merging " 117 | "paired end bam reads", 118 | false, MAX_SEGMENT_LENGTH); 119 | opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", 120 | false, n_threads); 121 | #endif 122 | opt_parse.add_opt("pe", 'P', "input is paired end read file", false, 123 | PAIRED_END); 124 | opt_parse.add_opt( 125 | "vals", 'V', "input is a text file containing only the observed counts", 126 | false, VALS_INPUT); 127 | opt_parse.add_opt("hist", 'H', 128 | "input is a text file containing the observed histogram", 129 | false, HIST_INPUT); 130 | opt_parse.add_opt("hist-out", '\0', 131 | "output histogram to this file (for non-hist input)", 132 | false, histogram_outfile); 133 | opt_parse.add_opt("quick", 'Q', 134 | "quick mode (no bootstraps) for confidence intervals", 135 | false, SINGLE_ESTIMATE); 136 | opt_parse.add_opt("defects", 'D', "no testing for defects", false, 137 | allow_defects); 138 | opt_parse.add_opt("seed", 'r', "seed for random number generator", false, 139 | seed); 140 | opt_parse.set_show_defaults(); 141 | vector leftover_args; 142 | opt_parse.parse(argc - 1, argv + 1, leftover_args); 143 | if (argc == 2 || opt_parse.help_requested()) { 144 | cerr << opt_parse.help_message() << endl; 145 | cerr << opt_parse.about_message() << endl; 146 | return EXIT_SUCCESS; 147 | } 148 | if (opt_parse.option_missing()) { 149 | cerr << opt_parse.option_missing_message() << endl; 150 | return EXIT_SUCCESS; 151 | } 152 | if (leftover_args.empty()) { 153 | cerr << opt_parse.help_message() << endl; 154 | return EXIT_SUCCESS; 155 | } 156 | const string input_file_name = leftover_args.front(); 157 | /******************************************************************/ 158 | 159 | vector counts_hist; 160 | size_t n_reads = 0; 161 | 162 | /************ loading input ***************************************/ 163 | if (HIST_INPUT) { 164 | if (verbose) 165 | cerr << "HIST_INPUT" << endl; 166 | n_reads = load_histogram(input_file_name, counts_hist); 167 | } 168 | else if (VALS_INPUT) { 169 | if (verbose) 170 | cerr << "VALS_INPUT" << endl; 171 | n_reads = load_counts(input_file_name, counts_hist); 172 | } 173 | #ifdef HAVE_HTSLIB 174 | else if (BAM_FORMAT_INPUT) { 175 | if (PAIRED_END) { 176 | if (verbose) 177 | cerr << "PAIRED_END_BAM_INPUT" << endl; 178 | n_reads = load_counts_BAM_pe(n_threads, input_file_name, counts_hist); 179 | } 180 | else { // single end 181 | if (verbose) 182 | cerr << "BAM_INPUT" << endl; 183 | n_reads = 
load_counts_BAM_se(n_threads, input_file_name, counts_hist); 184 | } 185 | } 186 | #endif 187 | else if (PAIRED_END) { 188 | if (verbose) 189 | cerr << "PAIRED_END_BED_INPUT" << endl; 190 | n_reads = load_counts_BED_pe(input_file_name, counts_hist); 191 | } 192 | else { // default is single end bed file 193 | if (verbose) 194 | cerr << "BED_INPUT" << endl; 195 | n_reads = load_counts_BED_se(input_file_name, counts_hist); 196 | } 197 | /************ done loading input **********************************/ 198 | 199 | const size_t max_observed_count = counts_hist.size() - 1; 200 | const double distinct_reads = 201 | std::accumulate(cbegin(counts_hist), cend(counts_hist), 0.0); 202 | 203 | // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE 204 | size_t first_zero = 1; 205 | while (first_zero < counts_hist.size() && counts_hist[first_zero] > 0) 206 | ++first_zero; 207 | 208 | // make sure the max terms is at most one less than the first zero 209 | orig_max_terms = std::min(orig_max_terms, first_zero - 1); 210 | orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1); 211 | 212 | const size_t distinct_counts = 213 | std::count_if(cbegin(counts_hist), cend(counts_hist), 214 | [](const double x) { return x > 0.0; }); 215 | 216 | if (verbose) 217 | cerr << "TOTAL READS = " << n_reads << endl 218 | << "DISTINCT READS = " << distinct_reads << endl 219 | << "DISTINCT COUNTS = " << distinct_counts << endl 220 | << "MAX COUNT = " << max_observed_count << endl 221 | << "COUNTS OF 1 = " << counts_hist[1] << endl 222 | << "MAX TERMS = " << orig_max_terms << endl; 223 | 224 | if (!histogram_outfile.empty()) 225 | report_histogram(histogram_outfile, counts_hist); 226 | 227 | // check to make sure library is not overly saturated 228 | const double two_fold_extrap = GoodToulmin2xExtrap(counts_hist); 229 | if (two_fold_extrap < 0.0) 230 | throw runtime_error("Saturation expected at double initial sample size. " 231 | "Unable to extrapolate."); 232 | 233 | // check that min required count is satisfied 234 | if (orig_max_terms < min_required_counts) 235 | throw runtime_error(min_required_counts_error_message); 236 | 237 | if (verbose) 238 | cerr << "[ESTIMATING YIELD CURVE]" << endl; 239 | vector yield_estimates; 240 | 241 | if (SINGLE_ESTIMATE) { 242 | const bool single_estimate_success = extrap_single_estimate( 243 | verbose, allow_defects, counts_hist, orig_max_terms, diagonal, 244 | step_size, max_extrap, yield_estimates); 245 | // exit on failure 246 | if (!single_estimate_success) 247 | throw runtime_error( 248 | "single estimate failed, run full mode for estimates"); 249 | 250 | std::ofstream of; 251 | if (!outfile.empty()) 252 | of.open(outfile); 253 | std::ostream out(outfile.empty() ? 
std::cout.rdbuf() : of.rdbuf()); 254 | 255 | out << "TOTAL_READS\tEXPECTED_DISTINCT" << endl; 256 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 257 | out.precision(1); 258 | 259 | out << 0 << '\t' << 0 << endl; 260 | for (size_t i = 0; i < yield_estimates.size(); ++i) 261 | out << (i + 1) * step_size << '\t' << yield_estimates[i] << endl; 262 | } 263 | else { 264 | if (verbose) 265 | cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl; 266 | 267 | const size_t max_iter = 100 * n_bootstraps; 268 | 269 | vector> bootstrap_estimates; 270 | extrap_bootstrap(verbose, allow_defects, seed, counts_hist, n_bootstraps, 271 | orig_max_terms, diagonal, step_size, max_extrap, 272 | max_iter, bootstrap_estimates); 273 | 274 | if (verbose) 275 | cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl; 276 | // yield ci 277 | vector yield_upper_ci_lognorm, yield_lower_ci_lognorm; 278 | vector_median_and_ci(bootstrap_estimates, c_level, yield_estimates, 279 | yield_lower_ci_lognorm, yield_upper_ci_lognorm); 280 | 281 | if (verbose) 282 | cerr << "[WRITING OUTPUT]" << endl; 283 | 284 | write_predicted_complexity_curve(outfile, c_level, step_size, 285 | yield_estimates, yield_lower_ci_lognorm, 286 | yield_upper_ci_lognorm); 287 | } 288 | } 289 | catch (const std::exception &e) { 290 | cerr << e.what() << endl; 291 | return EXIT_FAILURE; 292 | } 293 | return EXIT_SUCCESS; 294 | } 295 | -------------------------------------------------------------------------------- /src/lc_extrap.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #ifndef SRC_LC_EXTRAP_HPP_ 22 | #define SRC_LC_EXTRAP_HPP_ 23 | 24 | int 25 | lc_extrap_main(const int argc, const char *argv[]); 26 | 27 | #endif // SRC_LC_EXTRAP_HPP_ 28 | -------------------------------------------------------------------------------- /src/load_data_for_complexity.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2014 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Andrew D. Smith and Timothy Daley 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | */ 19 | 20 | #ifndef SRC_LOAD_DATA_FOR_COMPLEXITY_HPP_ 21 | #define SRC_LOAD_DATA_FOR_COMPLEXITY_HPP_ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | std::size_t 29 | load_coverage_counts_MR(const std::string &input_file_name, 30 | const std::uint32_t seed, const std::size_t bin_size, 31 | const std::size_t max_width, 32 | std::vector &coverage_hist); 33 | 34 | std::size_t 35 | load_coverage_counts_GR(const std::string &input_file_name, 36 | const std::uint32_t seed, const std::size_t bin_size, 37 | const std::size_t max_width, 38 | std::vector &coverage_hist); 39 | 40 | std::size_t 41 | load_histogram(const std::string &filename, std::vector &counts_hist); 42 | 43 | std::size_t 44 | load_counts(const std::string &input_file_name, 45 | std::vector &counts_hist); 46 | 47 | std::size_t 48 | load_counts_BED_pe(const std::string &input_file_name, 49 | std::vector &counts_hist); 50 | 51 | std::size_t 52 | load_counts_BED_se(const std::string &input_file_name, 53 | std::vector &counts_hist); 54 | 55 | #ifdef HAVE_HTSLIB 56 | std::size_t 57 | load_counts_BAM_pe(const std::uint32_t n_threads, 58 | const std::string &input_file_name, 59 | std::vector &counts_hist); 60 | 61 | std::size_t 62 | load_counts_BAM_se(const std::uint32_t n_threads, 63 | const std::string &input_file_name, 64 | std::vector &counts_hist); 65 | 66 | std::size_t 67 | load_coverage_counts_BAM(const std::uint32_t n_threads, 68 | const std::string &input_file_name, 69 | const std::uint32_t seed, const std::size_t bin_size, 70 | const std::size_t max_width, 71 | std::vector &coverage_hist); 72 | 73 | #endif // HAVE_HTSLIB 74 | 75 | #endif // SRC_LOAD_DATA_FOR_COMPLEXITY_HPP_ 76 | -------------------------------------------------------------------------------- /src/moment_sequence.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2015 2 | * University of Southern California and 3 | * Andrew D. Smith and Timothy Daley 4 | * 5 | * Authors: Andrew D. Smith and Timothy Daley 6 | * 7 | * This program is free software: you can redistribute it and/or 8 | * modify it under the terms of the GNU General Public License as 9 | * published by the Free Software Foundation, either version 3 of the 10 | * License, or (at your option) any later version. 11 | * 12 | * This program is distributed in the hope that it will be useful, but 13 | * WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | * General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU General Public License 18 | * along with this program. If not, see 19 | * . 
20 | */ 21 | 22 | #include "moment_sequence.hpp" 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include // std::swap 33 | #include 34 | 35 | using std::begin; 36 | using std::cbegin; 37 | using std::cend; 38 | using std::cerr; 39 | using std::endl; 40 | using std::find_if; 41 | using std::isfinite; 42 | using std::isinf; 43 | using std::max; 44 | using std::setprecision; 45 | using std::string; 46 | using std::swap; 47 | using std::transform; 48 | using std::vector; 49 | 50 | void 51 | LU_decomp(vector> &A, vector &P) { 52 | const size_t N = A.size(); 53 | double absA{}; 54 | size_t i, k; 55 | 56 | P.clear(); 57 | for (size_t x = 0; x <= N; x++) 58 | P.push_back(x); 59 | 60 | for (i = 0; i < N; i++) { 61 | double maxA = 0.0; 62 | size_t imax = i; 63 | 64 | for (k = i; k < N; k++) 65 | if ((absA = fabs(A[k][i])) > maxA) { 66 | maxA = absA; 67 | imax = k; 68 | } 69 | 70 | if (imax != i) { 71 | // pivoting P 72 | size_t j = P[i]; 73 | P[i] = P[imax]; 74 | P[imax] = j; 75 | 76 | // pivoting rows of A 77 | vector ptr(A[i]); 78 | A[i] = A[imax]; 79 | A[imax] = ptr; 80 | 81 | // counting pivots starting from N (for determinant) 82 | P[N]++; 83 | } 84 | 85 | for (size_t j = i + 1; j < N; j++) { 86 | A[j][i] /= A[i][i]; 87 | 88 | for (k = i + 1; k < N; k++) 89 | A[j][k] -= A[j][i] * A[i][k]; 90 | } 91 | } 92 | } 93 | 94 | double 95 | LU_determinant(const vector> &A, const vector &P) { 96 | const size_t N = A.size(); 97 | 98 | double det = A[0][0]; 99 | for (size_t i = 1; i < N; ++i) 100 | det *= A[i][i]; 101 | 102 | if ((P[N] - N) % 2 == 0) 103 | return det; 104 | 105 | return -det; 106 | } 107 | 108 | ///////////////////////////////////////////////////// 109 | // test Hankel moment matrix 110 | // ensure moment sequence is positive definite 111 | // truncate moment sequence to ensure pos def 112 | size_t 113 | ensure_pos_def_mom_seq(vector &moments, const double tolerance, 114 | const bool VERBOSE) { 115 | const size_t min_hankel_dim = 1; 116 | size_t hankel_dim = 2; 117 | if (moments.size() < 2 * hankel_dim) { 118 | if (VERBOSE) 119 | cerr << "too few moments" << endl; 120 | return min_hankel_dim; 121 | } 122 | 123 | while (2 * hankel_dim - 1 < moments.size()) { 124 | vector> hankel_mat(hankel_dim, 125 | vector(hankel_dim, 0.0)); 126 | for (size_t c_idx = 0; c_idx < hankel_dim; c_idx++) 127 | for (size_t r_idx = 0; r_idx < hankel_dim; r_idx++) 128 | hankel_mat[c_idx][r_idx] = moments[c_idx + r_idx]; 129 | 130 | vector perm; 131 | LU_decomp(hankel_mat, perm); 132 | const double hankel_mat_det = LU_determinant(hankel_mat, perm); 133 | 134 | vector> shift_hankel_matrix(hankel_dim, 135 | vector(hankel_dim, 0.0)); 136 | for (size_t c_idx = 0; c_idx < hankel_dim; c_idx++) 137 | for (size_t r_idx = 0; r_idx < hankel_dim; r_idx++) 138 | shift_hankel_matrix[c_idx][r_idx] = moments[c_idx + r_idx + 1]; 139 | 140 | vector s_perm; 141 | LU_decomp(shift_hankel_matrix, s_perm); 142 | const double shift_hankel_mat_det = 143 | LU_determinant(shift_hankel_matrix, s_perm); 144 | 145 | if (VERBOSE) { 146 | cerr << "dim" << '\t' << "hankel_det" << '\t' << "shifted_hankel_det" 147 | << endl; 148 | cerr << hankel_dim << '\t' << hankel_mat_det << '\t' 149 | << shift_hankel_mat_det << endl; 150 | } 151 | 152 | if (hankel_mat_det > tolerance && shift_hankel_mat_det > tolerance) { 153 | hankel_dim++; 154 | } 155 | else { 156 | hankel_dim--; 157 | moments.resize(2 * hankel_dim); 158 | return hankel_dim; 159 | } 160 | } 161 | 162 | return 
max(hankel_dim - 1, min_hankel_dim); 163 | } 164 | 165 | ///////////////////////////////////////////////////// 166 | // 3 term relations 167 | 168 | // check 3 term recurrence to avoid non-positive elements 169 | // truncate if non-positive element found 170 | static void 171 | check_three_term_relation(vector &a, vector &b) { 172 | // abort if first entry is zero or smaller 173 | if (a[0] <= 0.0) { 174 | a.clear(); 175 | b.clear(); 176 | } 177 | 178 | for (size_t i = 0; i < b.size(); i++) 179 | // ADS: some strange logic here 180 | if (b[i] <= 0.0 || !isfinite(b[i]) || a[i + 1] <= 0.0 || 181 | !isfinite(a[i + 1])) { 182 | b.resize(i); 183 | a.resize(i + 1); 184 | break; 185 | } 186 | } 187 | 188 | // check the moment sequence to avoid non-positive elements and 189 | // truncate at first non-positive element if found 190 | static void 191 | check_moment_sequence(vector &obs_moms) { 192 | if (obs_moms[0] <= 0.0 || !isfinite(obs_moms[0])) 193 | obs_moms.clear(); 194 | 195 | for (size_t i = 1; i < obs_moms.size(); i++) { 196 | if (obs_moms[i] <= 0.0 || !isfinite(obs_moms[i])) { 197 | obs_moms.resize(i + 1); 198 | break; 199 | } 200 | } 201 | } 202 | 203 | void 204 | MomentSequence::unmodified_Chebyshev() { 205 | const size_t n_points = static_cast(floor(moments.size() / 2)); 206 | vector a(n_points, 0.0); 207 | vector b(n_points - 1, 0.0); 208 | 209 | vector> sigma(2 * n_points, vector(2 * n_points, 0.0)); 210 | // initialization 211 | a[0] = moments[1] / moments[0]; 212 | // sigma[-1][l] = 0 213 | for (size_t l = 0; l < 2 * n_points; l++) 214 | sigma[0][l] = moments[l]; 215 | 216 | for (size_t k = 1; k <= n_points; k++) { 217 | for (size_t l = k; l < 2 * n_points - k; l++) { 218 | sigma[k][l] = sigma[k - 1][l + 1] - a[k - 1] * sigma[k - 1][l]; 219 | if (k > 1) 220 | sigma[k][l] -= b[k - 2] * sigma[k - 2][l]; 221 | } 222 | if (k != n_points) { 223 | a[k] = 224 | sigma[k][k + 1] / sigma[k][k] - sigma[k - 1][k] / sigma[k - 1][k - 1]; 225 | b[k - 1] = sigma[k][k] / sigma[k - 1][k - 1]; 226 | } 227 | } 228 | 229 | alpha = a; 230 | beta = b; 231 | } 232 | 233 | // un-normalized 3 term recurrence 234 | void 235 | MomentSequence::full_3term_recurrence(vector &full_alpha, 236 | vector &full_beta) { 237 | const size_t n_points = std::size(moments) / 2; 238 | vector a(n_points, 0.0); 239 | vector b(n_points - 1, 0.0); 240 | 241 | vector> sigma(2 * n_points, vector(2 * n_points, 0.0)); 242 | // initialization 243 | a[0] = moments[1] / moments[0]; 244 | // sigma[-1][l] = 0 245 | for (size_t l = 0; l < 2 * n_points; l++) 246 | sigma[0][l] = moments[l]; 247 | 248 | for (size_t k = 1; k <= n_points; k++) { 249 | for (size_t l = k; l < 2 * n_points - k; l++) { 250 | sigma[k][l] = sigma[k - 1][l + 1] - a[k - 1] * sigma[k - 1][l]; 251 | if (k > 1) 252 | sigma[k][l] -= b[k - 2] * sigma[k - 2][l]; 253 | } 254 | if (k != n_points) { 255 | a[k] = 256 | sigma[k][k + 1] / sigma[k][k] - sigma[k - 1][k] / sigma[k - 1][k - 1]; 257 | b[k - 1] = sigma[k][k] / sigma[k - 1][k - 1]; 258 | } 259 | } 260 | 261 | full_alpha.swap(a); 262 | full_beta.swap(b); 263 | } 264 | 265 | //////////////////////////////////////////////////// 266 | // Constructor 267 | 268 | MomentSequence::MomentSequence(const vector &obs_moms) : 269 | moments(obs_moms) { 270 | vector holding_moms(moments); 271 | // make sure the moments are all positive 272 | check_moment_sequence(holding_moms); 273 | moments = holding_moms; 274 | 275 | // calculate 3-term recurrence 276 | unmodified_Chebyshev(); 277 | } 278 | 279 | 
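// For intuition about the moment checks in this file: a moment
// sequence coming from a distribution supported on only k points
// stops being strictly positive definite beyond dimension k, which
// is what ensure_pos_def_mom_seq() detects through the Hankel
// determinants. Worked example: equal mass on {1, 2} has moments
// m = (1, 1.5, 2.5, 4.5, 8.5, ...), so
//   det [[1, 1.5], [1.5, 2.5]] = 0.25 and the shifted
//   det [[1.5, 2.5], [2.5, 4.5]] = 0.5 are both positive, while
//   det [[1, 1.5, 2.5], [1.5, 2.5, 4.5], [2.5, 4.5, 8.5]] = 0;
// with a small positive tolerance the sequence would therefore be
// truncated at hankel_dim = 2 (moments resized to 4 entries).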
///////////////////////////////////////////////////// 280 | // Quadrature Methods 281 | 282 | // one iteration of QR: 283 | // following eq's 3.3 of Golub & Welsh 284 | // one iteration is Z_N-1*Z_N-2*...*Z_1*X*Z_1*...*Z_N-1 285 | // Z_j is givens matrix to zero out the j+1,j'th element of X 286 | static void 287 | QRiteration(vector &alpha, vector &beta, 288 | vector &weights) { 289 | // initialize variables 290 | vector sin_theta(alpha.size(), 0.0); 291 | vector cos_theta(alpha.size(), 0.0); 292 | 293 | vector a(alpha.size(), 0.0); 294 | vector a_bar(alpha.size(), 0.0); 295 | a_bar[0] = alpha[0]; 296 | 297 | vector b(beta); 298 | vector b_bar(alpha.size(), 0.0); 299 | b_bar[0] = alpha[0]; 300 | vector b_tilde(alpha.size(), 0.0); 301 | b_tilde[0] = beta[0]; 302 | 303 | vector d(alpha.size(), 0.0); 304 | d[0] = beta[0]; 305 | 306 | vector z(weights); 307 | vector z_bar(weights.size(), 0.0); 308 | z_bar[0] = z[0]; 309 | 310 | for (size_t j = 0; j < alpha.size() - 1; j++) { 311 | // for d and b_bar, j here is j-1 in G&W 312 | if (d[j] == 0.0 && b_bar[j] == 0.0) { 313 | sin_theta[j] = 0.0; 314 | cos_theta[j] = 1.0; 315 | } 316 | else { 317 | sin_theta[j] = d[j] / sqrt(d[j] * d[j] + b_bar[j] * b_bar[j]); 318 | cos_theta[j] = b_bar[j] / sqrt(d[j] * d[j] + b_bar[j] * b_bar[j]); 319 | } 320 | 321 | a[j] = (a_bar[j] * cos_theta[j] * cos_theta[j] + 322 | 2 * b_tilde[j] * cos_theta[j] * sin_theta[j] + 323 | alpha[j + 1] * sin_theta[j] * sin_theta[j]); 324 | 325 | a_bar[j + 1] = (a_bar[j] * sin_theta[j] * sin_theta[j] - 326 | 2 * b_tilde[j] * cos_theta[j] * sin_theta[j] + 327 | alpha[j + 1] * cos_theta[j] * cos_theta[j]); 328 | 329 | if (j != 0) 330 | b[j - 1] = sqrt(d[j] * d[j] + b_bar[j] * b_bar[j]); 331 | 332 | b_bar[j + 1] = ((a_bar[j] - alpha[j + 1]) * sin_theta[j] * cos_theta[j] + 333 | b_tilde[j] * (sin_theta[j] * sin_theta[j] - 334 | cos_theta[j] * cos_theta[j])); 335 | 336 | b_tilde[j + 1] = -beta[j + 1] * cos_theta[j]; 337 | 338 | d[j + 1] = beta[j + 1] * sin_theta[j]; 339 | 340 | z[j] = z_bar[j] * cos_theta[j] + weights[j + 1] * sin_theta[j]; 341 | 342 | z_bar[j + 1] = z_bar[j] * sin_theta[j] - weights[j + 1] * cos_theta[j]; 343 | } 344 | 345 | // last entries set equal to final "holding" values 346 | a.back() = a_bar.back(); 347 | b.back() = b_bar.back(); 348 | z.back() = z_bar.back(); 349 | 350 | swap(alpha, a); 351 | swap(beta, b); 352 | swap(weights, z); 353 | } 354 | 355 | static bool 356 | check_positivity(const vector &v) { 357 | const auto non_pos = [](const double x) { return x <= 0.0 || isinf(x); }; 358 | return find_if(cbegin(v), cend(v), non_pos) == cend(v); 359 | } 360 | 361 | bool 362 | MomentSequence::Lower_quadrature_rules(const size_t n_points, const double tol, 363 | const size_t max_iter, 364 | vector &points, 365 | vector &weights) { 366 | // make sure that points.size() will be less than n_points 367 | vector a(alpha); 368 | a.resize((n_points < alpha.size()) ? n_points : alpha.size()); 369 | vector b(beta); 370 | b.resize((n_points - 1 < beta.size()) ? 
n_points - 1 : beta.size()); 371 | 372 | check_three_term_relation(a, b); 373 | 374 | // See Gautschi pgs 10-13, 375 | // the nu here is the square of the off-diagonal 376 | // of the Jacobi matrix 377 | for (size_t i = 0; i < b.size(); i++) 378 | b[i] = sqrt(b[i]); 379 | 380 | vector eigenvec(a.size(), 0.0); 381 | eigenvec[0] = 1.0; 382 | vector eigenvals(a); 383 | vector qr_beta(b); 384 | 385 | // in QR, off-diagonals go to zero use off diags for convergence 386 | double error_sum = 0.0; 387 | for (size_t i = 0; i < qr_beta.size(); i++) 388 | error_sum += fabs(qr_beta[i]); 389 | 390 | size_t iter = 0; 391 | while (iter < max_iter && error_sum > tol) { 392 | QRiteration(eigenvals, qr_beta, eigenvec); 393 | 394 | error_sum = 0.0; 395 | for (size_t i = 0; i < qr_beta.size(); i++) 396 | error_sum += fabs(qr_beta[i]); 397 | iter++; 398 | } 399 | 400 | // eigenvalues are on diagonal of J 401 | const bool points_are_positive = check_positivity(eigenvals); 402 | if (points_are_positive) { 403 | swap(points, eigenvals); 404 | swap(weights, eigenvec); 405 | } 406 | 407 | // square entries in the weights vector 408 | transform(cbegin(weights), cend(weights), begin(weights), 409 | [](const double x) { return x * x; }); 410 | 411 | return points_are_positive; 412 | } 413 | -------------------------------------------------------------------------------- /src/moment_sequence.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Andrew D. Smith and Timothy Daley 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | */ 19 | 20 | #ifndef SRC_MOMENT_SEQUENCE_HPP_ 21 | #define SRC_MOMENT_SEQUENCE_HPP_ 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | // test Hankel moment matrix to ensure the moment sequence 28 | // is positive definite 29 | std::size_t 30 | ensure_pos_def_mom_seq(std::vector &moments, const double tolerance, 31 | const bool VERBOSE); 32 | 33 | struct MomentSequence { 34 | MomentSequence() {} 35 | explicit MomentSequence(const std::vector &obs_moms); 36 | 37 | MomentSequence(const std::vector &a, const std::vector &b) : 38 | alpha(a), beta(b) {} 39 | 40 | // Estimate 3-term recurrence 41 | // these will be removed from the header when they are tested 42 | void unmodified_Chebyshev(); 43 | 44 | void full_3term_recurrence(std::vector &full_alpha, 45 | std::vector &full_beta); 46 | 47 | // quadrature rules using QR on Jacobi matrix 48 | bool Lower_quadrature_rules(const std::size_t n_points, 49 | const double tolerance, 50 | const std::size_t max_iter, 51 | std::vector &points, 52 | std::vector &weights); 53 | 54 | std::vector moments; 55 | // 3-term recurrence 56 | std::vector alpha; 57 | std::vector beta; 58 | }; 59 | 60 | #endif // SRC_MOMENT_SEQUENCE_HPP_ 61 | -------------------------------------------------------------------------------- /src/pop_size.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 
19 | */ 20 | 21 | #include "pop_size.hpp" 22 | 23 | #include "common.hpp" 24 | #include "load_data_for_complexity.hpp" 25 | 26 | #include 27 | 28 | #include 29 | #include // std::size_t 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | using std::cbegin; 39 | using std::cend; 40 | using std::cerr; 41 | using std::count_if; 42 | using std::endl; 43 | using std::min; 44 | using std::runtime_error; 45 | using std::size_t; 46 | using std::string; 47 | using std::to_string; 48 | using std::uint32_t; 49 | using std::vector; 50 | 51 | int 52 | pop_size_main(const int argc, const char *argv[]) { 53 | try { 54 | static const size_t min_required_counts = 4; 55 | static const string min_required_counts_error_message = 56 | "max count before zero is less than min required count (" + 57 | to_string(min_required_counts) + ") duplicates removed"; 58 | 59 | string outfile; 60 | string histogram_outfile; 61 | 62 | size_t orig_max_terms = 100; 63 | double max_extrap = 0.0; 64 | double step_size = 0.0; 65 | size_t n_desired_steps = 50; 66 | size_t n_bootstraps = 100; 67 | int diagonal = 0; 68 | double c_level = 0.95; 69 | uint32_t seed = 408; 70 | 71 | /* FLAGS */ 72 | bool verbose = false; 73 | bool VALS_INPUT = false; 74 | bool PAIRED_END = false; 75 | bool HIST_INPUT = false; 76 | bool SINGLE_ESTIMATE = false; 77 | bool allow_defects = false; 78 | 79 | #ifdef HAVE_HTSLIB 80 | bool BAM_FORMAT_INPUT = false; 81 | size_t MAX_SEGMENT_LENGTH = 5000; 82 | uint32_t n_threads{1}; 83 | #endif 84 | 85 | const string description = R"( 86 | Estimate the total population size using the approach described in 87 | Daley & Smith (2013), extrapolating to very long range. Default 88 | parameters assume that the initial sample represents at least 1e-9 of 89 | the population, which is sufficient for every example application we 90 | have seen. 
91 | )"; 92 | string program_name = std::filesystem::path(argv[0]).filename(); 93 | program_name += " " + string(argv[1]); 94 | 95 | /********** GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/ 96 | 97 | OptionParser opt_parse(program_name, description, ""); 98 | opt_parse.add_opt("output", 'o', "yield output file (default: stdout)", 99 | false, outfile); 100 | opt_parse.add_opt("extrap", 'e', "maximum extrapolation", false, 101 | max_extrap); 102 | opt_parse.add_opt("steps", 's', "number of steps", false, n_desired_steps); 103 | opt_parse.add_opt("boots", 'n', "number of bootstraps", false, 104 | n_bootstraps); 105 | opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, 106 | c_level); 107 | opt_parse.add_opt("terms", 'x', "maximum terms in estimator", false, 108 | orig_max_terms); 109 | opt_parse.add_opt("verbose", 'v', "print more info", false, verbose); 110 | #ifdef HAVE_HTSLIB 111 | opt_parse.add_opt("bam", 'B', "input is in BAM format", false, 112 | BAM_FORMAT_INPUT); 113 | opt_parse.add_opt("seg_len", 'l', 114 | "maximum segment length when merging " 115 | "paired end bam reads", 116 | false, MAX_SEGMENT_LENGTH); 117 | opt_parse.add_opt("threads", 't', "number of threads for decompressing BAM", 118 | false, n_threads); 119 | #endif 120 | opt_parse.add_opt("pe", 'P', "input is paired end read file", false, 121 | PAIRED_END); 122 | opt_parse.add_opt( 123 | "vals", 'V', "input is a text file containing only the observed counts", 124 | false, VALS_INPUT); 125 | opt_parse.add_opt("hist", 'H', 126 | "input is a text file containing the observed histogram", 127 | false, HIST_INPUT); 128 | opt_parse.add_opt("hist-out", '\0', 129 | "output histogram to this file (for non-hist input)", 130 | false, histogram_outfile); 131 | opt_parse.add_opt("quick", 'Q', 132 | "quick mode (no bootstraps) for confidence intervals", 133 | false, SINGLE_ESTIMATE); 134 | opt_parse.add_opt("defects", 'D', "no testing for defects", false, 135 | allow_defects); 136 | opt_parse.add_opt("seed", 'r', "seed for random number generator", false, 137 | seed); 138 | opt_parse.set_show_defaults(); 139 | vector leftover_args; 140 | // ADS: suspect bug below; "-about" isn't working. 
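// Note on the parse call below: it is given argc - 1 and argv + 1 so
// that option parsing starts from the subcommand's own arguments
// rather than the leading program path. A typical invocation
// (hypothetical file names) would be
//   preseq pop_size -o pop_size_output.txt -H input.hist
// or, when built with HTSlib, reading directly from a BAM file:
//   preseq pop_size -B -P -o pop_size_output.txt input.bam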
141 | opt_parse.parse(argc - 1, argv + 1, leftover_args); 142 | if (argc == 2 || opt_parse.help_requested()) { 143 | cerr << opt_parse.help_message() << endl; 144 | cerr << opt_parse.about_message() << endl; 145 | return EXIT_SUCCESS; 146 | } 147 | if (opt_parse.option_missing()) { 148 | cerr << opt_parse.option_missing_message() << endl; 149 | return EXIT_SUCCESS; 150 | } 151 | if (leftover_args.empty()) { 152 | cerr << opt_parse.help_message() << endl; 153 | return EXIT_SUCCESS; 154 | } 155 | const string input_file_name = leftover_args.front(); 156 | /******************************************************************/ 157 | 158 | vector counts_hist; 159 | size_t n_reads = 0; 160 | 161 | /************ loading input ***************************************/ 162 | if (HIST_INPUT) { 163 | if (verbose) 164 | cerr << "HIST_INPUT" << endl; 165 | n_reads = load_histogram(input_file_name, counts_hist); 166 | } 167 | else if (VALS_INPUT) { 168 | if (verbose) 169 | cerr << "VALS_INPUT" << endl; 170 | n_reads = load_counts(input_file_name, counts_hist); 171 | } 172 | #ifdef HAVE_HTSLIB 173 | else if (BAM_FORMAT_INPUT && PAIRED_END) { 174 | if (verbose) 175 | cerr << "PAIRED_END_BAM_INPUT" << endl; 176 | n_reads = load_counts_BAM_pe(n_threads, input_file_name, counts_hist); 177 | } 178 | else if (BAM_FORMAT_INPUT) { 179 | if (verbose) 180 | cerr << "BAM_INPUT" << endl; 181 | n_reads = load_counts_BAM_se(n_threads, input_file_name, counts_hist); 182 | } 183 | #endif 184 | else if (PAIRED_END) { 185 | if (verbose) 186 | cerr << "PAIRED_END_BED_INPUT" << endl; 187 | n_reads = load_counts_BED_pe(input_file_name, counts_hist); 188 | } 189 | else { // default is single end bed file 190 | if (verbose) 191 | cerr << "BED_INPUT" << endl; 192 | n_reads = load_counts_BED_se(input_file_name, counts_hist); 193 | } 194 | /************ done loading input **********************************/ 195 | 196 | const size_t max_observed_count = counts_hist.size() - 1; 197 | const double distinct_reads = 198 | accumulate(cbegin(counts_hist), cend(counts_hist), 0.0); 199 | 200 | // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE 201 | size_t first_zero = 1; 202 | while (first_zero < counts_hist.size() && counts_hist[first_zero] > 0) 203 | ++first_zero; 204 | 205 | orig_max_terms = min(orig_max_terms, first_zero - 1); 206 | orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1); 207 | 208 | if (max_extrap < 1.0) 209 | max_extrap = 1000000000 * distinct_reads; 210 | if (step_size < 1.0) 211 | step_size = (max_extrap - distinct_reads) / n_desired_steps; 212 | 213 | const size_t distinct_counts = 214 | std::count_if(begin(counts_hist), end(counts_hist), 215 | [](const double x) { return x > 0.0; }); 216 | 217 | if (verbose) 218 | cerr << "TOTAL READS = " << n_reads << endl 219 | << "DISTINCT READS = " << distinct_reads << endl 220 | << "DISTINCT COUNTS = " << distinct_counts << endl 221 | << "MAX COUNT = " << max_observed_count << endl 222 | << "COUNTS OF 1 = " << counts_hist[1] << endl 223 | << "MAX TERMS = " << orig_max_terms << endl; 224 | 225 | if (!histogram_outfile.empty()) 226 | report_histogram(histogram_outfile, counts_hist); 227 | 228 | // check to make sure library is not overly saturated 229 | const double two_fold_extrap = GoodToulmin2xExtrap(counts_hist); 230 | if (two_fold_extrap < 0.0) 231 | throw runtime_error("Saturation expected at double initial sample size." 
232 | " Unable to extrapolate"); 233 | 234 | // const size_t total_reads = get_counts_from_hist(counts_hist); 235 | 236 | // assert(total_reads == n_reads); // ADS: why commented out? 237 | 238 | // check that min required count is satisfied 239 | if (orig_max_terms < min_required_counts) 240 | throw runtime_error(min_required_counts_error_message); 241 | 242 | if (verbose) 243 | cerr << "[ESTIMATING YIELD CURVE]" << endl; 244 | 245 | vector yield_estimates; 246 | 247 | if (SINGLE_ESTIMATE) { 248 | const bool single_estimate_success = extrap_single_estimate( 249 | verbose, allow_defects, counts_hist, orig_max_terms, diagonal, 250 | step_size, max_extrap, yield_estimates); 251 | // IF FAILURE, EXIT 252 | if (!single_estimate_success) 253 | throw runtime_error("single estimate failed, run " 254 | "full mode for estimates"); 255 | 256 | std::ofstream of; 257 | if (!outfile.empty()) 258 | of.open(outfile.c_str()); 259 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); 260 | 261 | out << "TOTAL_READS\tEXPECTED_DISTINCT" << endl; 262 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 263 | out.precision(1); 264 | 265 | out << 0 << '\t' << 0 << endl; 266 | for (size_t i = 0; i < yield_estimates.size(); ++i) 267 | out << (i + 1) * step_size << '\t' << yield_estimates[i] << endl; 268 | } 269 | else { 270 | if (verbose) 271 | cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl; 272 | 273 | const size_t max_iter = 100 * n_bootstraps; 274 | 275 | vector> bootstrap_estimates; 276 | extrap_bootstrap(verbose, allow_defects, seed, counts_hist, n_bootstraps, 277 | orig_max_terms, diagonal, step_size, max_extrap, 278 | max_iter, bootstrap_estimates); 279 | 280 | if (verbose) 281 | cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl; 282 | // yield ci 283 | vector yield_upper_ci_lognorm, yield_lower_ci_lognorm; 284 | 285 | vector_median_and_ci(bootstrap_estimates, c_level, yield_estimates, 286 | yield_lower_ci_lognorm, yield_upper_ci_lognorm); 287 | if (verbose) 288 | cerr << "[WRITING OUTPUT]" << endl; 289 | 290 | std::ofstream of; 291 | if (!outfile.empty()) 292 | of.open(outfile); 293 | std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); 294 | 295 | out.setf(std::ios_base::fixed, std::ios_base::floatfield); 296 | out.precision(1); 297 | 298 | const size_t n_ests = yield_estimates.size() - 1; 299 | if (n_ests < 2) 300 | throw runtime_error("problem with number of estimates in pop_size"); 301 | 302 | const bool converged = 303 | (yield_estimates[n_ests] - yield_estimates[n_ests - 1] < 1.0); 304 | 305 | out << "pop_size_estimate" << '\t' << "lower_ci" << '\t' << "upper_ci" 306 | << endl; 307 | out << yield_estimates.back() << '\t' << yield_lower_ci_lognorm.back() 308 | << '\t' << yield_upper_ci_lognorm.back(); 309 | if (!converged) 310 | out << "\tnot_converged"; 311 | out << endl; 312 | } 313 | } 314 | catch (const std::exception &e) { 315 | cerr << e.what() << endl; 316 | return EXIT_FAILURE; 317 | } 318 | return EXIT_SUCCESS; 319 | } 320 | -------------------------------------------------------------------------------- /src/pop_size.hpp: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2013-2024 University of Southern California and 2 | * Andrew D. 
Smith and Timothy Daley 3 | * 4 | * Authors: Timothy Daley and Andrew Smith 5 | * 6 | * This program is free software: you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation, either version 3 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see 18 | * . 19 | */ 20 | 21 | #ifndef SRC_POP_SIZE_HPP_ 22 | #define SRC_POP_SIZE_HPP_ 23 | 24 | int 25 | pop_size_main(const int argc, const char *argv[]); 26 | 27 | #endif // SRC_POP_SIZE_HPP_ 28 | -------------------------------------------------------------------------------- /src/preseq.cpp: -------------------------------------------------------------------------------- 1 | /* preseq: to predict properties of genomic sequencing libraries 2 | * 3 | * Copyright (C) 2013-2024 University of Southern California and 4 | * Andrew D. Smith and Timothy Daley 5 | * 6 | * Authors: Timothy Daley, Chao Deng, Victoria Helus, and Andrew Smith 7 | * 8 | * This program is free software: you can redistribute it and/or 9 | * modify it under the terms of the GNU General Public License as 10 | * published by the Free Software Foundation, either version 3 of the 11 | * License, or (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, but 14 | * WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 | * General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see 20 | * . 
21 | */ 22 | 23 | #include 24 | 25 | #include "common.hpp" 26 | 27 | // the preseq commands 28 | #include "bound_pop.hpp" 29 | #include "c_curve.hpp" 30 | #include "gc_extrap.hpp" 31 | #include "lc_extrap.hpp" 32 | #include "pop_size.hpp" 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | static std::string 40 | usage_message() { 41 | std::ostringstream oss; 42 | oss << "preseq: a program for analyzing library complexity\n" 43 | "Version: "; 44 | oss << VERSION; 45 | oss << "\n\n" 46 | "Usage: preseq [OPTIONS]\n\n" 47 | ": c_curve generate complexity curve for a library\n" 48 | " lc_extrap predict the yield for future experiments\n" 49 | " gc_extrap predict genome coverage low input\n" 50 | " sequencing experiments\n" 51 | " bound_pop lower bound on population size\n" 52 | " pop_size estimate number of unique species\n"; 53 | return oss.str(); 54 | } 55 | 56 | int 57 | main(const int argc, const char *argv[]) { 58 | if (argc < 2) { 59 | std::cerr << usage_message() << std::endl; 60 | return EXIT_SUCCESS; 61 | } 62 | 63 | static const std::string cmd = argv[1]; 64 | 65 | if (cmd == "lc_extrap") 66 | return lc_extrap_main(argc, argv); 67 | 68 | if (cmd == "c_curve") 69 | return c_curve_main(argc, argv); 70 | 71 | if (cmd == "gc_extrap") 72 | return gc_extrap_main(argc, argv); 73 | 74 | if (cmd == "bound_pop") 75 | return bound_pop_main(argc, argv); 76 | 77 | if (cmd == "pop_size") 78 | return pop_size_main(argc, argv); 79 | 80 | std::cerr << "Error: unrecognized command: " << argv[1] << std::endl 81 | << usage_message() << std::endl; 82 | 83 | return EXIT_FAILURE; 84 | } 85 | -------------------------------------------------------------------------------- /tests/data/c_curve_input.hist: -------------------------------------------------------------------------------- 1 | 1 982419 2 | 2 6060 3 | 3 214 4 | 4 63 5 | 5 32 6 | 6 21 7 | 7 14 8 | 8 9 9 | 9 6 10 | 10 3 11 | 11 6 12 | 12 2 13 | 13 2 14 | 14 2 15 | 15 3 16 | 16 2 17 | 24 2 18 | 31 1 19 | -------------------------------------------------------------------------------- /tests/md5sum.txt: -------------------------------------------------------------------------------- 1 | 91ef0368a7da1a55e3acad083485df8b tests/c_curve_output.txt 2 | ba02e52a5f3bc7646998e7ade1c7e35e tests/lc_extrap_output.txt 3 | c8895e94346231a5beb4d867df3bb480 tests/gc_extrap_output.txt 4 | -------------------------------------------------------------------------------- /tests/scripts/test_c_curve.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This file is part of preseq 4 | # 5 | # Copyright (C) 2024: Andrew D. Smith 6 | # 7 | # Authors: Andrew D. Smith 8 | # 9 | # This is free software: you can redistribute it and/or modify it 10 | # under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This software is distributed in the hope that it will be useful, but 15 | # WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 | # General Public License for more details. 
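# Each of the three test scripts follows the same pattern: run one
# preseq subcommand on a small input, then compare the output
# checksum against the entry recorded in tests/md5sum.txt. Exit
# status 1 signals a checksum mismatch, and exit status 77 is the
# conventional "skipped" code recognized by Automake-style test
# harnesses when the input data is not present.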
18 | 19 | infile=tests/c_curve_input.hist 20 | outfile=tests/c_curve_output.txt 21 | if [[ -e "${infile}" ]]; then 22 | ./preseq c_curve -o "${outfile}" -s 100000 -H "${infile}" 23 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 24 | if [[ "${x}" != "OK" ]]; then 25 | exit 1; 26 | fi 27 | else 28 | echo "${infile} not found"; 29 | exit 77; 30 | fi 31 | -------------------------------------------------------------------------------- /tests/scripts/test_gc_extrap.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This file is part of preseq 4 | # 5 | # Copyright (C) 2024: Andrew D. Smith 6 | # 7 | # Authors: Andrew D. Smith 8 | # 9 | # This is free software: you can redistribute it and/or modify it 10 | # under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This software is distributed in the hope that it will be useful, but 15 | # WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 | # General Public License for more details. 18 | 19 | infile=tests/gc_extrap_input.mr 20 | outfile=tests/gc_extrap_output.txt 21 | if [[ -e "${infile}" ]]; then 22 | ./preseq gc_extrap -o "${outfile}" "${infile}" 23 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 24 | if [[ "${x}" != "OK" ]]; then 25 | exit 1; 26 | fi 27 | else 28 | echo "${infile} not found"; 29 | exit 77; 30 | fi 31 | -------------------------------------------------------------------------------- /tests/scripts/test_lc_extrap.test: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This file is part of preseq 4 | # 5 | # Copyright (C) 2024: Andrew D. Smith 6 | # 7 | # Authors: Andrew D. Smith 8 | # 9 | # This is free software: you can redistribute it and/or modify it 10 | # under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This software is distributed in the hope that it will be useful, but 15 | # WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 | # General Public License for more details. 18 | 19 | infile=tests/lc_extrap_input.vals 20 | outfile=tests/lc_extrap_output.txt 21 | if [[ -e "${infile}" ]]; then 22 | ./preseq lc_extrap -o "${outfile}" -V "${infile}" 23 | x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2) 24 | if [[ "${x}" != "OK" ]]; then 25 | exit 1; 26 | fi 27 | else 28 | echo "${infile} not found"; 29 | exit 77; 30 | fi 31 | --------------------------------------------------------------------------------
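The test scripts above can also be run by hand from the directory containing the built preseq binary, since they use relative paths for both the program and its inputs. A minimal sketch, assuming the inputs have been staged where the scripts expect them:

  # run one shipped test and interpret its exit status
  bash tests/scripts/test_lc_extrap.test
  case $? in
    0)  echo "test passed" ;;
    77) echo "test skipped (input data not found)" ;;
    *)  echo "test failed" ;;
  esac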