├── .VERSION.in
├── .gitignore
├── .gitmodules
├── .travis.yml
├── .version_files
├── Dockerfile
├── Dockerfile.slim.in
├── HACKING.org
├── LICENSE
├── README.org
├── VERSION
├── context
    └── nanocall
    │   └── .gitignore
├── script
    ├── build-slim-docker-image
    └── get-lddtree
└── src
    ├── .gitignore
    ├── CMakeLists.txt
    ├── builtin_models
        ├── .gitignore
        ├── builtin_model_init_lists.inl
        ├── builtin_model_names.inl
        ├── builtin_model_num.inl
        ├── builtin_model_strands.inl
        ├── make-builtin-model-initializers
        ├── r73.c.p1.006.ont.model
        ├── r73.c.p2.006.ont.model
        └── r73.t.006.ont.model
    ├── cmake
        └── FindHDF5.cmake
    ├── get-dir-version
    ├── nanocall
        ├── Builtin_Model.cpp
        ├── Builtin_Model.hpp
        ├── CMakeLists.txt
        ├── Event.hpp
        ├── Fast5_Summary.hpp
        ├── Forward_Backward.hpp
        ├── Forward_Backward_Custom.hpp
        ├── Kmer.hpp
        ├── Parameter_Trainer.hpp
        ├── Pore_Model.hpp
        ├── State_Transitions.hpp
        ├── Viterbi.hpp
        ├── compute-scaled-pore-model.cpp
        ├── compute-state-transitions.cpp
        ├── fs_support.hpp
        ├── global_assert.hpp
        ├── list-directory.cpp
        ├── nanocall.cpp
        ├── run-fwbw.cpp
        └── run-viterbi.cpp
    └── version
        ├── CMakeLists.txt
        ├── version.cpp
        └── version.hpp


/.VERSION.in:
--------------------------------------------------------------------------------
1 | ${VERSION}
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /build*
2 | /local*
3 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "src/fast5"]
 2 | 	path = src/fast5
 3 | 	url = https://github.com/mateidavid/fast5.git
 4 | [submodule "src/tclap"]
 5 | 	path = src/tclap
 6 | 	url = https://github.com/mateidavid/tclap.git
 7 | [submodule "src/hpptools"]
 8 | 	path = src/hpptools
 9 | 	url = https://github.com/mateidavid/hpptools.git
10 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # travis.yml for github.com/mateidavid/nanocall
 2 | 
 3 | sudo: required
 4 | 
 5 | services: docker
 6 | 
 7 | before_install:
 8 |     - sudo apt-get update -y
 9 |     - sudo apt-get install -y -o Dpkg::Options::="--force-confnew" docker-engine
10 | 
11 | install: script/build-slim-docker-image
12 | 
13 | before_script: docker images --all --no-trunc
14 | 
15 | script: docker run --rm nanocall
16 | 


--------------------------------------------------------------------------------
/.version_files:
--------------------------------------------------------------------------------
1 | VERSION
2 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM debian:stable
 2 | MAINTAINER Matei David <matei.david.at.oicr.on.ca>
 3 | ARG DEBIAN_FRONTEND=noninteractive
 4 | 
 5 | # install prerequisites
 6 | RUN for i in 1 2 3; do \
 7 |         apt-get update \
 8 |         && break; sleep 1; \
 9 |     done && \
10 |     for i in 1 2 3; do \
11 |         apt-get install -y \
12 |              build-essential \
13 |              cmake \
14 |              libhdf5-dev \
15 |         && break; sleep 1; \
16 |     done
17 | 
18 | # if necessary, specify compiler
19 | #RUN apt-get install -y g++-4.9 g++-5 g++-6
20 | #ENV CC=gcc-4.9
21 | #ENV CXX=g++-4.9
22 | 
23 | # add source
24 | ADD . /src/
25 | 
26 | # build and install
27 | RUN mkdir -p /src/build && \
28 |     cd /src/build && \
29 |     cmake ../src && \
30 |     make && \
31 |     make install
32 | 
33 | VOLUME ["/data"]
34 | WORKDIR /data
35 | ENTRYPOINT ["/usr/local/bin/nanocall"]
36 | CMD ["--version"]
37 | 


--------------------------------------------------------------------------------
/Dockerfile.slim.in:
--------------------------------------------------------------------------------
 1 | FROM debian:stable
 2 | MAINTAINER Matei David <matei.david.at.oicr.on.ca>
 3 | ARG DEBIAN_FRONTEND=noninteractive
 4 | 
 5 | ADD lddtree.tgz /
 6 | 
 7 | # use host timezone
 8 | ENV TZ=${TZ}
 9 | RUN ln -snf /usr/share/zoneinfo/${TZ} /etc/localtime && echo ${TZ} > /etc/timezone
10 | 
11 | # use host id
12 | RUN groupadd --gid ${GROUP_ID} ${GROUP_NAME}
13 | RUN useradd --create-home --uid ${USER_ID} --gid ${GROUP_ID} ${USER_NAME}
14 | USER ${USER_NAME}
15 | 
16 | VOLUME ["/data"]
17 | WORKDIR /data
18 | ENTRYPOINT ["/usr/local/bin/nanocall"]
19 | CMD ["--version"]
20 | 


--------------------------------------------------------------------------------
/HACKING.org:
--------------------------------------------------------------------------------
 1 | # -*- mode:org; mode:visual-line; coding:utf-8; -*-
 2 | 
 3 | **** Change build type
 4 | 
 5 | The default build type is =Release= (optimizations, no assertions). If any crashes are experienced, the first step in addressing them is to redo the run with a =Test= (optimizations, assertions) or =Debug= (no optimizations, assertions) build type. This is achieved using, e.g. =-DCMAKE_BUILD_TYPE=Test=.
 6 | 
 7 | **** Using other packaged tools
 8 | 
 9 | *Note*: The various tools are only built in =Test= or =Debug= build types.
10 | 
11 | #+BEGIN_EXAMPLE
12 | FAST5_FILE=$SIMPSONLAB/data/nanopore/ecoli/sqk006/pass/LomanLabz_PC_Ecoli_K12_MG1655_20150924_MAP006_1_5005_1_ch9_file72_strand.fast5
13 | nanocall ${FAST5_FILE} > ${FAST5_FILE}.fa
14 | compute-state-transitions -p .001 -t .1 -k .1 >transitions.tsv
15 | compute-scaled-pore-model -f $FAST5_FILE >model.tsv
16 | get_events $FAST5_FILE | egrep -v '^(#|mean)' | tawk '{print $1,$3,$2,$4}' >events.tsv
17 | run-viterbi -d info -p model.tsv -s transitions.tsv -e events.tsv | { echo ">$(basename $FAST5_FILE)"; cat; } >out.fa
18 | run-viterbi -d debug -p model.tsv -s transitions.tsv -e <(awk 'NR>=100 && NR<200' events.tsv) |& tee log
19 | run-fwbw -d info -p model.tsv -s transitions.tsv -e <(awk 'NR>=100 && NR<200' events.tsv) -o fwbw.tsv
20 | #+END_EXAMPLE
21 | 
22 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Matei David, Ontario Institute for Cancer Research
 4 | Copyright (c) 2015 Jared Simpson, Ontario Institute for Cancer Research
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE. 
23 | 


--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
  1 | # -*- mode:org; mode:visual-line; coding:utf-8; -*-
  2 | 
  3 | ** Nanocall: An Oxford Nanopore Basecaller
  4 | 
  5 | [[http://travis-ci.org/mateidavid/nanocall][http://travis-ci.org/mateidavid/nanocall.svg?branch=master]] [[https://tldrlegal.com/license/mit-license][http://img.shields.io/:license-mit-blue.svg]]
  6 | 
  7 | *** Introduction
  8 | 
  9 | Nanocall is an alternative, open source, MIT licensed, basecaller for Oxford Nanopore Technologies (ONT) sequencing data. Published in [[https://doi.org/10.1093/bioinformatics/btw569][Bioinformatics, 2016]].
 10 | 
 11 | For the official ONT basecaller, see [[https://metrichor.com/s/][Metrichor]].
 12 | 
 13 | **** Usefulness of Nanocall on recent ONT sequencing data
 14 | 
 15 | To understand the usefulness of Nanocall compared to Metrichor, some background is in order.
 16 | 
 17 | *Before summer 2016*: Before the summer of 2016, and the release of the R9 sequencing pore:
 18 | 
 19 | - Metrichor was the only available basecaller for ONT data.
 20 | - Metrichor's source code was closed.
 21 | - Metrichor was only available as a cloud service.
 22 | 
 23 | This state of affairs prompted us to develop Nanocall as an open-source local basecaller alternative to Metrichor.
 24 | 
 25 | *After summer 2016*: The summer of 2016 has brought along several significant developments from ONT:
 26 | 
 27 | - A new sequencing pore R9 was released: ([[https://nanoporetech.com/about-us/news/update-new-r9-nanopore-faster-more-accurate-sequencing-and-new-ten-minute-preparation][ONT Press Release, May 2016]]).
 28 | - The Metrichor source code was opened (under a development license).
 29 | - ONT provided an official option for local basecalling: ([[https://nanoporetech.com/about-us/news/local-basecalling-now-available-enabling-minion-usage-field][ONT Press Release, Aug 2016]]).
 30 | 
 31 | As a result, Nanocall's usefulness is now limited to:
 32 | 
 33 | - a platform for developing new basecalling ideas, and
 34 | - situations where, for various reasons, you do not have access to the official ONT basecaller(s).
 35 | 
 36 | If you want to use Nanocall on R9 data, Nanocall does support it directly, but its accuracy is significantly lower than that of Metrichor (unlike the case of R7.3, where the two had similar accuracy). The reason for the discrepancy is that Metrichor on R9 uses a more elaborate RNN-based approach, compared to the simple HMM-based one in Nanocall.
 37 | 
 38 | **** Levels of ONT sequencing data
 39 | 
 40 | Most people are only used to dealing with DNA bases. However, to understand where Nanocall fits in, we observe that there are 3 levels of ONT sequencing data:
 41 | 
 42 | - Raw samples. These are direct (picoamp) current measurements, taken at preset intervals as the DNA molecule is threaded through the pore. This data is passed through the USB cable from the MinION to the controlling laptop running MinKNOW. These are stored in =fast5= files at paths such as =/Raw/Reads/Read_29/Signal=.
 43 | 
 44 | - Events. Each event is an aggregation of multiple consecutive raw samples, (ideally) corresponding to a certain DNA context found in the pore. The process of computing events from raw samples is referred to as /event detection/. These are stored in =fast5= files at paths such as =/Analyses/EventDetection_000/Reads/Read_29/Events=.
 45 | 
 46 | - DNA bases. These are the usual, finished product. The process of computing DNA bases from events is referred to as /basecalling/. These are stored in =fast5= files at paths such as =/Analyses/Basecall_2D_000/BaseCalled_2D/Fastq=.
 47 | 
 48 | On R7.3, event detection was performed locally by MinKNOW, and events were passed on to, and used by Metrichor. Since Nanocall was developed as an alternative local basecaller for R7.3 data, /Nanocall is designed to work with events, not with raw samples/.
 49 | 
 50 | On (at least some versions of) R9, Metrichor would entirely redo the event detection directly from raw samples, disregarding any event detection done locally by MinKNOW. As such, it is less uncommon with R9 (than with R7.3) to see =fast5= files without events. Nanocall cannot be run directly on such files. To use Nanocall on R9 data, you must either configure MinKNOW to perform local event detection, or pass the files through Metrichor to use its event detection.
 51 | 
 52 | *** Installation
 53 | 
 54 | Nanocall can be built from source in a classical UNIX environment, or directly under [[https://www.docker.com/what-docker][Docker]]. The Docker build might run under Windows, though this is not tested.
 55 | 
 56 | **** Under a Classical UNIX Environment
 57 | 
 58 | Nanocall uses =cmake= for configuration and =make= for building. The prerequisites needed for building are =zlib= and =hdf5=. On UNIX systems, =hdf5= can be optionally built as a submodule.
 59 | Example build:
 60 | 
 61 | #+BEGIN_EXAMPLE
 62 | mkdir /some/source/dir && cd /some/source/dir
 63 | git clone --recursive https://github.com/mateidavid/nanocall.git
 64 | cd nanocall
 65 | mkdir build && cd build
 66 | cmake ../src [-DCMAKE_INSTALL_PREFIX=/some/install/dir] [-DBUILD_HDF5=1] [-DHDF5_ROOT=/path/to/hdf5]
 67 | make
 68 | make install
 69 | /some/install/dir/bin/nanocall --version
 70 | #+END_EXAMPLE
 71 | 
 72 | *Notes*:
 73 | 
 74 | - The default install prefix is =/usr/local=.
 75 | 
 76 | - Setting =BUILD_HDF5= will cause =hdf5= to be downloaded and built as a submodule.
 77 | 
 78 | - Setting =HDF5_ROOT= is only necessary if a copy of =hdf5= is installed in a non-standard location. This is not needed when =BUILD_HDF5= is used.
 79 | 
 80 | **** Under Docker
 81 | 
 82 | To avoid dealing with prerequisites, Nanocall can be conveniently built under Docker. The installation and configuration of Docker itself is outside of the scope of this document.
 83 | 
 84 | ***** Simple "fat" build
 85 | 
 86 | The simplest way to run Nanocall under Docker is:
 87 | 
 88 | #+BEGIN_EXAMPLE
 89 | docker build -t nanocall https://github.com/mateidavid/nanocall.git
 90 | docker run --rm nanocall --version
 91 | docker run --rm -u $(id -u):$(id -g) -v /path/to/data:/data nanocall -t 4 . >output.fa
 92 | #+END_EXAMPLE
 93 | 
 94 | Howver, there are several problems with this build:
 95 | 
 96 | - The docker image is "fat", in that it contains all the build time dependencies of Nanocall, which are not needed at run time.
 97 | 
 98 | - Without using =-u=, the image will create files with a UID of 0 on the mounted volumes of the host. To remove them, you will have to use =sudo rm= or =sudo chown=.
 99 | 
100 | - The timezone inside the image might be different from the host. This might confuse programs which depend on comparing modification times, most notably =make=.
101 | 
102 | ***** Alternate "slim" build
103 | 
104 | To alleviate the problems mentioned above, you can build a "slim" Docker image as follows:
105 | 
106 | #+BEGIN_EXAMPLE
107 | git clone --recursive --depth 1 https://github.com/mateidavid/nanocall.git
108 | nanocall/script/build-slim-docker-image
109 | docker run --rm nanocall --version
110 | docker run --rm -v /path/to/data:/data nanocall -t 4 . >output.fa
111 | #+END_EXAMPLE
112 | 
113 | *** Usage Examples
114 | 
115 | #+BEGIN_EXAMPLE
116 | # Check version
117 | nanocall --version
118 | 
119 | # Check command line parameters
120 | nanocall --help
121 | 
122 | # Run on single file, save output and log
123 | nanocall /path/to/file.fast5 >output.fa 2>log
124 | 
125 | # Run on directory, using 24 threads, discard log
126 | nanocall -t 24 /path/to/data >output.fa 2>/dev/null
127 | 
128 | # Run on file-of-file-names
129 | nanocall /path/to/files.fofn >output.fa
130 | 
131 | # Run Docker build on directory, using 4 threads
132 | # Note: -u is not needed with the "slim" build
133 | docker run --rm -u $(id -u):$(id -g) -v /path/to/data:/data nanocall -t 4 . >output.fa
134 | #+END_EXAMPLE
135 | 
136 | *** License
137 | 
138 | Released under the [[file:LICENSE][MIT license]].
139 | 
140 | 


--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.7.4
2 | 


--------------------------------------------------------------------------------
/context/nanocall/.gitignore:
--------------------------------------------------------------------------------
1 | /Dockerfile
2 | /lddtree.tgz
3 | 


--------------------------------------------------------------------------------
/script/build-slim-docker-image:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eEu
 3 | trap 'echo "$0: line $LINENO: exit code $?" >&2' ERR
 4 | log () { echo "$@" >&2; }
 5 | crash () { log "error: $@"; exit 1; }
 6 | 
 7 | _prog_path=$(readlink -e "${BASH_SOURCE[0]}")
 8 | _prog_name=$(basename "$_prog_path")
 9 | _prog_dir=$(dirname "$_prog_path")
10 | _prog_args=()
11 | 
12 | export TZ=$(cat /etc/timezone)
13 | export USER_ID=$(id -u)
14 | export USER_NAME=$(id -un)
15 | export GROUP_ID=$(id -g)
16 | export GROUP_NAME=$(id -gn)
17 | ROOT_DIR=$(cd "$_prog_dir"/..; pwd -P)
18 | 
19 | # build default fat image
20 | docker build -t nanocall:build "$ROOT_DIR"
21 | 
22 | # extract lddtree for nanocall
23 | mkdir -p "$ROOT_DIR/build-slim-image"
24 | docker run --rm -v "$ROOT_DIR":/data --entrypoint=/bin/bash nanocall:build -c 'apt-get install -y pax-utils >/dev/null 2>&1 && /data/script/get-lddtree /usr/local/bin/nanocall' >"$ROOT_DIR"/build-slim-image/lddtree.tgz
25 | 
26 | # remove fat image
27 | docker rmi nanocall:build
28 | 
29 | # create slim Dockerfile
30 | envsubst <"$ROOT_DIR/Dockerfile.slim.in" >"$ROOT_DIR/build-slim-image/Dockerfile"
31 | 
32 | # build slim image
33 | docker build -t nanocall "$ROOT_DIR/build-slim-image"
34 | 


--------------------------------------------------------------------------------
/script/get-lddtree:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eEu
 3 | trap 'echo "$0: line $LINENO: exit code $?" >&2' ERR
 4 | log () { echo "$@" >&2; }
 5 | crash () { log "error: $@"; exit 1; }
 6 | 
 7 | _prog_path=$(readlink -e "${BASH_SOURCE[0]}")
 8 | _prog_name=$(basename "$_prog_path")
 9 | _prog_dir=$(dirname "$_prog_path")
10 | _prog_args=()
11 | 
12 | lddtree -a -l "$@" |
13 | sort |
14 | uniq |
15 | tar -chvzf - --files-from -
16 | 


--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | /optional*
2 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | message(STATUS "Processing: ${CMAKE_CURRENT_SOURCE_DIR}")
  2 | 
  3 | # set build type
  4 | if(NOT CMAKE_BUILD_TYPE)
  5 |     set(CMAKE_BUILD_TYPE "Release" CACHE STRING
  6 |        "Choose the type of build, options are: Debug Test Release GProf GProfRel."
  7 |        FORCE)
  8 | endif()
  9 | message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 10 | if(NOT ${CMAKE_BUILD_TYPE} AND NOT ${CMAKE_BUILD_TYPE} STREQUAL "Release")
 11 |     set(CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Enable verbose make output.")
 12 |     message(STATUS "Enabling verbose make output.")
 13 | endif()
 14 | 
 15 | # general cmake settings
 16 | cmake_minimum_required(VERSION 2.8.12)
 17 | project(NANOCALL C CXX)
 18 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
 19 | include(CheckLibraryExists)
 20 | include(CheckIncludeFileCXX)
 21 | include(ExternalProject)
 22 | 
 23 | # force out-of-source build
 24 | if(${PROJECT_BINARY_DIR} STREQUAL ${PROJECT_SOURCE_DIR})
 25 |     message(FATAL_ERROR "In-source build not supported.")
 26 | endif()
 27 | # check that "nanocall.cpp" exists; if not, assume the source dir is wrong
 28 | if(NOT EXISTS "${PROJECT_SOURCE_DIR}/nanocall/nanocall.cpp")
 29 |     message(FATAL_ERROR "${PROJECT_SOURCE_DIR}: source files not found")
 30 | endif()
 31 | 
 32 | # set project-related variables
 33 | set(PACKAGE_BUGREPORT "mdavid@oicr.on.ca")
 34 | set(PACKAGE_URL "https://github.com/jts/nanocall")
 35 | set(PACKAGE "${PROJECT_NAME}")
 36 | set(PACKAGE_NAME "${PROJECT_NAME}")
 37 | set(PACKAGE_TARNAME "${PROJECT_NAME}")
 38 | 
 39 | # directory where to build optional submodules
 40 | set(OPTIONAL_SUBMODULE_PREFIX
 41 |     ${PROJECT_SOURCE_DIR}/optional
 42 |     CACHE INTERNAL "Directory for installing optional submodules")
 43 | 
 44 | # header and source directories
 45 | set(SUBDIRS
 46 |     nanocall version
 47 |     CACHE INTERNAL "Subdirectories to descend into")
 48 | set(HEADER_SUBDIRS
 49 |     builtin_models fast5/src hpptools/include tclap/include
 50 |     nanocall version
 51 |     CACHE INTERNAL "Subdirectories containing header files")
 52 | 
 53 | ### Resolve external dependencies
 54 | #
 55 | # prefer static libraries
 56 | #set(CMAKE_FIND_LIBRARY_SUFFIXES .a .so)
 57 | 
 58 | # bake-in RPATH to prevent library search problems
 59 | set(CMAKE_INSTALL_RPATH_USE_LINK_PATH true)
 60 | set(CMAKE_BUILD_WITH_INSTALL_RPATH true)
 61 | set(CMAKE_LIBRARY_PATH $ENV{LD_LIBRARY_PATH})
 62 | 
 63 | # find zlib
 64 | # => ZLIB_INCLUDE_DIRS, ZLIB_LIBRARIES
 65 | find_package(ZLIB REQUIRED)
 66 | 
 67 | # find HDF5
 68 | # => HDF5_INCLUDE_DIRS, HDF5_LIBRARIES
 69 | if(BUILD_HDF5)
 70 |     message(STATUS "Building HDF5 in: ${OPTIONAL_SUBMODULE_PREFIX}")
 71 |     if(UNIX)
 72 |         # use local copy if available
 73 |         if(EXISTS /tmp/hdf5-1.8.16.tar.bz2)
 74 |             set(HDF5_URL /tmp/hdf5-1.8.16.tar.bz2)
 75 |         else()
 76 |             set(HDF5_URL http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.8.16/src/hdf5-1.8.16.tar.bz2)
 77 |         endif()
 78 |         ExternalProject_Add(hdf5
 79 |             PREFIX ${OPTIONAL_SUBMODULE_PREFIX}
 80 |             URL ${HDF5_URL}
 81 |             URL_MD5 79c1593573ebddf734eee8d43ecfe483
 82 |             CONFIGURE_COMMAND ./configure --prefix=${OPTIONAL_SUBMODULE_PREFIX} --disable-hl --disable-shared --enable-threadsafe
 83 |             BUILD_IN_SOURCE 1
 84 |             BUILD_COMMAND make
 85 |             INSTALL_COMMAND make install
 86 |             )
 87 |         set(HDF5_INCLUDE_DIRS ${OPTIONAL_SUBMODULE_PREFIX}/include)
 88 |         set(HDF5_LIBRARIES ${OPTIONAL_SUBMODULE_PREFIX}/lib/libhdf5.a)
 89 |     elseif(WIN32)
 90 |         message(FATAL_ERROR "Building HDF5 not supported on Windows")
 91 |     endif()
 92 | else()
 93 |     find_package(HDF5 1.8.11 REQUIRED)
 94 | endif()
 95 | 
 96 | # Problem:
 97 | #
 98 | # By default, if using target_link_libraries(... ${HDF5_LIBRARIES}), cmake
 99 | # attempts to use the short name "-lhdf5" as an argument to ld, *without* setting
100 | # "-L" appropriately. It's unclear why, or if this is related to LD_LIBRARY_PATH.
101 | # The problem arises if an older libhdf5 is found instead. The solution below
102 | # uses an IMPORTED library. In this case, cmake will use the full path during
103 | # linking.
104 | #
105 | # Also, setting up RPATH apropriately using cmake is... elusive. If baking-in of
106 | # linker paths is needed, be sure to use LD_RUN_PATH during make- this will be
107 | # honoured by g++.
108 | #
109 | # Refs:
110 | # https://cmake.org/pipermail/cmake/2013-December/056655.html
111 | # https://cmake.org/Wiki/CMake/Tutorials/Exporting_and_Importing_Targets
112 | #
113 | add_library(libhdf5 UNKNOWN IMPORTED)
114 | set_property(TARGET libhdf5 PROPERTY IMPORTED_LOCATION ${HDF5_LIBRARIES})
115 | #get_filename_component(HDF5_LIBRARIES_DIR ${HDF5_LIBRARIES} DIRECTORY)
116 | #set(CMAKE_INSTALL_RPATH ${HDF5_LIBRARIES_DIR})
117 | 
118 | message(STATUS "HDF5_INCLUDE_DIRS=${HDF5_INCLUDE_DIRS}")
119 | message(STATUS "HDF5_LIBRARIES=${HDF5_LIBRARIES}")
120 | 
121 | # message(STATUS "CMAKE_LIBRARY_PATH=${CMAKE_LIBRARY_PATH}")
122 | # message(STATUS "CMAKE_SYSTEM_LIBRARY_PATH=${CMAKE_SYSTEM_LIBRARY_PATH}")
123 | # message(STATUS "CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES=${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}")
124 | # message(STATUS "CMAKE_INSTALL_RPATH=${CMAKE_INSTALL_RPATH}")
125 | # message(STATUS "CMAKE_INSTALL_RPATH_USE_LINK_PATH=${CMAKE_INSTALL_RPATH_USE_LINK_PATH}")
126 | # message(STATUS "CMAKE_BUILD_WITH_INSTALL_RPATH=${CMAKE_BUILD_WITH_INSTALL_RPATH}")
127 | # message(STATUS "CMAKE_SKIP_BUILD_RPATH=${CMAKE_SKIP_BUILD_RPATH}")
128 | 
129 | ### Prepare default compilation flags
130 | #
131 | # headers
132 | include_directories(SYSTEM
133 |     ${ZLIB_INCLUDE_DIRS}
134 |     ${HDF5_INCLUDE_DIRS}
135 |     )
136 | include_directories(
137 |     ${PROJECT_BINARY_DIR}
138 |     ${HEADER_SUBDIRS}
139 |     )
140 | get_directory_property(include_directories INCLUDE_DIRECTORIES)
141 | message(STATUS "INCLUDE_DIRECTORIES='${include_directories}'")
142 | 
143 | ### general compile flags
144 | set(EXTRA_FLAGS "-std=c++11 -pthread -Wall -Wextra -pedantic")
145 | 
146 | # compiler-specific flags
147 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
148 |     set(EXTRA_FLAGS "${EXTRA_FLAGS} -fmax-errors=1")
149 | elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
150 |     set(EXTRA_FLAGS "${EXTRA_FLAGS} -ferror-limit=1")
151 | endif()
152 | 
153 | # consolidate compile flags
154 | set(CMAKE_CXX_FLAGS "$ENV{CXXFLAGS} ${EXTRA_FLAGS}")
155 | message(STATUS "CMAKE_CXX_FLAGS='${CMAKE_CXX_FLAGS}'")
156 | 
157 | ### build-specific compile flags
158 | set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g3 -ggdb -fno-inline -fno-eliminate-unused-debug-types")
159 | set(CMAKE_CXX_FLAGS_TEST "-O3 -g3 -fno-eliminate-unused-debug-types")
160 | set(CMAKE_CXX_FLAGS_TEST_O2 "-O2 -g3 -fno-eliminate-unused-debug-types")
161 | set(CMAKE_CXX_FLAGS_TEST_O1 "-O1 -g3 -fno-eliminate-unused-debug-types")
162 | set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -DDISABLE_ASSERTS")
163 | set(CMAKE_CXX_FLAGS_GPROF "-O3 -g3 -pg")
164 | set(CMAKE_CXX_FLAGS_GPROFREL "-O3 -DNDEBUG -DDISABLE_ASSERTS -pg")
165 | 
166 | # link flags
167 | #set(CMAKE_EXE_LINKER_FLAGS "-Wl,-rpath=$ENV{LD_RUN_PATH} -Wl,--as-needed")
168 | if(APPLE)
169 | 	set(CMAKE_EXE_LINKER_FLAGS "-Wl")
170 | else()
171 | 	set(CMAKE_EXE_LINKER_FLAGS "-Wl,--as-needed")
172 | endif()
173 | message(STATUS "CMAKE_EXE_LINKER_FLAGS='${CMAKE_EXE_LINKER_FLAGS}'")
174 | 
175 | ### This target updates the package version
176 | #
177 | add_custom_target(package_version
178 |     ${PROJECT_SOURCE_DIR}/get-dir-version -v -d ${PROJECT_SOURCE_DIR}/.. -r ${PROJECT_SOURCE_DIR}/.. -o ${PROJECT_BINARY_DIR}/package_version.h -t PACKAGE_VERSION
179 |     )
180 | 
181 | ### Enable testing targets
182 | #enable_testing()
183 | 
184 | ### Descend into subdirectories
185 | #
186 | foreach(dir ${SUBDIRS})
187 |     add_subdirectory(${dir})
188 | endforeach()
189 | 


--------------------------------------------------------------------------------
/src/builtin_models/.gitignore:
--------------------------------------------------------------------------------
1 | *.model
2 | 


--------------------------------------------------------------------------------
/src/builtin_models/builtin_model_names.inl:
--------------------------------------------------------------------------------
 1 | {
 2 |   "r73.t.006.ont.model"
 3 |   ,
 4 |   "r73.c.p1.006.ont.model"
 5 |   ,
 6 |   "r73.c.p2.006.ont.model"
 7 |   ,
 8 |   "r9.t.007.ont.model"
 9 |   ,
10 |   "r9.c.p1.007.ont.model"
11 |   ,
12 |   "r9.c.p2.007.ont.model"
13 | }
14 | 


--------------------------------------------------------------------------------
/src/builtin_models/builtin_model_num.inl:
--------------------------------------------------------------------------------
1 | 6
2 | 


--------------------------------------------------------------------------------
/src/builtin_models/builtin_model_strands.inl:
--------------------------------------------------------------------------------
 1 | {
 2 |   0
 3 |   ,
 4 |   1
 5 |   ,
 6 |   1
 7 |   ,
 8 |   0
 9 |   ,
10 |   1
11 |   ,
12 |   1
13 | }
14 | 


--------------------------------------------------------------------------------
/src/builtin_models/make-builtin-model-initializers:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import argparse
 4 | import os
 5 | import re
 6 | import sys
 7 | 
 8 | description = '''
 9 |   Transform the given model files into C++ initializer list suitable for inclusion during preprocessing.
10 | '''
11 | default_files = [
12 |     '0:r73.t.006.ont.model',
13 |     '1:r73.c.p1.006.ont.model',
14 |     '1:r73.c.p2.006.ont.model',
15 |     '0:r9.t.007.ont.model',
16 |     '1:r9.c.p1.007.ont.model',
17 |     '1:r9.c.p2.007.ont.model']
18 | parser = argparse.ArgumentParser(description=description, epilog='')
19 | parser.add_argument('input', nargs='*', default=default_files, help='Input models, in the form <strand>:<file>')
20 | args = parser.parse_args()
21 | 
22 | m_strands = list()
23 | m_names = list()
24 | 
25 | for fn in args.input:
26 |     if fn[0] not in '012' or fn[1] != ':':
27 |         print('error parsing model name [' + fn + ']: expecting <strand>:<file>', file=sys.stderr)
28 |         sys.exit(1)
29 |     m_strands.append(int(fn[0]))
30 |     m_names.append(fn[2:])
31 | 
32 | f_out = open('builtin_model_num.inl', 'w')
33 | print(str(len(m_names)), file=f_out)
34 | f_out.close()
35 | 
36 | f_out = open('builtin_model_names.inl', 'w')
37 | print('{', file=f_out)
38 | for i in range(len(m_names)):
39 |     if i > 0:
40 |         print('  ,', file=f_out)
41 |     print('  "' + m_names[i] + '"', file=f_out)
42 | print('}', file=f_out)
43 | f_out.close()
44 | 
45 | f_out = open('builtin_model_strands.inl', 'w')
46 | print('{', file=f_out)
47 | for i in range(len(m_strands)):
48 |     if i > 0:
49 |         print('  ,', file=f_out)
50 |     print('  ' + str(m_strands[i]), file=f_out)
51 | print('}', file=f_out)
52 | f_out.close()
53 | 
54 | f_out = open('builtin_model_init_lists.inl', 'w')
55 | print('{', file=f_out)
56 | for i in range(len(m_names)):
57 |     if i > 0:
58 |         print('  ,', file=f_out)
59 |     print('  {', file=f_out)
60 |     f = open(m_names[i])
61 |     first_line = True
62 |     for line in f:
63 |         l = line.strip().split()
64 |         if len(l) < 5 or re.search(r'[^ACGT]', l[0].upper()) != None:
65 |             continue
66 |         if not first_line:
67 |             print('    ,', file=f_out)
68 |         first_line = False
69 |         print('    ' + ', '.join(l[1:5]), file=f_out)
70 |     print('  }', file=f_out)
71 | print('}', file=f_out)
72 | f_out.close()
73 | 
74 | 


--------------------------------------------------------------------------------
/src/cmake/FindHDF5.cmake:
--------------------------------------------------------------------------------
 1 | # Find HDF5 (https://www.hdfgroup.org/)
 2 | # Uses hint:
 3 | #   HDF5_ROOT
 4 | # Sets:
 5 | #   HDF5_FOUND
 6 | #   HDF5_INCLUDE_DIRS
 7 | #   HDF5_LIBRARIES
 8 | # Saves:
 9 | #   HDF5_ROOT
10 | #   HDF5_INCLUDE_DIRS_CACHED
11 | #   HDF5_LIBRARIES_CACHED
12 | 
13 | if(NOT "${OLD_HDF5_ROOT}" OR NOT "${HDF5_ROOT}" STREQUAL "${OLD_HDF5_ROOT}")
14 |     message(STATUS "Detecting HDF5: redetecing with new HDF5_ROOT=${HDF5_ROOT} (OLD_HDF5_ROOT=${OLD_HDF5_ROOT}).")
15 |     unset(HDF5_INCLUDE_DIRS_CACHED CACHE)
16 |     unset(HDF5_LIBRARIES_CACHED CACHE)
17 | else()
18 |     message(STATUS "Detecting HDF5: HDF5_ROOT=${HDF5_ROOT} is not new; using cached paths.")
19 |     message(STATUS "HDF5_INCLUDE_DIRS_CACHED=${HDF5_INCLUDE_DIRS_CACHED}")
20 |     message(STATUS "HDF5_LIBRARIES_CACHED=${HDF5_LIBRARIES_CACHED}")
21 | endif()
22 | set(OLD_HDF5_ROOT ${HDF5_ROOT} CACHE INTERNAL "Last used value of HDF5_ROOT")
23 | 
24 | # find headers
25 | find_path(HDF5_INCLUDE_DIRS_CACHED H5pubconf.h PATHS ${HDF5_ROOT}/include NO_DEFAULT_PATH)
26 | find_path(HDF5_INCLUDE_DIRS_CACHED H5pubconf.h PATH_SUFFIXES hdf5 hdf5/serial)
27 | if(HDF5_INCLUDE_DIRS_CACHED)
28 |     execute_process(
29 |         COMMAND grep H5_VERSION ${HDF5_INCLUDE_DIRS_CACHED}/H5pubconf.h
30 |         COMMAND awk "{print \$3}"
31 |         COMMAND tr -d "\"\n"
32 |         OUTPUT_VARIABLE HDF5_INCLUDE_DIRS_VERSION
33 |         )
34 |     message(STATUS "Found HDF5 headers version ${HDF5_INCLUDE_DIRS_VERSION} in: ${HDF5_INCLUDE_DIRS_CACHED}")
35 | endif()
36 | 
37 | # find library
38 | find_library(HDF5_LIBRARIES_CACHED hdf5 PATHS ${HDF5_ROOT}/lib ${HDF5_ROOT}/lib64 NO_DEFAULT_PATH)
39 | find_library(HDF5_LIBRARIES_CACHED hdf5 PATH_SUFFIXES hdf5 hdf5/serial)
40 | 
41 | include(FindPackageHandleStandardArgs)
42 | find_package_handle_standard_args(HDF5
43 |     REQUIRED_VARS HDF5_INCLUDE_DIRS_CACHED HDF5_LIBRARIES_CACHED
44 |     VERSION_VAR HDF5_INCLUDE_DIRS_VERSION
45 |     #"HDF5 library (https://www.hdfgroup.org/) not found. Specify location with -DHDF5_ROOT=<path>"
46 |     )
47 | mark_as_advanced(HDF5_INCLUDE_DIRS_CACHED HDF5_LIBRARIES_CACHED)
48 | 
49 | if(HDF5_FOUND)
50 |     set(HDF5_INCLUDE_DIRS ${HDF5_INCLUDE_DIRS_CACHED})
51 |     set(HDF5_LIBRARIES ${HDF5_LIBRARIES_CACHED})
52 | endif()
53 | 


--------------------------------------------------------------------------------
/src/get-dir-version:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | trap 'echo "exit code $?: LINENO=$LINENO BASH_LINENO=\"${BASH_LINENO[@]}\" FUNCNAME=\"${FUNCNAME[@]}\"" >&2' ERR
  3 | set -eEu -o pipefail
  4 | prog_name=$(basename "$0")
  5 | 
  6 | usage() {
  7 |     cat <<EOF
  8 | usage: $prog_name [-hv] [-d <dir>] [-c <ver_cmd>] [-f <ver_file>] [-r <proj_root>] [-o <out_file>] [-t <tag>]
  9 | 
 10 | Determine source tree version using the first method that works:
 11 | 1. Change directory to <dir>, and run <ver_cmd> to describe its version.
 12 | 2. If <ver_file> exists, read version from that file.
 13 | 3. If the directory <proj_root> ends with "-[v]<num>.<num>[.<num>]", use the
 14 | suffix as version.
 15 | 4. Failing all else, set version to "unknown".
 16 | Write the version as a single CPP define rule, defining the symbol <tag>. Send
 17 | the output to <out_file>. If <out_file> exists and already contains the correct
 18 | definition, do not update it.
 19 | 
 20 | The defaults are:
 21 | <dir>: .
 22 | <ver_cmd>: git describe --always --dirty --tags
 23 | <ver_file>: VERSION
 24 | <proj_root>: ..
 25 | <out_file>: stdout
 26 | <tag>: VERSION
 27 | 
 28 | Other options:
 29 |  -v : verbose messages
 30 |  -h : this help
 31 | EOF
 32 | }
 33 | 
 34 | log() {
 35 |     ! [ $verbose ] || echo "$@" >&2
 36 | }
 37 | 
 38 | dir=
 39 | version_command="git describe --always --dirty --tags"
 40 | version_file="VERSION"
 41 | project_root=..
 42 | output_file=
 43 | define_tag="VERSION"
 44 | verbose=
 45 | 
 46 | OPTIND=1
 47 | while getopts "d:c:f:r:o:t:vh" OPT "$@"; do
 48 |     case $OPT in
 49 |         d)
 50 |             dir=$OPTARG
 51 |             ;;
 52 |         c)
 53 |             version_command=$OPTARG
 54 |             ;;
 55 |         f)
 56 |             version_file=$OPTARG
 57 |             ;;
 58 |         r)
 59 |             project_root=$OPTARG
 60 |             ;;
 61 |         o)
 62 |             output_file=$OPTARG
 63 |             ;;
 64 |         t)
 65 |             define_tag=$OPTARG
 66 |             ;;
 67 |         v)
 68 |             verbose=1
 69 |             ;;
 70 |         h)
 71 |             usage
 72 |             exit 0
 73 |             ;;
 74 |         *)
 75 |             usage >&2
 76 |             exit 1
 77 |             ;;
 78 |     esac
 79 | done
 80 | shift $(($OPTIND - 1))
 81 | 
 82 | ! [ "$dir" ] || cd "$dir"
 83 | version=$(
 84 |     (eval "$version_command" 2>/dev/null && log "got version from: $version_command") ||
 85 |     (cat "$version_file" 2>/dev/null && log "got version from file: $version_file") ||
 86 |     ([ -d "$project_root" ] &&
 87 |         name=$(basename "$(cd "$project_root"; pwd -P)") &&
 88 |         [[ "$name" =~ -v?[0-9]+.[0-9]+(.[0-9]+)?$ ]] &&
 89 |         echo "${name##*-}" &&
 90 |         log "got version from project root folder") ||
 91 |     (echo "unknown" && log "did not find version"))
 92 | version=${version#v}
 93 | log "found version=$version"
 94 | 
 95 | if [ -r "$output_file" ]; then
 96 |     existing_version=$(awk -v tag="$define_tag" '$1=="#define"&&$2==tag {print $3}' <"$output_file" |
 97 |         sed 's/^"//;s/"$//')
 98 |     if [ "$existing_version" ]; then
 99 |         log "found existing_version=$existing_version"
100 |         if [ "$version" = "$existing_version" ]; then
101 |             log "version up to date"
102 |             exit 0
103 |         fi
104 |     fi
105 | fi
106 | 
107 | echo "#define ${define_tag} \"$version\"" |
108 | if [ "$output_file" ]; then
109 |     cat >"$output_file"
110 | else
111 |     cat
112 | fi
113 | 


--------------------------------------------------------------------------------
/src/nanocall/Builtin_Model.cpp:
--------------------------------------------------------------------------------
 1 | #include "Builtin_Model.hpp"
 2 | 
 3 | const unsigned Builtin_Model::num =
 4 | #include "builtin_model_num.inl"
 5 |     ;
 6 | 
 7 | const unsigned Builtin_Model::strands[] =
 8 | #include "builtin_model_strands.inl"
 9 |     ;
10 | 
11 | const std::string Builtin_Model::names[] =
12 | #include "builtin_model_names.inl"
13 |     ;
14 | 
15 | const std::vector< float > Builtin_Model::init_lists[] =
16 | #include "builtin_model_init_lists.inl"
17 |     ;
18 | 


--------------------------------------------------------------------------------
/src/nanocall/Builtin_Model.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef __BUILTIN_MODEL_HPP
 2 | #define __BUILTIN_MODEL_HPP
 3 | 
 4 | #include <string>
 5 | #include <vector>
 6 | 
 7 | struct Builtin_Model
 8 | {
 9 |     static const unsigned num;
10 |     static const unsigned strands[];
11 |     static const std::string names[];
12 |     static const std::vector< float > init_lists[];
13 | }; // struct Builtin_Model
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/src/nanocall/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | message(STATUS "Processing: ${CMAKE_CURRENT_SOURCE_DIR}")
 2 | 
 3 | add_executable(nanocall
 4 |     nanocall.cpp
 5 |     Builtin_Model.cpp
 6 |     )
 7 | target_link_libraries(nanocall
 8 |     version
 9 |     libhdf5
10 |     ${CMAKE_DL_LIBS}
11 |     ${ZLIB_LIBRARIES}
12 |     )
13 | install(TARGETS nanocall RUNTIME DESTINATION bin)
14 | 
15 | if(NOT ${CMAKE_BUILD_TYPE} STREQUAL "Release")
16 |     add_executable(compute-state-transitions compute-state-transitions.cpp)
17 | 
18 |     add_executable(compute-scaled-pore-model compute-scaled-pore-model.cpp)
19 |     target_link_libraries(compute-scaled-pore-model libhdf5 ${CMAKE_DL_LIBS} ${ZLIB_LIBRARIES})
20 | 
21 |     add_executable(run-fwbw run-fwbw.cpp)
22 |     target_link_libraries(run-fwbw ${ZLIB_LIBRARIES})
23 | 
24 |     add_executable(run-viterbi run-viterbi.cpp)
25 |     target_link_libraries(run-viterbi ${ZLIB_LIBRARIES})
26 | 
27 |     add_executable(list-directory list-directory.cpp)
28 | endif()
29 | 


--------------------------------------------------------------------------------
/src/nanocall/Event.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef __EVENT_HPP
  2 | #define __EVENT_HPP
  3 | 
  4 | #include <algorithm>
  5 | #include <cmath>
  6 | #include <iostream>
  7 | #include <vector>
  8 | 
  9 | #include "fast5.hpp"
 10 | #include "logger.hpp"
 11 | #include "alg.hpp"
 12 | 
 13 | template < typename Float_Type, unsigned Kmer_Size >
 14 | class Event
 15 | {
 16 | public:
 17 |     Float_Type mean;
 18 |     Float_Type corrected_mean;
 19 |     Float_Type stdv;
 20 |     Float_Type start;
 21 |     Float_Type length;
 22 |     Float_Type log_mean;
 23 |     Float_Type log_corrected_mean;
 24 |     Float_Type log_stdv;
 25 |     //Float_Type log_start;
 26 |     //
 27 |     Float_Type orig_mean;
 28 |     Float_Type p_model_state;
 29 |     std::array< char, Kmer_Size > model_state;
 30 |     unsigned model_state_idx;
 31 |     int move;
 32 |     //
 33 |     void update_logs()
 34 |     {
 35 |         assert(mean > 0);
 36 |         log_mean = std::log(mean);
 37 |         assert(corrected_mean > 0);
 38 |         log_corrected_mean = std::log(corrected_mean);
 39 |         if (stdv == 0.0)
 40 |         {
 41 |             stdv = 0.01;
 42 |         }
 43 |         log_stdv = std::log(stdv);
 44 |         //log_start = std::log(start);
 45 |     }
 46 |     void set_model_state(const std::string& s)
 47 |     {
 48 |         assert(s.size() == Kmer_Size);
 49 |         std::copy_n(s.begin(), Kmer_Size, model_state.begin());
 50 |     }
 51 |     friend std::ostream & operator << (std::ostream& os, const Event& ev)
 52 |     {
 53 |         os << ev.mean << '\t'
 54 |            << ev.stdv << '\t'
 55 |            << ev.start << '\t'
 56 |            << ev.length;
 57 |         return os;
 58 |     }
 59 |     friend std::istream & operator >> (std::istream& is, Event& ev)
 60 |     {
 61 |         is >> ev.mean
 62 |            >> ev.stdv
 63 |            >> ev.start
 64 |            >> ev.length;
 65 |         ev.corrected_mean = ev.mean;
 66 |         ev.update_logs();
 67 |         return is;
 68 |     }
 69 | }; // class Event
 70 | 
 71 | template < typename Float_Type, unsigned Kmer_Size >
 72 | struct Event_Sequence
 73 |     : std::vector< Event< Float_Type, Kmer_Size > >
 74 | {
 75 |     typedef std::vector< Event< Float_Type, Kmer_Size > > Base;
 76 |     using Base::Base;
 77 |     void apply_drift_correction(Float_Type drift)
 78 |     {
 79 |         for (auto& e : *this)
 80 |         {
 81 |             e.corrected_mean -= drift * e.start;
 82 |             e.log_corrected_mean = std::log(e.corrected_mean);
 83 |         }
 84 |     }
 85 |     std::string get_base_seq() const
 86 |     {
 87 |         std::string res;
 88 |         const Base& v = *this;
 89 |         res.assign(v[0].model_state.begin(), v[0].model_state.end());
 90 |         for (unsigned i = 1; i < v.size(); ++i)
 91 |         {
 92 |             unsigned a = std::min((unsigned)v[i].move, (unsigned)Kmer_Size);
 93 |             unsigned b = Kmer_Size - a;
 94 |             assert(std::string(v[i - 1].model_state.begin() + a, v[i - 1].model_state.end())
 95 |                    == std::string(v[i].model_state.begin(), v[i].model_state.begin() + b));
 96 |             res += std::string(v[i].model_state.begin() + b, v[i].model_state.end());
 97 |         }
 98 |         return res;
 99 |     }
100 | }; // struct Event_Sequence
101 | 
102 | #endif
103 | 


--------------------------------------------------------------------------------
/src/nanocall/Fast5_Summary.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef __FAST5_SUMMARY_HPP
  2 | #define __FAST5_SUMMARY_HPP
  3 | 
  4 | #include <array>
  5 | #include <string>
  6 | #include <vector>
  7 | #include <memory>
  8 | 
  9 | #ifndef H5_HAVE_THREADSAFE
 10 | #include <mutex>
 11 | #endif
 12 | 
 13 | #include "Pore_Model.hpp"
 14 | #include "State_Transitions.hpp"
 15 | #include "Event.hpp"
 16 | #include "fast5.hpp"
 17 | #include "alg.hpp"
 18 | 
 19 | template < typename Float_Type, unsigned Kmer_Size >
 20 | class Fast5_Summary
 21 | {
 22 | public:
 23 |     typedef Pore_Model< Float_Type, Kmer_Size > Pore_Model_Type;
 24 |     typedef Pore_Model_Dict< Float_Type, Kmer_Size > Pore_Model_Dict_Type;
 25 |     typedef Pore_Model_Parameters< Float_Type > Pore_Model_Parameters_Type;
 26 |     typedef Event< Float_Type, Kmer_Size > Event_Type;
 27 |     typedef Event_Sequence< Float_Type, Kmer_Size > Event_Sequence_Type;
 28 |     typedef State_Transition_Parameters< Float_Type > State_Transition_Parameters_Type;
 29 | 
 30 |     std::string file_name;
 31 |     std::string base_file_name;
 32 |     std::string read_id;
 33 |     std::string bc_grp;
 34 |     std::array< std::array< std::string, 2 >, 3 > preferred_model;
 35 |     std::map< std::array< std::string, 2 >, Pore_Model_Parameters_Type > pm_params_m;
 36 |     std::map< std::array< std::string, 2 >, std::array< State_Transition_Parameters_Type, 2 > > st_params_m;
 37 |     std::array< unsigned, 4 > strand_bounds;
 38 |     std::array< Float_Type, 2 > time_length;
 39 |     unsigned num_ed_events;
 40 |     Float_Type sampling_rate;
 41 |     Float_Type abasic_level;
 42 |     bool valid;
 43 |     bool scale_strands_together;
 44 | 
 45 |     // from fast5 file
 46 |     std::unique_ptr< std::vector< fast5::EventDetection_Event_Entry > > ed_events_ptr;
 47 |     // filtered
 48 |     std::array< std::unique_ptr< Event_Sequence_Type >, 2 > events_ptr;
 49 |     //std::array< Event_Sequence_Type, 2 > events;
 50 | 
 51 |     const std::vector< fast5::EventDetection_Event_Entry >& ed_events() const
 52 |     {
 53 |         assert(ed_events_ptr);
 54 |         return *ed_events_ptr;
 55 |     }
 56 |     std::vector< fast5::EventDetection_Event_Entry >& ed_events()
 57 |     {
 58 |         assert(ed_events_ptr);
 59 |         return *ed_events_ptr;
 60 |     }
 61 |     const Event_Sequence_Type& events(unsigned st) const
 62 |     {
 63 |         assert(st < 2);
 64 |         assert(events_ptr[st]);
 65 |         return *events_ptr[st];
 66 |     }
 67 |     Event_Sequence_Type& events(unsigned st)
 68 |     {
 69 |         assert(st < 2);
 70 |         assert(events_ptr[st]);
 71 |         return *events_ptr[st];
 72 |     }
 73 | 
 74 |     static unsigned& min_ed_events()
 75 |     {
 76 |         static unsigned _min_ed_events = 10;
 77 |         return _min_ed_events;
 78 |     }
 79 | 
 80 |     static unsigned& max_ed_events()
 81 |     {
 82 |         static unsigned _max_ed_events = 100000;
 83 |         return _max_ed_events;
 84 |     }
 85 | 
 86 |     static std::string& eventdetection_group()
 87 |     {
 88 |         static std::string _eventdetection_group = "000";
 89 |         return _eventdetection_group;
 90 |     }
 91 | 
 92 |     // percent of top events to ignore
 93 |     static double& abasic_level_top_percent()
 94 |     {
 95 |         static double _abasic_level_top_percent = 1.0;
 96 |         return _abasic_level_top_percent;
 97 |     }
 98 | 
 99 |     // what to add to top level
100 |     static double& abasic_level_top_offset()
101 |     {
102 |         static double _abasic_level_top_offset = 0.0;
103 |         return _abasic_level_top_offset;
104 |     }
105 | 
106 |     // window size to consider for hairpin detection
107 |     static unsigned& hairpin_island_window_size()
108 |     {
109 |         static unsigned _hairpin_island_window_size = 10;
110 |         return _hairpin_island_window_size;
111 |     }
112 | 
113 |     // window load to consider for hairpin detection
114 |     static unsigned& hairpin_island_window_load()
115 |     {
116 |         static unsigned _hairpin_island_window_load = 5;
117 |         return _hairpin_island_window_load;
118 |     }
119 | 
120 |     // if set, do not split strands
121 |     static unsigned& template_only()
122 |     {
123 |         static unsigned _template_only = 0;
124 |         return _template_only;
125 |     }
126 | 
127 |     // trim margins: after start, before end, before hairpin start, after hairpin end
128 |     static std::array< unsigned, 4 >& trim_margins()
129 |     {
130 |         static std::array< unsigned, 4 > _trim_margins = {{ 50u, 50u, 50u, 50u }};
131 |         return _trim_margins;
132 |     }
133 | 
134 |     Fast5_Summary() : valid(false) {}
135 |     Fast5_Summary(const std::string fn, const Pore_Model_Dict_Type& models, bool sst)
136 |         : valid(false) { summarize(fn, models, sst); }
137 | 
138 |     void summarize(const std::string& fn, const Pore_Model_Dict_Type& models, bool sst)
139 |     {
140 |         valid = true;
141 |         // initialize fields
142 |         file_name = fn;
143 |         auto pos = file_name.find_last_of('/');
144 |         base_file_name = (pos != std::string::npos? file_name.substr(pos + 1) : file_name);
145 |         if (base_file_name.substr(base_file_name.size() - 6) == ".fast5")
146 |         {
147 |             base_file_name.resize(base_file_name.size() - 6);
148 |         }
149 |         read_id = base_file_name;
150 |         strand_bounds = {{ 0, 0, 0, 0 }};
151 |         time_length = {{ 0.0, 0.0 }};
152 |         num_ed_events = 0;
153 |         abasic_level = 0.0;
154 |         fast5::File f;
155 |         do
156 |         {
157 |             try
158 |             {
159 |                 // open file
160 |                 f.open(file_name); // can throw
161 |                 // get sampling rate
162 |                 if (not f.have_sampling_rate())
163 |                 {
164 |                     LOG("Fast5_Summary", info) << file_name << ": missing sampling rate" << std::endl;
165 |                     break;
166 |                 }
167 |                 sampling_rate = f.get_sampling_rate(); // can throw
168 |                 if (sampling_rate < 1000.0 or sampling_rate > 10000.0)
169 |                 {
170 |                     LOG("Fast5_Summary", warning) << file_name << ": unexpected sampling rate: " << sampling_rate << std::endl;
171 |                     break;
172 |                 }
173 |                 // get ed event params and ed events
174 |                 if (not f.have_eventdetection_events(eventdetection_group()))
175 |                 {
176 |                     LOG("Fast5_Summary", info) << file_name << ": missing eventdetection events" << std::endl;
177 |                     break;
178 |                 }
179 |                 auto ed_params = f.get_eventdetection_event_params(eventdetection_group()); // can throw
180 |                 if (not ed_params.read_id.empty())
181 |                 {
182 |                     read_id = ed_params.read_id;
183 |                 }
184 |                 load_ed_events(&f); // also sets num_ed_events
185 |                 if (num_ed_events < trim_margins()[0] + trim_margins()[1] + min_ed_events())
186 |                 {
187 |                     LOG("Fast5_Summary", info)
188 |                         << file_name << ": not enough eventdetection events: " << num_ed_events << std::endl;
189 |                     num_ed_events = 0;
190 |                     break;
191 |                 }
192 |                 // get abasic level
193 |                 abasic_level = detect_abasic_level();
194 |                 if (abasic_level <= 1.0)
195 |                 {
196 |                     LOG("Fast5_Summary", info)
197 |                         << file_name << ": abasic level too low: " << abasic_level << std::endl;
198 |                     num_ed_events = 0;
199 |                     break;
200 |                 }
201 |                 // detect strands
202 |                 strand_bounds = {{ trim_margins()[0], num_ed_events - trim_margins()[1], 0, 0 }};
203 |                 if (not template_only()) detect_strands();
204 |                 if (strand_bounds[1] <= strand_bounds[0])
205 |                 {
206 |                     LOG("Fast5_Summary", info) << file_name << ": no template strand detected" << std::endl;
207 |                     num_ed_events = 0;
208 |                     break;
209 |                 }
210 |                 scale_strands_together = (sst
211 |                                           and strand_bounds[1] - strand_bounds[0] >= min_ed_events()
212 |                                           and strand_bounds[3] - strand_bounds[2] >= min_ed_events());
213 |                 // compute time lengths
214 |                 load_events(&f);
215 |                 for (unsigned st = 0; st < 2; ++st)
216 |                 {
217 |                     if (events(st).size() < min_ed_events()) continue;
218 |                     time_length[st] = events(st).rbegin()->start + events(st).rbegin()->length;
219 |                 }
220 |                 //
221 |                 // compute initial model scalings
222 |                 //
223 |                 if (scale_strands_together)
224 |                 {
225 |                     auto r0 = alg::mean_stdv_of< Float_Type >(
226 |                         events(0),
227 |                         [] (const Event_Type& ev) { return ev.mean; });
228 |                     auto r1 = alg::mean_stdv_of< Float_Type >(
229 |                         events(1),
230 |                         [] (const Event_Type& ev) { return ev.mean; });
231 |                     for (const auto& p0 : models)
232 |                         if (p0.second.strand() == 0 or p0.second.strand() == 2)
233 |                             for (const auto& p1 : models)
234 |                                 if (p1.second.strand() == 1 or p1.second.strand() == 2)
235 |                                 {
236 |                                     std::array< std::string, 2 > m_name = {{ p0.first, p1.first }};
237 |                                     Pore_Model_Parameters_Type pm_params;
238 |                                     pm_params.scale = (r0.second / p0.second.stdv()
239 |                                                        + r1.second / p1.second.stdv()) / 2;
240 |                                     pm_params.shift = (r0.first - pm_params.scale * p0.second.mean()
241 |                                                        + r1.first - pm_params.scale * p1.second.mean()) / 2;
242 |                                     LOG("Fast5_Summary", debug)
243 |                                         << "initial_scaling read [" << read_id
244 |                                         << "] strand [2] model [" << m_name[0] << "+" << m_name[1]
245 |                                         << "] pm_params [" << pm_params << "]" << std::endl;
246 |                                     pm_params_m[m_name] = std::move(pm_params);
247 |                                     st_params_m[m_name][0] = State_Transition_Parameters_Type();
248 |                                     st_params_m[m_name][1] = State_Transition_Parameters_Type();
249 |                                 }
250 |                 }
251 |                 else // not scale_strands_together
252 |                 {
253 |                     for (unsigned st = 0; st < 2; ++st)
254 |                     {
255 |                         if (events(st).size() < min_ed_events()) continue;
256 |                         auto r = alg::mean_stdv_of< Float_Type >(
257 |                             events(st),
258 |                             [] (const Event_Type& ev) { return ev.mean; });
259 |                         for (const auto& p : models)
260 |                         {
261 |                             if (p.second.strand() == st or p.second.strand() == 2)
262 |                             {
263 |                                 std::array< std::string, 2 > m_name;
264 |                                 m_name[st] = p.first;
265 |                                 Pore_Model_Parameters_Type pm_params;
266 |                                 pm_params.scale = r.second / p.second.stdv();
267 |                                 pm_params.shift = r.first - pm_params.scale * p.second.mean();
268 |                                 LOG("Fast5_Summary", debug)
269 |                                     << "initial_scaling read [" << read_id
270 |                                     << "] strand [" << st
271 |                                     << "] model [" << m_name[st]
272 |                                     << "] pm_params [" << pm_params << "]" << std::endl;
273 |                                 pm_params_m[m_name] = std::move(pm_params);
274 |                                 st_params_m[m_name][st] = State_Transition_Parameters_Type();
275 |                             }
276 |                         }
277 |                     }
278 |                 }
279 |                 // detect basecall group to write
280 |                 auto bc_grp_l = f.get_basecall_group_list();
281 |                 static const std::string bc_grp_prefix("Nanocall_");
282 |                 std::set< std::string > used_tags;
283 |                 for (const auto& bc_grp : bc_grp_l)
284 |                 {
285 |                     if (bc_grp.size() <= bc_grp_prefix.size()) continue;
286 |                     auto p = std::mismatch(bc_grp_prefix.begin(),
287 |                                            bc_grp_prefix.end(),
288 |                                            bc_grp.begin());
289 |                     if (p.first != bc_grp_prefix.end()) continue;
290 |                     std::string tag(p.second, bc_grp.end());
291 |                     std::clog << "found basecall group: " << tag << std::endl;
292 |                     used_tags.emplace(std::move(tag));
293 |                 }
294 |                 for (unsigned i = 0; i < 1000; ++i)
295 |                 {
296 |                     std::ostringstream tmp;
297 |                     tmp << std::setw(3) << std::setfill('0') << i;
298 |                     if (not used_tags.count(tmp.str()))
299 |                     {
300 |                         bc_grp = bc_grp_prefix + tmp.str();
301 |                         break;
302 |                     }
303 |                 }
304 |                 if (bc_grp.empty())
305 |                 {
306 |                     LOG(error)
307 |                         << "no available basecall tag" << std::endl;
308 |                     std::exit(EXIT_FAILURE);
309 |                 }
310 |             }
311 |             catch (hdf5_tools::Exception& e)
312 |             {
313 |                 LOG(warning) << file_name << ": HDF5 error: " << e.what() << std::endl;
314 |                 num_ed_events = 0;
315 |             }
316 |         } while (false);
317 |         drop_events();
318 |         ed_events_ptr.reset();
319 |     } // summarize
320 | 
321 |     void load_events(fast5::File* f_p = nullptr)
322 |     {
323 |         assert(valid);
324 |         drop_events();
325 |         if (num_ed_events == 0)
326 |         {
327 |             return;
328 |         }
329 |         bool must_load_ed_events = not ed_events_ptr;
330 |         if (must_load_ed_events)
331 |         {
332 | #ifndef H5_HAVE_THREADSAFE
333 |             static std::mutex fast5_mutex;
334 |             std::lock_guard< std::mutex > fast5_lock(fast5_mutex);
335 | #endif
336 |             bool must_open_file = not f_p;
337 |             if (must_open_file)
338 |             {
339 |                 f_p = new fast5::File(file_name);
340 |             }
341 |             assert(f_p->is_open());
342 |             load_ed_events(f_p);
343 |             if (must_open_file)
344 |             {
345 |                 delete f_p;
346 |             }
347 |         }
348 |         for (unsigned st = 0; st < 2; ++st)
349 |         {
350 |             events_ptr[st] = typename decltype(events_ptr)::value_type(new typename decltype(events_ptr)::value_type::element_type ());
351 |             for (unsigned j = strand_bounds[2 * st]; j < strand_bounds[2 * st + 1]; ++j)
352 |             {
353 |                 if (filter_ed_event(ed_events()[j], abasic_level))
354 |                 {
355 |                     Event_Type e;
356 |                     e.mean = ed_events()[j].mean;
357 |                     e.corrected_mean = e.mean;
358 |                     e.stdv = ed_events()[j].stdv;
359 |                     e.start = (ed_events()[j].start - ed_events()[strand_bounds[scale_strands_together? 0 : 2 * st]].start) / sampling_rate;
360 |                     e.length = ed_events()[j].length / sampling_rate;
361 |                     e.update_logs();
362 |                     events(st).emplace_back(std::move(e));
363 |                 }
364 |             }
365 |         }
366 |         if (must_load_ed_events)
367 |         {
368 |             ed_events_ptr.reset();
369 |         }
370 |     }
371 |     void drop_events()
372 |     {
373 |         for (unsigned st = 0; st < 2; ++st)
374 |         {
375 |             events_ptr[st].reset();
376 |         }
377 |     }
378 | 
379 |     void add_basecall_seq(const std::string& name, unsigned st, const std::string& seq, int default_qual = 33) const
380 |     {
381 |         try
382 |         {
383 |             // open file
384 |             fast5::File f(file_name, true); // can throw
385 |             // write seq
386 |             f.add_basecall_seq(st, bc_grp, name, seq, default_qual);
387 |         }
388 |         catch (hdf5_tools::Exception& e)
389 |         {
390 |             LOG(warning) << file_name << ": HDF5 error: " << e.what() << std::endl;
391 |         }
392 |     }
393 | 
394 |     void add_basecall_events(unsigned st, const Event_Sequence_Type& ev) const
395 |     {
396 |         try
397 |         {
398 |             // open file
399 |             fast5::File f(file_name, true); // can throw
400 |             // write seq
401 |             f.add_basecall_events(st, bc_grp, ev);
402 |         }
403 |         catch (hdf5_tools::Exception& e)
404 |         {
405 |             LOG(warning) << file_name << ": HDF5 error: " << e.what() << std::endl;
406 |         }
407 |     }
408 | 
409 |     void add_basecall_model(unsigned st, const Pore_Model_Type& model) const
410 |     {
411 |         try
412 |         {
413 |             // open file
414 |             fast5::File f(file_name, true); // can throw
415 |             // write model params
416 |             f.add_basecall_model(st, bc_grp, model.get_state_vector());
417 |         }
418 |         catch (hdf5_tools::Exception& e)
419 |         {
420 |             LOG(warning) << file_name << ": HDF5 error: " << e.what() << std::endl;
421 |         }
422 |     }
423 | 
424 |     void add_basecall_model_params(unsigned st, const Pore_Model_Parameters_Type& params) const
425 |     {
426 |         try
427 |         {
428 |             // open file
429 |             fast5::File f(file_name, true); // can throw
430 |             // write model params
431 |             f.add_basecall_model_params(st, bc_grp, params);
432 |         }
433 |         catch (hdf5_tools::Exception& e)
434 |         {
435 |             LOG(warning) << file_name << ": HDF5 error: " << e.what() << std::endl;
436 |         }
437 |     }
438 | 
439 |     friend std::ostream& operator << (std::ostream& os, const Fast5_Summary& fs)
440 |     {
441 |         os << "[base_file_name=" << fs.base_file_name << " valid=" << fs.valid;
442 |         if (fs.valid)
443 |         {
444 |             os << " num_ed_events=" << fs.num_ed_events;
445 |             if (fs.num_ed_events > 0)
446 |             {
447 |                 os << " read_id=" << fs.read_id
448 |                    << " abasic_level=" << fs.abasic_level
449 |                    << " strand_bounds=[" << fs.strand_bounds[0] << ","
450 |                    << fs.strand_bounds[1] << ","
451 |                    << fs.strand_bounds[2] << ","
452 |                    << fs.strand_bounds[3]
453 |                    << "] time_length=[" << fs.time_length[0] << "," << fs.time_length[1] << "]";
454 |             }
455 |         }
456 |         os << "]";
457 |         return os;
458 |     }
459 | 
460 |     static void write_tsv_header(std::ostream& os)
461 |     {
462 |         os << "file_name" << "\tread_name" << "\tnum_ed_events" << "\tabasic_level"
463 |            << "\ttemplate_start_idx" << "\ttemplate_end_idx"
464 |            << "\tcomplement_start_idx" << "\tcomplement_end_idx";
465 |         for (unsigned st = 0; st < 2; ++st)
466 |         {
467 |             os << "\tn" << st << "_model_name"
468 |                << "\tn" << st << "_scale"
469 |                << "\tn" << st << "_shift"
470 |                << "\tn" << st << "_drift"
471 |                << "\tn" << st << "_var"
472 |                << "\tn" << st << "_scale_sd"
473 |                << "\tn" << st << "_var_sd"
474 |                << "\tn" << st << "_p_stay"
475 |                << "\tn" << st << "_p_skip";
476 |         }
477 |     }
478 | 
479 |     void write_tsv(std::ostream& os) const
480 |     {
481 |         os << base_file_name << '\t' << read_id << '\t' << num_ed_events << '\t' << abasic_level
482 |            << '\t' << strand_bounds[0] << '\t' << strand_bounds[1]
483 |            << '\t' << strand_bounds[2] << '\t' << strand_bounds[3];
484 |         for (unsigned st = 0; st < 2; ++st)
485 |         {
486 |             os << '\t';
487 |             if (not preferred_model[st][st].empty())
488 |             {
489 |                 os << preferred_model[st][st] << '\t';
490 |                 pm_params_m.at(preferred_model[st]).write_tsv(os);
491 |                 os << '\t';
492 |                 st_params_m.at(preferred_model[st])[st].write_tsv(os);
493 |             }
494 |             else
495 |             {
496 |                 os << ".\t";
497 |                 Pore_Model_Parameters_Type().write_tsv(os);
498 |                 os << '\t';
499 |                 State_Transition_Parameters_Type().write_tsv(os);
500 |             }
501 |         }
502 |     }
503 | 
504 | private:
505 |     void load_ed_events(fast5::File* f_p)
506 |     {
507 |         ed_events_ptr = decltype(ed_events_ptr)(
508 |             new typename decltype(ed_events_ptr)::element_type(
509 |                 f_p->get_eventdetection_events(eventdetection_group())));
510 |         if (num_ed_events == 0)
511 |         {
512 |             if (ed_events().size() > max_ed_events())
513 |             {
514 |                 LOG("Fast5_Summary", info)
515 |                     << file_name << ": using only " << max_ed_events()
516 |                     << " of " << ed_events().size() << " events" << std::endl;
517 |                 num_ed_events = max_ed_events();
518 |             }
519 |             else
520 |             {
521 |                 num_ed_events = ed_events().size();
522 |             }
523 |         }
524 |         ed_events().resize(num_ed_events);
525 |     }
526 | 
527 |     // crude detection of abasic level
528 |     Float_Type detect_abasic_level()
529 |     {
530 |         //
531 |         // exclude top abasic_level_top_percent() levels
532 |         // add abasic_level_top_offset()
533 |         //
534 |         std::vector< Float_Type > s;
535 |         s.resize(ed_events().size());
536 |         unsigned i;
537 |         for (i = 0; i < ed_events().size(); ++i)
538 |         {
539 |             s[i] = ed_events()[i].mean;
540 |         }
541 |         std::sort(s.begin(), s.end());
542 |         return s[(double)s.size() * (1.0 - abasic_level_top_percent() / 100.0)] + abasic_level_top_offset();
543 |     } // detect_abasic_level()
544 | 
545 |     std::vector< std::pair< unsigned, unsigned > > find_islands_5_consec() const
546 |     {
547 |         //
548 |         // find islands of >= 5 consecutive events at high level
549 |         //
550 |         std::vector< std::pair< unsigned, unsigned > > islands;
551 |         unsigned i = 0;
552 |         while (i < ed_events().size())
553 |         {
554 |             if (ed_events()[i].mean >= abasic_level)
555 |             {
556 |                 unsigned j = i + 1;
557 |                 while (j < ed_events().size() and ed_events()[j].mean >= abasic_level) ++j;
558 |                 if (j - i >= 5)
559 |                {
560 |                     islands.push_back(std::make_pair(i, j));
561 |                     LOG("Fast5_Summary", debug) << "abasic_island [" << i << "," << j << "]" << std::endl;
562 |                 }
563 |                 i = j + 1;
564 |             }
565 |             else
566 |             {
567 |                 ++i;
568 |             }
569 |         }
570 |         return islands;
571 |     }
572 | 
573 |     std::vector< std::pair< unsigned, unsigned > > find_islands_5_of_10_consec() const
574 |     {
575 |         //
576 |         // find islands of >= 5/10 consecutive events at high level
577 |         //
578 |         std::vector< std::pair< unsigned, unsigned > > islands;
579 |         unsigned i = 0;
580 |         unsigned window_start = 0;
581 |         unsigned window_count = 0;
582 |         while (i < ed_events().size())
583 |         {
584 |             if (ed_events()[i].mean >= abasic_level)
585 |             {
586 |                 while (window_start + 10 <= i)
587 |                 {
588 |                     if (ed_events()[window_start].mean >= abasic_level)
589 |                     {
590 |                         --window_count;
591 |                     }
592 |                     ++window_start;
593 |                 }
594 |                 while (window_start < i and ed_events()[window_start].mean < abasic_level)
595 |                 {
596 |                     ++window_start;
597 |                 }
598 |                 assert(i < window_start + 10);
599 |                 ++window_count;
600 |                 if (window_count >= 5)
601 |                 {
602 |                     islands.push_back(std::make_pair(window_start, i));
603 |                     LOG("Fast5_Summary", debug) << "abasic_island [" << window_start << "," << i << "]" << std::endl;
604 |                     window_start = i + 1;
605 |                     window_count = 0;
606 |                 }
607 |             }
608 |             ++i;
609 |         }
610 |         return islands;
611 |     }
612 | 
613 |     // crude detection of hairpin islands
614 |     // look for >= hairping_window_load/hairpin_window_size consecutive events at high level
615 |     std::vector< std::pair< unsigned, unsigned > > find_hairpin_islands() const
616 |     {
617 |         std::vector< std::pair< unsigned, unsigned > > islands;
618 |         unsigned i = 0;
619 |         unsigned window_start = 0;
620 |         unsigned window_count = 0;
621 |         while (i < ed_events().size())
622 |         {
623 |             if (ed_events()[i].mean >= abasic_level)
624 |             {
625 |                 while (window_start + hairpin_island_window_size() <= i)
626 |                 {
627 |                     if (ed_events()[window_start].mean >= abasic_level)
628 |                     {
629 |                         --window_count;
630 |                     }
631 |                     ++window_start;
632 |                 }
633 |                 while (window_start < i and ed_events()[window_start].mean < abasic_level)
634 |                 {
635 |                     ++window_start;
636 |                 }
637 |                 assert(i < window_start + hairpin_island_window_size());
638 |                 ++window_count;
639 |                 if (window_count >= hairpin_island_window_load())
640 |                 {
641 |                     islands.push_back(std::make_pair(window_start, i));
642 |                     LOG("Fast5_Summary", debug) << "abasic_island [" << window_start << "," << i << "]" << std::endl;
643 |                     window_start = i + 1;
644 |                     window_count = 0;
645 |                 }
646 |             }
647 |             ++i;
648 |         }
649 |         return islands;
650 |     } // find_hairpin_islands()
651 | 
652 |     // crude detection of strands in event sequence
653 |     void detect_strands()
654 |     {
655 |         LOG("Fast5_Summary", debug)
656 |             << "num_events=" << ed_events().size()
657 |             << " abasic_level=" << abasic_level << std::endl;
658 |         //
659 |         // find islands of consecutive events at high level
660 |         //
661 |         auto islands = find_islands_5_consec(); //find_hairpin_islands();
662 |         //
663 |         // merge islands within 50bp of each other
664 |         //
665 |         for (unsigned i = 1; i < islands.size(); ++i)
666 |         {
667 |             if (islands[i - 1].second + std::max(trim_margins()[2], trim_margins()[3]) >= islands[i].first)
668 |             {
669 |                 LOG("Fast5_Summary", debug) << "merge_islands "
670 |                           << "[" << islands[i - 1].first << "," << islands[i - 1].second << "] with "
671 |                           << "[" << islands[i].first << "," << islands[i].second << "]" << std::endl;
672 |                 islands[i - 1].second = islands[i].second;
673 |                 islands.erase(islands.begin() + i);
674 |                 i = 0;
675 |             }
676 |         }
677 |         LOG("Fast5_Summary", debug)
678 |             << "final_islands: " << alg::os_join(
679 |                 islands, " ",
680 |                 [] (const std::pair< unsigned, unsigned >& p) {
681 |                     std::ostringstream tmp;
682 |                     tmp << "[" << p.first << "," << p.second << "]";
683 |                     return tmp.str();
684 |                 }) << std::endl;
685 |         if (islands.empty())
686 |         {
687 |             LOG("Fast5_Summary", info)
688 |                 << "template_only read_id=[" << read_id << "]" << std::endl;
689 |             return;
690 |         }
691 |         //
692 |         // pick island closest to the middle of the event sequence
693 |         //
694 |         auto dist_to_middle = [&] (const std::pair< unsigned, unsigned >& p) {
695 |             return std::min((unsigned)std::abs((long)p.first - (long)ed_events().size() / 2),
696 |                             (unsigned)std::abs((long)p.second - (long)ed_events().size() / 2));
697 |         };
698 |         auto it = alg::min_of(islands, dist_to_middle);
699 |         // check island is in the middle third; if not, intepret it as template only
700 |         if (dist_to_middle(*it) > ed_events().size() / 6)
701 |         {
702 |             LOG("Fast5_Summary", info)
703 |                 << "drop_read read_id=[" << read_id
704 |                 << "] islands=[" << alg::os_join(
705 |                     islands, " ",
706 |                     [] (const std::pair< unsigned, unsigned >& p) {
707 |                         std::ostringstream tmp;
708 |                         tmp << "[" << p.first << "," << p.second << "]";
709 |                         return tmp.str();
710 |                     })
711 |                 << "]" << std::endl;
712 |             return;
713 |         }
714 |         else
715 |         {
716 |             LOG("Fast5_Summary", debug)
717 |                 << "hairpin_island [" << it->first << "," << it->second << "]" << std::endl;
718 |             strand_bounds[0] = trim_margins()[0];
719 |             if (islands[0].first < trim_margins()[0] + trim_margins()[2])
720 |             {
721 |                 strand_bounds[0] = std::max(strand_bounds[0], islands[0].second);
722 |             }
723 |             strand_bounds[1] = it->first - trim_margins()[2];
724 |             strand_bounds[2] = it->first + trim_margins()[3];
725 |             strand_bounds[3] = ed_events().size() - trim_margins()[1];
726 |             if (islands[islands.size() - 1].second > ed_events().size() - (trim_margins()[3] + trim_margins()[1]))
727 |             {
728 |                 strand_bounds[3] = std::min(strand_bounds[3], islands[islands.size() - 1].first);
729 |             }
730 |         }
731 |     } // detect_strands()
732 | 
733 |     // crude filtering of eventdetection events
734 |     static bool filter_ed_event(const fast5::EventDetection_Event_Entry& e, Float_Type abasic_level)
735 |     {
736 |         if (e.mean >= abasic_level)
737 |         {
738 |             return false;
739 |         }
740 |         if (e.stdv > 4.0)
741 |         {
742 |             return false;
743 |         }
744 |         return true;
745 |     } // filter_ed_event()
746 | }; // struct Fast5_Summary
747 | 
748 | #endif
749 | 


--------------------------------------------------------------------------------
/src/nanocall/Forward_Backward.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef __FORWARD_BACKWARD_HPP
  2 | #define __FORWARD_BACKWARD_HPP
  3 | 
  4 | #include <cmath>
  5 | #include <iostream>
  6 | #include <vector>
  7 | #include <set>
  8 | 
  9 | #include "Pore_Model.hpp"
 10 | #include "State_Transitions.hpp"
 11 | #include "logsumset.hpp"
 12 | #include "logger.hpp"
 13 | 
 14 | template < typename Float_Type, unsigned Kmer_Size = 6 >
 15 | class Forward_Backward
 16 | {
 17 | public:
 18 |     typedef Kmer< Kmer_Size > Kmer_Type;
 19 |     typedef Pore_Model< Float_Type, Kmer_Size > Pore_Model_Type;
 20 |     typedef State_Transitions< Float_Type, Kmer_Size > State_Transitions_Type;
 21 |     typedef Event< Float_Type, Kmer_Size > Event_Type;
 22 |     typedef Event_Sequence< Float_Type, Kmer_Size > Event_Sequence_Type;
 23 |     typedef logsum::logsumset< Float_Type > LogSumSet_Type;
 24 | 
 25 |     struct Matrix_Entry
 26 |     {
 27 |         Float_Type alpha; // := Pr[ E_1 ... E_i, S_i = j ]
 28 |         Float_Type beta;  // := Pr[ E_{i+1} ... E_n | S_i = j ]
 29 |     }; // struct Matrix_Entry
 30 | 
 31 |     static const unsigned n_states = Pore_Model_Type::n_states;
 32 | 
 33 |     void clear() { _m.clear(); }
 34 |     unsigned n_events() const { return _m.size() / n_states; }
 35 | 
 36 |     // i: event index
 37 |     // j: state/kmer index
 38 |     const Matrix_Entry& cell(unsigned i, unsigned j) const { return _m[i * n_states + j]; }
 39 |     Matrix_Entry& cell(unsigned i, unsigned j) { return _m[i * n_states + j]; }
 40 | 
 41 |     Float_Type log_posterior(unsigned i, unsigned j) const { return cell(i, j).alpha + cell(i, j).beta - _log_pr_data; }
 42 |     Float_Type log_pr_data() const { return _log_pr_data; }
 43 | 
 44 |     static unsigned& n_threads() { static unsigned _n_threads = 1; return _n_threads; }
 45 | 
 46 |     void fill(const Pore_Model_Type& pm,
 47 |               const State_Transitions_Type& st,
 48 |               const Event_Sequence_Type& ev)
 49 |     {
 50 |         clear();
 51 |         unsigned n_events = ev.size();
 52 |         _m.resize(n_states * n_events);
 53 |         Float_Type log_n_states = std::log(static_cast< Float_Type >(n_states));
 54 |         LogSumSet_Type s(false);
 55 |         //
 56 |         // forward: alpha, i == 0
 57 |         //
 58 |         {
 59 |             unsigned i = 0;
 60 |             LOG("Forward_Backward", debug1) << "forward: i=" << i << std::endl;
 61 |             for (unsigned j = 0; j < n_states; ++j)
 62 |             {
 63 |                 cell(i, j).alpha = pm.log_pr_corrected_emission(j, ev[0]) - log_n_states;
 64 |                 LOG("Forward_Backward", debug2)
 65 |                     << "i=" << i << " j=" << j << " kmer_j=" << Kmer_Type::to_string(j)
 66 |                     << " alpha=" << cell(i, j).alpha << std::endl;
 67 |             }
 68 |         }
 69 |         //
 70 |         // forward: alpha, i > 0
 71 |         //
 72 |         for (unsigned i = 1; i < ev.size(); ++i)
 73 |         {
 74 |             LOG("Forward_Backward", debug1) << "forward: i=" << i << std::endl;
 75 |             for (unsigned j = 0; j < n_states; ++j)
 76 |             {
 77 |                 s.clear();
 78 |                 for (const auto& p : st.neighbours(j).from_v)
 79 |                 {
 80 |                     const unsigned& j_prev = p.first;
 81 |                     const Float_Type& log_pr_transition = p.second;
 82 |                     s.add(log_pr_transition + cell(i - 1, j_prev).alpha);
 83 |                 }
 84 |                 cell(i, j).alpha = pm.log_pr_corrected_emission(j, ev[i]) + s.val();
 85 |                 LOG("Forward_Backward", debug2)
 86 |                     << "i=" << i << " j=" << j << " kmer_j=" << Kmer_Type::to_string(j)
 87 |                     << " alpha=" << cell(i, j).alpha << std::endl;
 88 |             }
 89 |         }
 90 |         //
 91 |         // backward: beta, i == n-1
 92 |         //
 93 |         {
 94 |             unsigned i = ev.size() - 1;
 95 |             LOG("Forward_Backward", debug1) << "backward: i=" << i << std::endl;
 96 |             for (unsigned j = 0; j < n_states; ++j)
 97 |             {
 98 |                 cell(i, j).beta = 0;
 99 |                 LOG("Forward_Backward", debug2)
100 |                     << "i=" << i << " j=" << j << " kmer_j=" << Kmer_Type::to_string(j)
101 |                     << " beta=" << cell(i, j).beta << std::endl;
102 |             }
103 |         }
104 |         //
105 |         // backward: beta, i < n-1
106 |         //
107 |         for (unsigned ip1 = ev.size() - 1; ip1 > 0; --ip1)
108 |         {
109 |             unsigned i = ip1 - 1;
110 |             LOG("Forward_Backward", debug1) << "backward: i=" << i << std::endl;
111 |             for (unsigned j = 0; j < n_states; ++j)
112 |             {
113 |                 s.clear();
114 |                 for (const auto& p : st.neighbours(j).to_v)
115 |                 {
116 |                     const unsigned& j_next = p.first;
117 |                     const Float_Type& log_pr_transition = p.second;
118 |                     s.add(log_pr_transition + pm.log_pr_corrected_emission(j_next, ev[ip1]) + cell(ip1, j_next).beta);
119 |                 }
120 |                 cell(i, j).beta += s.val();
121 |                 LOG("Forward_Backward", debug2)
122 |                     << "i=" << i << " j=" << j << " kmer_j=" << Kmer_Type::to_string(j)
123 |                     << " beta=" << cell(i, j).beta << std::endl;
124 |             }
125 |         }
126 |         //
127 |         // pr_data
128 |         //
129 |         s.clear();
130 |         for (unsigned j = 0; j < n_states; ++j)
131 |         {
132 |             s.add(cell(ev.size() - 1, j).alpha);
133 |         }
134 |         _log_pr_data = s.val();
135 |     }
136 | 
137 |     friend std::ostream& operator << (std::ostream& os, const Forward_Backward& fwbw)
138 |     {
139 |         for (unsigned i = 0; i < fwbw.n_events(); ++i)
140 |         {
141 |             for (unsigned j = 0; j < fwbw.n_states; ++j)
142 |             {
143 |                 os << i << '\t' << j << '\t'
144 |                    << fwbw.cell(i, j).alpha << '\t'
145 |                    << fwbw.cell(i, j).beta << std::endl;
146 |             }
147 |         }
148 |         return os;
149 |     }
150 | 
151 | private:
152 |     std::vector< Matrix_Entry > _m;
153 |     Float_Type _log_pr_data;
154 | }; // class Forward_Backward
155 | 
156 | #endif
157 | 


--------------------------------------------------------------------------------
/src/nanocall/Forward_Backward_Custom.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef __FORWARD_BACKWARD_CUSTOM_HPP
  2 | #define __FORWARD_BACKWARD_CUSTOM_HPP
  3 | 
  4 | #include <cmath>
  5 | #include <iostream>
  6 | #include <vector>
  7 | #include <set>
  8 | 
  9 | #include "Pore_Model.hpp"
 10 | #include "State_Transitions.hpp"
 11 | #include "logsumset.hpp"
 12 | #include "logger.hpp"
 13 | 
 14 | template < typename Float_Type, unsigned Kmer_Size = 6 >
 15 | class Forward_Backward_Custom
 16 | {
 17 | public:
 18 |     typedef Kmer< Kmer_Size > Kmer_Type;
 19 |     typedef Pore_Model< Float_Type, Kmer_Size > Pore_Model_Type;
 20 |     typedef State_Transitions< Float_Type, Kmer_Size > State_Transitions_Type;
 21 |     typedef Event< Float_Type, Kmer_Size > Event_Type;
 22 |     typedef Event_Sequence< Float_Type, Kmer_Size > Event_Sequence_Type;
 23 |     typedef logsum::logsumset< Float_Type > LogSumSet_Type;
 24 | 
 25 |     struct Matrix_Entry
 26 |     {
 27 |         Float_Type alpha; // := Pr[ S_i = j | e_1 ... e_{i-1} ]
 28 |         Float_Type beta;  // := Pr[ S_i = j | e_1 ... e_i ]
 29 |         Float_Type gamma; // := Pr[ S_i = j | e_1 ... e_n ]
 30 |     }; // struct Matrix_Entry
 31 | 
 32 |     static const unsigned n_states = Pore_Model_Type::n_states;
 33 | 
 34 |     void clear() { _m.clear(); }
 35 |     unsigned n_events() const { return _m.size() / n_states; }
 36 | 
 37 |     // i: event index
 38 |     // j: state/kmer index
 39 |     const Matrix_Entry& cell(unsigned i, unsigned j) const { return _m[i * n_states + j]; }
 40 |     Matrix_Entry& cell(unsigned i, unsigned j) { return _m[i * n_states + j]; }
 41 | 
 42 |     Float_Type log_posterior(unsigned i, unsigned j) const { return cell(i, j).gamma; }
 43 | 
 44 |     static unsigned& n_threads() { static unsigned _n_threads = 1; return _n_threads; }
 45 | 
 46 |     void fill(const Pore_Model_Type& pm,
 47 |               const State_Transitions_Type& st,
 48 |               const Event_Sequence_Type& ev)
 49 |     {
 50 |         clear();
 51 |         unsigned n_events = ev.size();
 52 |         _m.resize(n_states * n_events);
 53 |         Float_Type log_n_states = std::log(static_cast< Float_Type >(n_states));
 54 |         LogSumSet_Type s1(false);
 55 |         LogSumSet_Type s2(false);
 56 |         //
 57 |         // forward: alpha, beta; i == 0
 58 |         //
 59 |         {
 60 |             for (unsigned j = 0; j < n_states; ++j)
 61 |             {
 62 |                 // alpha
 63 |                 cell(0, j).alpha = - log_n_states;
 64 |                 // beta
 65 |                 cell(0, j).beta = pm.log_pr_emission(j, ev[0]) + cell(0, j).alpha;
 66 |                 s1.add(cell(0, j).beta);
 67 |             }
 68 |             Float_Type denom = s1.val();
 69 |             LOG("Forward_Backward_Custom", debug1) << "i=0 beta_denom=" << denom << std::endl;
 70 |             for (unsigned j = 0; j < n_states; ++j)
 71 |             {
 72 |                 cell(0, j).beta -= denom;
 73 |                 LOG("Forward_Backward_Custom", debug2)
 74 |                     << "i=0 j=" << Kmer_Type::to_string(j)
 75 |                     << " alpha=" << cell(0, j).alpha
 76 |                     << " beta=" << cell(0, j).beta << std::endl;
 77 |             }
 78 |         }
 79 |         //
 80 |         // forward: alpha, beta; i > 0
 81 |         //
 82 |         for (unsigned i = 1; i < ev.size(); ++i)
 83 |         {
 84 |             LOG("Forward_Backward_Custom", debug1) << "forward: i=" << i << std::endl;
 85 |             s1.clear();
 86 |             for (unsigned j = 0; j < n_states; ++j) // TODO: parallelize
 87 |             {
 88 |                 // alpha
 89 |                 s2.clear();
 90 |                 for (const auto& p : st.neighbours(j).from_v)
 91 |                 {
 92 |                     const unsigned& j_prev = p.first;
 93 |                     const Float_Type& log_pr_transition = p.second;
 94 |                     s2.add(log_pr_transition + cell(i - 1, j_prev).beta);
 95 |                 }
 96 |                 cell(i, j).alpha = s2.val();
 97 |                 // beta
 98 |                 cell(i, j).beta = pm.log_pr_emission(j, ev[i]) + cell(i, j).alpha;
 99 |                 s1.add(cell(i, j).beta);
100 |             }
101 |             Float_Type denom = s1.val();
102 |             LOG("Forward_Backward_Custom", debug1) << "i=" << i << " beta_denom=" << denom << std::endl;
103 |             for (unsigned j = 0; j < n_states; ++j)
104 |             {
105 |                 cell(i, j).beta -= denom;
106 |                 LOG("Forward_Backward_Custom", debug2)
107 |                     << "i=" << i << " j=" << Kmer_Type::to_string(j)
108 |                     << " alpha=" << cell(i, j).alpha
109 |                     << " beta=" << cell(i, j).beta << std::endl;
110 |             }
111 |         }
112 |         //
113 |         // backward, gamma; i == n-1
114 |         //
115 |         for (unsigned j = 0; j < n_states; ++j)
116 |         {
117 |             cell(n_events - 1, j).gamma = cell(n_events - 1, j).beta;
118 |         }
119 |         //
120 |         // backward, gamma; i < n-1
121 |         //
122 |         for (unsigned ip1 = ev.size() - 1; ip1 > 0; --ip1)
123 |         {
124 |             unsigned i = ip1 - 1;
125 |             LOG("Forward_Backward_Custom", debug1) << "backward: i=" << i << std::endl;
126 |             for (unsigned j = 0; j < n_states; ++j) // TODO: parallelize
127 |             {
128 |                 cell(i, j).gamma = cell(i, j).beta;
129 |                 s2.clear();
130 |                 for (const auto& p : st.neighbours(j).to_v)
131 |                 {
132 |                     const unsigned& j_next = p.first;
133 |                     const Float_Type& log_pr_transition = p.second;
134 |                     s2.add(log_pr_transition + cell(ip1, j_next).gamma - cell(ip1, j_next).alpha);
135 |                 }
136 |                 cell(i, j).gamma += s2.val();
137 |                 LOG("Forward_Backward_Custom", debug2)
138 |                     << "i=" << i << " j=" << Kmer_Type::to_string(j)
139 |                     << " gamma=" << cell(i, j).gamma << std::endl;
140 |             }
141 |         }
142 |     }
143 | 
144 |     friend std::ostream& operator << (std::ostream& os, const Forward_Backward_Custom& fwbw)
145 |     {
146 |         for (unsigned i = 0; i < fwbw.n_events(); ++i)
147 |         {
148 |             for (unsigned j = 0; j < fwbw.n_states; ++j)
149 |             {
150 |                 os << i << '\t' << j << '\t'
151 |                    << fwbw.cell(i, j).alpha << '\t'
152 |                    << fwbw.cell(i, j).beta << '\t'
153 |                    << fwbw.cell(i, j).gamma << std::endl;
154 |             }
155 |         }
156 |         return os;
157 |     }
158 | 
159 | private:
160 |     std::vector< Matrix_Entry > _m;
161 | }; // class Forward_Backward_Custom
162 | 
163 | #endif
164 | 


--------------------------------------------------------------------------------
/src/nanocall/Kmer.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef __KMER_HPP
  2 | #define __KMER_HPP
  3 | 
  4 | #include <array>
  5 | #include <string>
  6 | #include <mutex>
  7 | 
  8 | template < unsigned Kmer_Size >
  9 | class Kmer
 10 | {
 11 | public:
 12 |     static const unsigned n_states = (1u << (2 * Kmer_Size));
 13 |     static size_t to_int(const std::string& s)
 14 |     {
 15 |         static std::array< int8_t, 256 > base_to_int;
 16 |         static bool table_initialized = false;
 17 |         if (not table_initialized)
 18 |         {
 19 |             for (unsigned i = 0; i < 256; ++i)
 20 |             {
 21 |                 base_to_int[i] = -1;
 22 |             }
 23 |             base_to_int['A'] = 0;
 24 |             base_to_int['C'] = 1;
 25 |             base_to_int['G'] = 2;
 26 |             base_to_int['T'] = 3;
 27 |             table_initialized = true;
 28 |         }
 29 |         size_t res = 0;
 30 |         for (size_t i = 0; i < s.size(); ++i)
 31 |         {
 32 |             res <<= 2;
 33 |             res += base_to_int[static_cast< unsigned >(s[i])];
 34 |         }
 35 |         return res;
 36 |     }
 37 |     static size_t to_int(const std::array< char, Kmer_Size >& a)
 38 |     {
 39 |         return to_int(std::string(a.begin(), a.end()));
 40 |     }
 41 |     static std::string to_string(size_t k)
 42 |     {
 43 |         static const std::string int_to_base("ACGT");
 44 |         std::string res;
 45 |         for (size_t j = 0; j < Kmer_Size; ++j)
 46 |         {
 47 |             res += int_to_base[(k >> (2 * (Kmer_Size - j - 1))) & 0x3];
 48 |         }
 49 |         return res;
 50 |     }
 51 |     static unsigned min_skip(unsigned k1, unsigned k2)
 52 |     {
 53 |         if (k1 == k2)
 54 |         {
 55 |             return 0;
 56 |         }
 57 |         else
 58 |         {
 59 |             for (unsigned k = Kmer_Size - 1; k > 0; --k)
 60 |             {
 61 |                 if ((k1 & ((1u << (2 * k)) - 1)) == (k2 >> (2 * (Kmer_Size - k))))
 62 |                 {
 63 |                     return Kmer_Size - k;
 64 |                 }
 65 |             }
 66 |             return Kmer_Size;
 67 |         }
 68 |     }
 69 |     static unsigned prefix(unsigned i, unsigned k)
 70 |     {
 71 |         return i >> (2 * (Kmer_Size - k));
 72 |     }
 73 |     static unsigned suffix(unsigned i, unsigned k)
 74 |     {
 75 |         return i & ((1u << (2 * k)) - 1);
 76 |     }
 77 | 
 78 |     /*
 79 |      * Precompute, for every kmer i, the maximum k such that suffix(i, k) == prefix(i, k)
 80 |      */
 81 |     static unsigned max_self_overlap(unsigned i)
 82 |     {
 83 |         assert(i < n_states);
 84 |         std::array< unsigned, n_states > _max_self_overlap;
 85 |         bool _inited = false;
 86 |         if (not _inited)
 87 |         {
 88 |             static std::mutex _mutex;
 89 |             {
 90 |                 std::lock_guard< std::mutex > _lock(_mutex);
 91 |                 if (not _inited) // recheck
 92 |                 {
 93 |                     for (unsigned i = 0; i < n_states; ++i)
 94 |                     {
 95 |                         _max_self_overlap[i] = 0;
 96 |                         for (unsigned k = Kmer_Size - 1; k >= 1; --k)
 97 |                         {
 98 |                             if (suffix(i, k) == prefix(i, k))
 99 |                             {
100 |                                 _max_self_overlap[i] = k;
101 |                                 break;
102 |                             }
103 |                         }
104 |                     }
105 |                     _inited = true;
106 |                 }
107 |             }
108 |         }
109 |         return _max_self_overlap[i];
110 |     }
111 | 
112 |     /*
113 |      * Precompute neighbours at distance 1 and 2.
114 |      */
115 |     static const std::vector< unsigned >& neighbour_list(unsigned i, unsigned d)
116 |     {
117 |         assert(i < n_states);
118 |         assert(d == 1 or d == 2);
119 |         static std::array< std::array< std::vector< unsigned >, 2 >, 4096 > _neighbour_list;
120 |         static bool _inited = false;
121 |         if (not _inited)
122 |         {
123 |             static std::mutex _mutex;
124 |             {
125 |                 std::lock_guard< std::mutex > _lock(_mutex);
126 |                 if (not _inited) // recheck
127 |                 {
128 |                     for (unsigned i = 0; i < n_states; ++i)
129 |                     {
130 |                         _neighbour_list[i][0].clear();
131 |                         _neighbour_list[i][1].clear();
132 |                         for (unsigned b1 = 0; b1 < 4; ++b1)
133 |                         {
134 |                             unsigned i1 = (suffix(i, Kmer_Size - 1) << 2) + b1;
135 |                             _neighbour_list[i][0].push_back(i1);
136 |                             for (unsigned b2 = 0; b2 < 4; ++b2)
137 |                             {
138 |                                 unsigned i2 = (suffix(i1, Kmer_Size - 1) << 2) + b2;
139 |                                 _neighbour_list[i][1].push_back(i2);
140 |                             }
141 |                         }
142 |                     }
143 |                     _inited = true;
144 |                 }
145 |             }
146 |         }
147 |         return _neighbour_list[i][d - 1];
148 |     }
149 | }; // class Kmer
150 | 
151 | #endif
152 | 


--------------------------------------------------------------------------------
/src/nanocall/Parameter_Trainer.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef __PARAMETER_TRAINER
  2 | #define __PARAMETER_TRAINER
  3 | 
  4 | #include <array>
  5 | #include <vector>
  6 | #include <map>
  7 | 
  8 | #include "global_assert.hpp"
  9 | #include "Pore_Model.hpp"
 10 | #include "State_Transitions.hpp"
 11 | #include "Forward_Backward.hpp"
 12 | #include "logsumset.hpp"
 13 | #include "logger.hpp"
 14 | 
 15 | template < typename Float_Type, unsigned Kmer_Size = 6 >
 16 | struct Parameter_Trainer
 17 | {
 18 |     typedef Kmer< Kmer_Size > Kmer_Type;
 19 |     typedef Pore_Model< Float_Type, Kmer_Size > Pore_Model_Type;
 20 |     typedef Pore_Model_Parameters< Float_Type > Pore_Model_Parameters_Type;
 21 |     typedef State_Transitions< Float_Type, Kmer_Size > State_Transitions_Type;
 22 |     typedef State_Transition_Parameters< Float_Type > State_Transition_Parameters_Type;
 23 |     typedef Event< Float_Type, Kmer_Size > Event_Type;
 24 |     typedef Event_Sequence< Float_Type, Kmer_Size > Event_Sequence_Type;
 25 |     typedef Forward_Backward< Float_Type, Kmer_Size > Forward_Backward_Type;
 26 |     typedef logsum::logsumset< Float_Type > LogSumSet_Type;
 27 | 
 28 |     static const unsigned n_states = Pore_Model_Type::n_states;
 29 | 
 30 |     static void init()
 31 |     {
 32 |         // pick states i s.t. i has self-overlap 0,
 33 |         // and all its 1-step neighbours have self-overlap <=1
 34 |         st_train_kmers().clear();
 35 |         for (unsigned i = 0; i < n_states; ++i)
 36 |         {
 37 |             if (Kmer_Type::max_self_overlap(i) > 0)
 38 |             {
 39 |                 continue;
 40 |             }
 41 |             bool all_good = true;
 42 |             for (unsigned b1 = 0; b1 < 4; ++b1)
 43 |             {
 44 |                 unsigned j = (Kmer_Type::suffix(i, Kmer_Size - 1) << 2) + b1;
 45 |                 if (Kmer_Type::max_self_overlap(j) > 1)
 46 |                 {
 47 |                     all_good = false;
 48 |                     break;
 49 |                 }
 50 |             }
 51 |             if (all_good)
 52 |             {
 53 |                 st_train_kmers().push_back(i);
 54 |             }
 55 |         }
 56 |         LOG(info) << "using [" << st_train_kmers().size() << "] states for state trainsition training" << std::endl;
 57 |     }
 58 | 
 59 |     static std::vector< unsigned >& st_train_kmers()
 60 |     {
 61 |         static std::vector< unsigned > _st_train_kmers;
 62 |         return _st_train_kmers;
 63 |     }
 64 | 
 65 |     static unsigned& pm_train_drift()
 66 |     {
 67 |         static unsigned _pm_train_drift = 1;
 68 |         return _pm_train_drift;
 69 |     }
 70 | 
 71 |     /**
 72 |      * Struct used for training rounds.
 73 |      * @event_seq_ptr_v Vector of pairs, first: an event sequence, second: strand from which it comes
 74 |      * @model_ptr_v Pointers to unscaled pore models (per strand)
 75 |      * @default_transitions_ptr Default state transitions
 76 |      * @pm_params_ptr Pore model scaling parameters (common to both strands)
 77 |      * @st_params_ptr_v State transition parameters (per strand)
 78 |      */
 79 |     struct Train_Data
 80 |     {
 81 |         // input
 82 |         std::vector< std::pair< const Event_Sequence_Type*, unsigned > > event_seq_ptr_v;
 83 |         std::array< const Pore_Model_Type*, 2 > model_ptr_v;
 84 |         const State_Transitions_Type* default_transitions_ptr;
 85 |         const Pore_Model_Parameters_Type* pm_params_ptr;
 86 |         std::array< const State_Transition_Parameters_Type*, 2 > st_params_ptr_v;
 87 |         // output
 88 |         std::array< Pore_Model_Type, 2 > scaled_model_v;
 89 |         std::array< State_Transitions_Type, 2 > custom_transitions_v;
 90 |         std::array< const State_Transitions_Type*, 2 > transitions_ptr_v;
 91 |         std::vector< Event_Sequence_Type > corrected_event_seq_v;
 92 |         std::vector< Forward_Backward_Type > fwbw_v;
 93 |         Float_Type fit;
 94 |     };
 95 | 
 96 |     /**
 97 |      * Fill training data for one training round.
 98 |      */
 99 |     static void fill_train_data(Train_Data& data)
100 |     {
101 |         // compute scaled pore models
102 |         data.scaled_model_v[0].clear();
103 |         data.scaled_model_v[1].clear();
104 |         std::array< bool, 2 > init_scaled_models = {{ false, false }};
105 |         for (const auto& p : data.event_seq_ptr_v)
106 |         {
107 |             ASSERT(p.second < 2);
108 |             if (init_scaled_models[p.second]) continue;
109 |             ASSERT(data.model_ptr_v[p.second]);
110 |             ASSERT(data.pm_params_ptr);
111 |             data.scaled_model_v[p.second] = *data.model_ptr_v[p.second];
112 |             data.scaled_model_v[p.second].scale(*data.pm_params_ptr);
113 |             init_scaled_models[p.second] = true;
114 |         }
115 |         // compute custom state transitions
116 |         data.custom_transitions_v[0].clear();
117 |         data.custom_transitions_v[1].clear();
118 |         std::array< bool, 2 > init_transitions = {{ false, false }};
119 |         for (const auto& p : data.event_seq_ptr_v)
120 |         {
121 |             if (init_transitions[p.second]) continue;
122 |             ASSERT(data.st_params_ptr_v[p.second]);
123 |             if (not data.st_params_ptr_v[p.second]->is_default())
124 |             {
125 |                 data.custom_transitions_v[p.second].compute_transitions_fast(*data.st_params_ptr_v[p.second]);
126 |                 data.transitions_ptr_v[p.second] = &data.custom_transitions_v[p.second];
127 |             }
128 |             else
129 |             {
130 |                 data.transitions_ptr_v[p.second] = data.default_transitions_ptr;
131 |             }
132 |             init_transitions[p.second] = true;
133 |         }
134 |         // compute drift-corrected event sequences
135 |         unsigned n_event_seqs = data.event_seq_ptr_v.size();
136 |         data.corrected_event_seq_v.clear();
137 |         data.corrected_event_seq_v.reserve(n_event_seqs);
138 |         data.fwbw_v.clear();
139 |         data.fwbw_v.reserve(n_event_seqs);
140 |         data.fit = 0.0;
141 |         for (unsigned k = 0; k < n_event_seqs; ++k)
142 |         {
143 |             unsigned st = data.event_seq_ptr_v[k].second;
144 |             ASSERT(init_scaled_models[st]);
145 |             ASSERT(init_transitions[st]);
146 |             // first, copy events
147 |             data.corrected_event_seq_v.emplace_back(*data.event_seq_ptr_v[k].first);
148 |             // then, apply drift correction
149 |             data.corrected_event_seq_v.back().apply_drift_correction(data.pm_params_ptr->drift);
150 |             // finally, run fwbw
151 |             data.fwbw_v.emplace_back();
152 |             data.fwbw_v.back().fill(
153 |                 data.scaled_model_v[st], *data.transitions_ptr_v[st], data.corrected_event_seq_v.back());
154 |             data.fit += data.fwbw_v.back().log_pr_data();
155 |         }
156 | #ifdef DUMP_TRAINING_DATA
157 |         for (unsigned k = 0; k < n_event_seqs; ++k)
158 |         {
159 |             unsigned st = data.event_seq_ptr_v[k].second;
160 |             unsigned n_events = data.event_seq_ptr_v[k].first->size();
161 |             std::ostringstream k_sstr;
162 |             k_sstr << k;
163 |             std::ofstream ofs;
164 |             ofs.open(std::string("emissions.") + k_sstr.str() + ".tab");
165 |             for (unsigned i = 0; i < n_events; ++i)
166 |             {
167 |                 for (unsigned j = 0; j < n_states; ++j)
168 |                 {
169 |                     if (j > 0) ofs << '\t';
170 |                     ofs << data.scaled_model_v[st].log_pr_corrected_emission(j, data.corrected_event_seq_v[k][i]);
171 |                 }
172 |                 ofs << std::endl;
173 |             }
174 |             ofs.close();
175 |             ofs.open(std::string("transitions.") + k_sstr.str() + ".tab");
176 |             for (unsigned j1 = 0; j1 < n_states; ++j1)
177 |             {
178 |                 std::map< unsigned, Float_Type > neighbour_m;
179 |                 for (const auto& p : data.transitions_ptr_v[st]->neighbours(j1).to_v)
180 |                 {
181 |                     neighbour_m[p.first] = p.second;
182 |                 }
183 |                 for (unsigned j2 = 0; j2 < n_states; ++j2)
184 |                 {
185 |                     if (j2 > 0) ofs << '\t';
186 |                     if (neighbour_m.count(j2))
187 |                     {
188 |                         ofs << neighbour_m.at(j2);
189 |                     }
190 |                     else
191 |                     {
192 |                         ofs << -1000.0;
193 |                     }
194 |                 }
195 |                 ofs << std::endl;
196 |             }
197 |             ofs.close();
198 |             ofs.open(std::string("fw.") + k_sstr.str() + ".tab");
199 |             for (unsigned i = 0; i < n_events; ++i)
200 |             {
201 |                 for (unsigned j = 0; j < n_states; ++j)
202 |                 {
203 |                     if (j > 0) ofs << '\t';
204 |                     ofs << data.fwbw_v[k].cell(i, j).alpha;
205 |                 }
206 |                 ofs << std::endl;
207 |             }
208 |             ofs.close();
209 |             ofs.open(std::string("bw.") + k_sstr.str() + ".tab");
210 |             for (unsigned i = 0; i < n_events; ++i)
211 |             {
212 |                 for (unsigned j = 0; j < n_states; ++j)
213 |                 {
214 |                     if (j > 0) ofs << '\t';
215 |                     ofs << data.fwbw_v[k].cell(i, j).beta;
216 |                 }
217 |                 ofs << std::endl;
218 |             }
219 |         }
220 |         abort();
221 | #endif
222 |     }
223 | 
224 |     /**
225 |      * Train pm_params on training data.
226 |      * @data Training data, as filled by fill_train_data.
227 |      * @new_pm_params Destination for new params.
228 |      * @done Bool; if true, training failed, and no rounds are possible because of a singularity.
229 |      */
230 |     static void train_pm_params(const Train_Data& data, Pore_Model_Parameters_Type& new_pm_params, bool& done)
231 |     {
232 |         done = false;
233 |         unsigned n_event_seqs = data.event_seq_ptr_v.size();
234 |         unsigned total_n_events = 0;
235 |         ASSERT(data.pm_params_ptr);
236 |         //
237 |         // compute the scaling matrices in normal space (not logspace!)
238 |         // against unscaled pm & uncorrected events
239 |         //
240 |         auto& a_hat = new_pm_params.shift;
241 |         auto& b_hat = new_pm_params.scale;
242 |         auto& c_hat = new_pm_params.drift;
243 |         auto& d_hat = new_pm_params.var;
244 |         auto& v_hat = new_pm_params.scale_sd;
245 |         auto& u_hat = new_pm_params.var_sd;
246 |         std::array< std::array< double, 3 >, 3 > A = {{ {{ 0.0, 0.0, 0.0 }},
247 |                                                         {{ 0.0, 0.0, 0.0 }},
248 |                                                         {{ 0.0, 0.0, 0.0 }} }};
249 |         std::array< double, 3 > B = {{ 0.0, 0.0, 0.0 }};
250 |         double D       = 0.0; // = \sum_i x^2_i s_{i,0} (used for var)
251 |         double V_numer = 0.0; // = \sum_i y_i \sum_j p_{i,j} \lambda_j / \eta^2_j (for scale_sd)
252 |         double V_denom = 0.0; // = \sum_i \sum_j p_{i,j} \lambda_j / \eta_j (for scale_sd)
253 |         double U_pos   = 0.0; // = \sum_i (1/y_i) \sum_j p_{i,j} \lambda_j (for var_sd)
254 |         for (unsigned k = 0; k < n_event_seqs; ++k)
255 |         {
256 |             unsigned st = data.event_seq_ptr_v.at(k).second;
257 |             ASSERT(st < 2);
258 |             const Event_Sequence_Type& events = *data.event_seq_ptr_v[k].first;
259 |             unsigned n_events = events.size();
260 |             total_n_events += n_events;
261 |             const Pore_Model_Type& pm = *data.model_ptr_v[st];
262 |             const Forward_Backward_Type& fwbw = data.fwbw_v.at(k);
263 |             for (unsigned i = 0; i < n_events; ++i)
264 |             {
265 |                 Float_Type x_i = events[i].mean;
266 |                 Float_Type y_i = events[i].stdv;
267 |                 Float_Type t_i = events[i].start;
268 |                 LOG(debug1)
269 |                     << "outter_loop k=" << k << " i=" << i
270 |                     << " x_i=" << x_i
271 |                     << " t_i=" << t_i << std::endl;
272 |                 // \sum_j p_{i,j} \mu^*_j / \simga^2_j
273 |                 std::array< float, 3 > s = {{ 0.0, 0.0, 0.0 }};
274 |                 // \sum_j p_{i,j} \lambda_j / \eta^*_j
275 |                 std::array< float, 3 > l = {{ 0.0, 0.0, 0.0 }};
276 |                 for (unsigned j = 0; j < Pore_Model_Type::n_states; ++j)
277 |                 {
278 |                     Float_Type p_ij = std::exp(fwbw.log_posterior(i, j));
279 |                     Float_Type term_s0 = p_ij / (pm.state(j).level_stdv * pm.state(j).level_stdv);
280 |                     Float_Type term_s1 = term_s0 * pm.state(j).level_mean;
281 |                     Float_Type term_s2 = term_s1 * pm.state(j).level_mean;
282 |                     Float_Type term_l0 = p_ij * pm.state(j).sd_lambda;
283 |                     Float_Type term_l1 = term_l0 / pm.state(j).sd_mean;
284 |                     Float_Type term_l2 = term_l1 / pm.state(j).sd_mean;
285 |                     LOG(debug2)
286 |                         << "inner_loop k=" << k << " i=" << i << " j=" << j << " p_ij=" << p_ij
287 |                         << " term_s0=" << term_s0 << " term_s1=" << term_s1 << " term_s2=" << term_s2
288 |                         << " term_l0=" << term_l0 << " term_l1=" << term_l1 << " term_l2=" << term_l2
289 |                         << std::endl;
290 |                     s[0] += term_s0;
291 |                     s[1] += term_s1;
292 |                     s[2] += term_s2;
293 |                     l[0] += term_l0;
294 |                     l[1] += term_l1;
295 |                     l[2] += term_l2;
296 |                 } // for j
297 |                 A[0][0] += s[0];
298 |                 A[0][1] += s[1];
299 |                 A[1][1] += s[2];
300 |                 B[0]    += s[0] * x_i;
301 |                 B[1]    += s[1] * x_i;
302 |                 if (pm_train_drift())
303 |                 {
304 |                     A[0][2] += s[0] * t_i;
305 |                     A[1][2] += s[1] * t_i;
306 |                     A[2][2] += s[0] * t_i * t_i;
307 |                     B[2]    += s[0] * x_i * t_i;
308 |                 }
309 |                 D       += s[0] * x_i * x_i;
310 |                 V_numer += l[2] * y_i;
311 |                 V_denom += l[1];
312 |                 U_pos   += l[0] / y_i;
313 |             } // for i
314 |         } // for k
315 |         A[1][0] = A[0][1];
316 |         A[2][0] = A[0][2];
317 |         A[2][1] = A[1][2];
318 |         if (not pm_train_drift())
319 |         {
320 |             A[2][2] = 1.0;
321 |         }
322 |         auto A_copy = A;
323 |         auto B_copy = B;
324 |         // compute scaling vector used for scaled partial pivoting
325 |         std::array< double, 3 > C;
326 |         for (unsigned i = 0; i < 3; ++i)
327 |         {
328 |             C[i] = alg::max_value_of(A[i]); // no need for abs(), as A>0
329 |         }
330 |         LOG(debug1)
331 |             << "A={{" << A[0][0] << ", " << A[0][1] << ", " << A[0][2]
332 |             << "}, {" << A[1][0] << ", " << A[1][1] << ", " << A[1][2]
333 |             << "}, {" << A[2][0] << ", " << A[2][1] << ", " << A[2][2]
334 |             << "}} B={" << B[0] << ", " << B[1] << ", " << B[2]
335 |             << "} C={" << C[0] << ", " << C[1] << ", " << C[2] << "}" << std::endl;
336 |         //
337 |         // solve A * X = B using Gaussian elimination with partial pivoting
338 |         //
339 |         for (unsigned i = 0; i < 3; ++i)
340 |         {
341 |             unsigned p = i;
342 |             double p_val = std::abs(A[i][i]) / C[p];
343 |             for (unsigned i2 = i + 1; i2 < 3; ++i2)
344 |             {
345 |                 double i2_val = std::abs(A[i2][i]) / C[i2];
346 |                 if (i2_val > p_val)
347 |                 {
348 |                     p = i2;
349 |                     p_val = i2_val;
350 |                 }
351 |             }
352 |             LOG(debug1)
353 |                 << "gaussian_elimination i=" << i << " p=" << p << " p_val=" << p_val << std::endl;
354 |             // if the pivot is too small, consider matrix singular, and give up
355 |             if (p_val < 1e-7)
356 |             {
357 |                 done = true;
358 |                 new_pm_params = *data.pm_params_ptr;
359 |                 return;
360 |             }
361 |             // if necessary, interchange rows i & p
362 |             if (p > i)
363 |             {
364 |                 std::swap(A[i], A[p]);
365 |                 std::swap(B[i], B[p]);
366 |                 std::swap(C[i], C[p]);
367 |             }
368 |             // eliminate variable i from the last i-1 equations
369 |             for (p = i + 1; p < 3; ++p)
370 |             {
371 |                 double m = A[p][i] / A[i][i];
372 |                 A[p][i] = 0;
373 |                 for (unsigned j = i + 1; j < 3; ++j)
374 |                 {
375 |                     A[p][j] -= m * A[i][j];
376 |                 }
377 |                 B[p] -= m * B[i];
378 |             }
379 |             LOG(debug1)
380 |                 << "gaussian_elimination i=" << i
381 |                 << " A={{" << A[0][0] << ", " << A[0][1] << ", " << A[0][2]
382 |                 << "}, {" << A[1][0] << ", " << A[1][1] << ", " << A[1][2]
383 |                 << "}, {" << A[2][0] << ", " << A[2][1] << ", " << A[2][2]
384 |                 << "}} B={" << B[0] << ", " << B[1] << ", " << B[2]
385 |                 << "} C={" << C[0] << ", " << C[1] << ", " << C[2] << "}" << std::endl;
386 |         }
387 |         // solve the upper triangular system by hand, storing the solutions as the new parameters
388 |         c_hat = B[2] / A[2][2];
389 |         b_hat = (B[1] - A[1][2] * c_hat) / A[1][1];
390 |         a_hat = (B[0] - A[0][1] * b_hat - A[0][2] * c_hat) / A[0][0];
391 |         LOG(debug1)
392 |             << "update_step a=" << a_hat << " b=" << b_hat << " c=" << c_hat << std::endl;
393 | #ifndef NDEBUG
394 |         // sanity check
395 |         for (unsigned i = 0; i < 3; ++i)
396 |         {
397 |             double x = (A_copy[i][0] * a_hat
398 |                         + A_copy[i][1] * b_hat
399 |                         + A_copy[i][2] * c_hat);
400 |             ASSERT((x - B_copy[i])/std::max(x, B_copy[i]) < pm_train_drift()? 1e-3 : 1e-2);
401 |         }
402 | #endif
403 |         //
404 |         // update var
405 |         //
406 |         double d_numer = (D
407 |                           + a_hat * a_hat * A_copy[0][0]
408 |                           + b_hat * b_hat * A_copy[1][1]
409 |                           + c_hat * c_hat * A_copy[2][2]
410 |                           + 2.0 * a_hat * b_hat * A_copy[0][1]
411 |                           + 2.0 * a_hat * c_hat * A_copy[0][2]
412 |                           + 2.0 * b_hat * c_hat * A_copy[1][2]
413 |                           - 2.0 * (a_hat * B_copy[0]
414 |                                    + b_hat * B_copy[1]
415 |                                    + c_hat * B_copy[2])
416 |             );
417 |         d_hat = std::sqrt(d_numer / (double)total_n_events);
418 |         LOG(debug1) << "update_step d=" << d_hat << std::endl;
419 |         //
420 |         // update scale_sd
421 |         //
422 |         v_hat = V_numer / V_denom;
423 |         //
424 |         // update var_sd
425 |         //
426 |         u_hat = (double)total_n_events / (U_pos - V_denom / v_hat);
427 |     }
428 | 
429 |     /**
430 |      * Train st_params on training data.
431 |      * @data Training data, as filled by fill_train_data.
432 |      * @new_st_params Destination for new st params.
433 |      */
434 |     static void train_st_params(const Train_Data& data,
435 |                                 std::array< State_Transition_Parameters_Type, 2 >& new_st_params)
436 |     {
437 |         unsigned n_event_seqs = data.event_seq_ptr_v.size();
438 |         for (unsigned st = 0; st < 2; ++st)
439 |         {
440 |             ASSERT(data.st_params_ptr_v[st]);
441 |             LogSumSet_Type s_p_stay_num(false);
442 |             LogSumSet_Type s_p_skip_num(false);
443 |             LogSumSet_Type s_denom(false);
444 |             Float_Type log_p_stay = std::log(data.st_params_ptr_v[st]->p_stay);
445 |             Float_Type log_p_step_4 = std::log(1.0 - data.st_params_ptr_v[st]->p_stay - data.st_params_ptr_v[st]->p_skip) - std::log(4.0);
446 |             for (unsigned k = 0; k < n_event_seqs; ++k)
447 |             {
448 |                 if (data.event_seq_ptr_v[k].second != st) continue;
449 |                 const Pore_Model_Type& scaled_pm = data.scaled_model_v[st];
450 |                 const Event_Sequence_Type& corrected_events = data.corrected_event_seq_v.at(k);
451 |                 unsigned n_events = corrected_events.size();
452 |                 const Forward_Backward_Type& fwbw = data.fwbw_v.at(k);
453 |                 //
454 |                 // P[S_i = j1, S_{i+1} = j2]
455 |                 //
456 |                 auto log_joint_prob = [&] (unsigned i, unsigned j1, unsigned j2, Float_Type log_p_trans) {
457 |                     Float_Type p = fwbw.cell(i, j1).alpha
458 |                         + log_p_trans
459 |                         + scaled_pm.log_pr_corrected_emission(j2, corrected_events[i + 1])
460 |                         + fwbw.cell(i + 1, j2).beta
461 |                         - fwbw.log_pr_data();
462 |                     LOG(debug2) << "step_prob k=" << k
463 |                                 << " i=" << i
464 |                                 << " j1=" << Kmer_Type::to_string(j1)
465 |                                 << " j2=" << Kmer_Type::to_string(j2)
466 |                                 << " log_p_trans=" << log_p_trans
467 |                                 << " res=" << p << std::endl;
468 |                     return p;
469 |                 };
470 | 
471 |                 for (unsigned i = 0; i < n_events - 1; ++i)
472 |                 {
473 |                     for (auto j1 : st_train_kmers())
474 |                     {
475 |                         // Pr[ S_i = j1 ]
476 |                         Float_Type log_p_j1 = fwbw.log_posterior(i, j1);
477 |                         s_denom.add(log_p_j1);
478 |                         // Pr[ S_i = j1, S_{i+1} = j1 ]
479 |                         Float_Type log_p_j1_j1 = log_joint_prob(i, j1, j1, log_p_stay);
480 |                         if (log_p_j1_j1 > log_p_j1)
481 |                         {
482 |                             if (log_p_j1_j1 > log_p_j1 + std::max(std::abs(log_p_j1), 1.0f) * 1.0e-3)
483 |                             {
484 |                                 LOG(warning) << "numerical error log_p_j1 [" << log_p_j1
485 |                                              << "] log_p_j1_j1 [" << log_p_j1_j1 << "]" << std::endl;
486 |                             }
487 |                             log_p_j1_j1 = log_p_j1;
488 |                         }
489 |                         s_p_stay_num.add(log_p_j1_j1);
490 |                         // Pr[ S_i = j1, dist(j1,S_{i+1}) > 1 ]
491 |                         Float_Type log_p_j1_d01;
492 |                         {
493 |                             LogSumSet_Type s2(false);
494 |                             s2.add(log_p_j1_j1);
495 |                             for (auto j2 : Kmer_Type::neighbour_list(j1, 1))
496 |                             {
497 |                                 // transition prob j1 to j2 is (p_step / 4)
498 |                                 s2.add(log_joint_prob(i, j1, j2, log_p_step_4));
499 |                             }
500 |                             log_p_j1_d01 = s2.val();
501 |                         }
502 |                         if (log_p_j1_d01 > log_p_j1)
503 |                         {
504 |                             if (log_p_j1_d01 > log_p_j1 + std::max(std::abs(log_p_j1), 1.0f) * 1.0e-3)
505 |                             {
506 |                                 LOG(warning) << "numerical error log_p_j1 [" << log_p_j1
507 |                                              << "] log_p_j1_d01 [" << log_p_j1_d01 << "]" << std::endl;
508 |                             }
509 |                             log_p_j1_d01 = log_p_j1;
510 |                         }
511 |                         Float_Type p_j1_d2 = std::exp(log_p_j1) - std::exp(log_p_j1_d01);
512 |                         s_p_skip_num.add(std::log(p_j1_d2));
513 |                     } // for j1
514 |                 } // for i
515 |             } // for k
516 |             new_st_params[st].p_stay = std::exp(s_p_stay_num.val() - s_denom.val());
517 |             new_st_params[st].p_skip = std::exp(s_p_skip_num.val() - s_denom.val());
518 |             if (new_st_params[st].p_stay < .05 or new_st_params[st].p_stay > .4
519 |                 or new_st_params[st].p_skip < .05 or new_st_params[st].p_skip > .4)
520 |             {
521 |                 State_Transition_Parameters_Type alt_st_params;
522 |                 alt_st_params.p_stay = std::max(new_st_params[st].p_stay, .05f);
523 |                 alt_st_params.p_stay = std::min(alt_st_params.p_stay, .4f);
524 |                 alt_st_params.p_skip = std::max(new_st_params[st].p_skip, .05f);
525 |                 alt_st_params.p_skip = std::min(alt_st_params.p_skip, .4f);
526 |                 LOG(warning) << "unusual state transition parameters " << new_st_params[st]
527 |                              << " for strand [" << st
528 |                              << "] resetting them to " << alt_st_params << std::endl;
529 |                 std::swap(alt_st_params, new_st_params[st]);
530 |             }
531 |         } // for st
532 |     } // train_st_params()
533 | 
534 |     /**
535 |      * Perform one training round.
536 |      * @new_pm_params Destination for trained pm params (common to both strands)
537 |      * @new_st_params Destination for trained st params (per strand)
538 |      * @fit Destination for pr_data using crt params
539 |      * @done Bool; set to true if no more training rounds can be performed due to singularity.
540 |      */
541 |     static void train_one_round(
542 |         const std::vector< std::pair< const Event_Sequence_Type*, unsigned > >& event_seq_ptrs,
543 |         const std::array< const Pore_Model_Type*, 2 >& model_ptrs,
544 |         const State_Transitions_Type& default_transitions,
545 |         const Pore_Model_Parameters_Type& crt_pm_params,
546 |         const std::array< State_Transition_Parameters_Type, 2 >& crt_st_params,
547 |         Pore_Model_Parameters_Type& new_pm_params,
548 |         std::array< State_Transition_Parameters_Type, 2 >& new_st_params,
549 |         Float_Type& fit,
550 |         bool& done,
551 |         bool train_scaling,
552 |         bool train_transitions)
553 |     {
554 |         // initialize training data
555 |         Train_Data data;
556 |         data.event_seq_ptr_v = event_seq_ptrs;
557 |         data.model_ptr_v = model_ptrs;
558 |         data.default_transitions_ptr = &default_transitions;
559 |         data.pm_params_ptr = &crt_pm_params;
560 |         data.st_params_ptr_v = {{ &crt_st_params[0], &crt_st_params[1] }};
561 |         // fill the training data
562 |         fill_train_data(data);
563 |         fit = data.fit;
564 |         if (train_scaling)
565 |         {
566 |             // train pm params
567 |             train_pm_params(data, new_pm_params, done);
568 |             if (done)
569 |             {
570 |                 new_st_params = crt_st_params;
571 |                 return;
572 |             }
573 |         }
574 |         if (train_transitions)
575 |         {
576 |             // train st params
577 |             train_st_params(data, new_st_params);
578 |         }
579 |     } // train_one_round
580 | 
581 | }; // class Parameter_Trainer
582 | 
583 | #endif
584 | 


--------------------------------------------------------------------------------
/src/nanocall/Pore_Model.hpp:
--------------------------------------------------------------------------------
  1 | //---------------------------------------------------------
  2 | // Copyright 2015 Ontario Institute for Cancer Research
  3 | // Written by Jared Simpson (jared.simpson@oicr.on.ca)
  4 | //---------------------------------------------------------
  5 | //
  6 | // nanopolish_poremodel -- Representation of the Oxford
  7 | // Nanopore sequencing model, as described in a FAST5 file
  8 | //
  9 | #ifndef __POREMODEL_HPP
 10 | #define __POREMODEL_HPP
 11 | 
 12 | #include <cassert>
 13 | #include <cmath>
 14 | #include <iomanip>
 15 | #include <iostream>
 16 | #include <map>
 17 | #include <string>
 18 | 
 19 | #include "Kmer.hpp"
 20 | #include "Event.hpp"
 21 | #include "fast5.hpp"
 22 | #include "alg.hpp"
 23 | 
 24 | template < typename Float_Type >
 25 | inline Float_Type log_normal_pdf(Float_Type x, Float_Type mean, Float_Type stdv, Float_Type log_stdv)
 26 | {
 27 |     // From SO: http://stackoverflow.com/questions/10847007/using-the-gaussian-probability-density-function-in-c
 28 |     static const Float_Type log_2pi = std::log(2.0 * M_PI);
 29 |     Float_Type a = (x - mean) / stdv;
 30 |     return - log_stdv - (log_2pi + a * a) / static_cast< Float_Type >(2.0);
 31 | }
 32 | 
 33 | template < typename Float_Type >
 34 | inline Float_Type log_invgauss_pdf(Float_Type x, Float_Type log_x,
 35 |                                    Float_Type mu, Float_Type lambda, Float_Type log_lambda)
 36 | {
 37 |     static const Float_Type log_2pi = std::log(2.0 * M_PI);
 38 |     Float_Type a = (x - mu) / mu;
 39 |     return (log_lambda - log_2pi - static_cast< Float_Type >(3.0) * log_x - lambda * a * a / x) / static_cast< Float_Type >(2.0);
 40 | }
 41 | 
 42 | template < typename Float_Type >
 43 | struct Pore_Model_Parameters
 44 | {
 45 |     Pore_Model_Parameters() : scale(1.0), shift(0.0), drift(0.0), var(1.0), scale_sd(1.0), var_sd(1.0) {}
 46 | 
 47 |     Float_Type scale;
 48 |     Float_Type shift;
 49 |     Float_Type drift;
 50 |     Float_Type var;
 51 |     Float_Type scale_sd;
 52 |     Float_Type var_sd;
 53 | 
 54 |     void load_from_fast5(const fast5::File& f, bool strand)
 55 |     {
 56 |         assert(f.have_basecall_model(strand));
 57 |         auto m_p = f.get_basecall_model_params(strand);
 58 |         scale = m_p.scale;
 59 |         shift = m_p.shift;
 60 |         drift = m_p.drift;
 61 |         var = m_p.var;
 62 |         scale_sd = m_p.scale_sd;
 63 |         var_sd = m_p.var_sd;
 64 |     }
 65 | 
 66 |     friend std::ostream& operator << (std::ostream& os, const Pore_Model_Parameters& p)
 67 |     {
 68 |         os << "[scale=" << p.scale << " shift=" << p.shift << " drift=" << p.drift
 69 |            << " var=" << p.var << " scale_sd=" << p.scale_sd << " var_sd=" << p.var_sd << "]";
 70 |         return os;
 71 |     }
 72 |     void write_tsv(std::ostream& os) const
 73 |     {
 74 |         os << std::fixed << std::setprecision(5)
 75 |            << scale << '\t' << shift << '\t' << drift << '\t' << var << '\t' << scale_sd << '\t' << var_sd;
 76 |     }
 77 | }; // struct Pore_Model_Parameters
 78 | 
 79 | template < typename Float_Type, unsigned Kmer_Size >
 80 | struct Pore_Model_State
 81 | {
 82 |     typedef Event< Float_Type, Kmer_Size > Event_Type;
 83 |     typedef Pore_Model_Parameters< Float_Type > Pore_Model_Parameters_Type;
 84 | 
 85 |     Float_Type level_mean;
 86 |     Float_Type level_stdv;
 87 |     Float_Type sd_mean;
 88 |     Float_Type sd_stdv;
 89 |     Float_Type sd_lambda;
 90 | 
 91 |     Float_Type log_level_mean;
 92 |     Float_Type log_level_stdv;
 93 |     Float_Type log_sd_mean;
 94 |     Float_Type log_sd_stdv;
 95 |     Float_Type log_sd_lambda;
 96 | 
 97 |     std::array< char, Kmer_Size > kmer;
 98 | 
 99 |     Pore_Model_State& operator = (const fast5::Model_Entry& e)
100 |     {
101 |         level_mean = e.level_mean;
102 |         level_stdv = e.level_stdv;
103 |         sd_mean = e.sd_mean;
104 |         sd_stdv = e.sd_stdv;
105 |         std::copy_n(e.kmer.begin(), Kmer_Size, kmer.begin());
106 |         update_sd_lambda();
107 |         update_logs();
108 |         return *this;
109 |     }
110 | 
111 |     // update sd_lambda based on sd_mean & sd_stdv
112 |     void update_sd_lambda() { sd_lambda = pow(sd_mean, 3.0) / pow(sd_stdv, 2.0); }
113 | 
114 |     // update sd_stdv based on sd_mean & sd_lambda
115 |     void update_sd_stdv() { sd_stdv = pow(pow(sd_mean, 3.0) / sd_lambda, .5); }
116 | 
117 |     // update logs
118 |     void update_logs()
119 |     {
120 |         log_level_mean = std::log(level_mean);
121 |         log_level_stdv = std::log(level_stdv);
122 |         log_sd_mean = std::log(sd_mean);
123 |         log_sd_lambda = std::log(sd_lambda);
124 |     }
125 | 
126 |     void scale(const Pore_Model_Parameters_Type& params, const Pore_Model_Parameters_Type& log_params)
127 |     {
128 |         // these functions are provided by ONT
129 |         level_mean = level_mean * params.scale + params.shift;
130 |         level_stdv = level_stdv * params.var;
131 |         sd_mean = sd_mean * params.scale_sd;
132 |         sd_lambda = sd_lambda * params.var_sd;
133 |         update_sd_stdv();
134 |         log_level_mean = std::log(level_mean);
135 |         log_level_stdv += log_params.var;
136 |         log_sd_mean += log_params.scale_sd;
137 |         log_sd_lambda += log_params.var_sd;
138 |     }
139 | 
140 |     Float_Type log_pr_emission(const Event_Type& e) const
141 |     {
142 |         return (log_normal_pdf< Float_Type >(e.mean, level_mean, level_stdv, log_level_stdv)
143 |                 + log_invgauss_pdf< Float_Type >(e.stdv, e.log_stdv, sd_mean, sd_lambda, log_sd_lambda));
144 |     }
145 |     Float_Type log_pr_corrected_emission(const Event_Type& e) const
146 |     {
147 |         return (log_normal_pdf< Float_Type >(e.corrected_mean, level_mean, level_stdv, log_level_stdv)
148 |                 + log_invgauss_pdf< Float_Type >(e.stdv, e.log_stdv, sd_mean, sd_lambda, log_sd_lambda));
149 |     }
150 | 
151 |     friend std::ostream& operator << (std::ostream& os, const Pore_Model_State& state)
152 |     {
153 |         os << std::string(state.kmer.begin(), state.kmer.end()) << '\t'
154 |            << state.level_mean << '\t'
155 |            << state.level_stdv << '\t'
156 |            << state.sd_mean << '\t'
157 |            << state.sd_stdv;
158 |         return os;
159 |     }
160 | 
161 |     friend bool operator < (const Pore_Model_State& lhs, const Pore_Model_State& rhs)
162 |     {
163 |         return lhs.kmer < rhs.kmer;
164 |     }
165 | }; // struct Pore_Model_State
166 | 
167 | template < typename Float_Type, unsigned Kmer_Size = 6 >
168 | class Pore_Model
169 | {
170 | public:
171 |     typedef Kmer< Kmer_Size > Kmer_Type;
172 |     typedef Event< Float_Type, Kmer_Size > Event_Type;
173 |     typedef Pore_Model_State< Float_Type, Kmer_Size > Pore_Model_State_Type;
174 |     typedef Pore_Model_Parameters< Float_Type > Pore_Model_Parameters_Type;
175 |     static const unsigned n_states = 1u << (2 * Kmer_Size);
176 | 
177 |     Pore_Model() : _strand(2) {}
178 |     void clear() { _state.clear(); }
179 | 
180 |     const Pore_Model_State_Type& state(unsigned i) const { return _state.at(i); }
181 |     Pore_Model_State_Type& state(unsigned i) { return _state.at(i); }
182 | 
183 |     const std::vector< Pore_Model_State_Type >& get_state_vector() const { return _state; }
184 | 
185 |     const unsigned& strand() const { return _strand; }
186 |     unsigned& strand() { return _strand; }
187 |     Float_Type mean() const { return _mean; }
188 |     Float_Type stdv() const { return _stdv; }
189 | 
190 |     void scale(const Pore_Model_Parameters_Type& params)
191 |     {
192 |         Pore_Model_Parameters_Type log_params;
193 |         log_params.var = std::log(params.var);
194 |         log_params.scale_sd = std::log(params.scale_sd);
195 |         log_params.var_sd = std::log(params.var_sd);
196 |         for(unsigned i = 0; i < n_states; ++i)
197 |         {
198 |             state(i).scale(params, log_params);
199 |         }
200 |         update_statistics();
201 |     }
202 | 
203 |     // load model from fast5 file
204 |     void load_from_fast5(const fast5::File& f, bool strand)
205 |     {
206 |         assert(f.have_basecall_model(strand));
207 |         auto m = f.get_basecall_model(strand);
208 |         assert(m.size() == n_states);
209 |         _state.clear();
210 |         _state.reserve(n_states);
211 |         for (unsigned i = 0; i < n_states; ++i)
212 |         {
213 |             _state.emplace_back();
214 |             state(i) = m.at(i);
215 |         }
216 |         update_statistics();
217 |     }
218 | 
219 |     // load from vector
220 |     template < typename V_Float_Type >
221 |     void load_from_vector(const std::vector< V_Float_Type >& v)
222 |     {
223 |         assert(v.size() == n_states * 4);
224 |         _state.clear();
225 |         _state.reserve(n_states);
226 |         for (unsigned i = 0; i < n_states; ++i)
227 |         {
228 |             _state.emplace_back();
229 |             state(i).level_mean = v[4 * i + 0];
230 |             state(i).level_stdv = v[4 * i + 1];
231 |             state(i).sd_mean = v[4 * i + 2];
232 |             state(i).sd_stdv = v[4 * i + 3];
233 |             auto s = Kmer_Type::to_string(i);
234 |             std::copy_n(s.begin(), Kmer_Size, state(i).kmer.begin());
235 |             state(i).update_sd_lambda();
236 |             state(i).update_logs();
237 |         }
238 |         update_statistics();
239 |     }
240 | 
241 |     // write model to out stream
242 |     friend std::ostream& operator << (std::ostream& os, const Pore_Model& pm)
243 |     {
244 |         for (unsigned i = 0; i < pm.n_states; ++i)
245 |         {
246 |             os << pm.state(i) << std::endl;
247 |         }
248 |         return os;
249 |     }
250 |     // load model from input stream
251 |     friend std::istream& operator >> (std::istream& is, Pore_Model& pm)
252 |     {
253 |         pm._state.clear();
254 |         pm._state.reserve(n_states);
255 |         unsigned i = 0;
256 |         std::string line;
257 |         while (std::getline(is, line))
258 |         {
259 |             std::istringstream iss(line);
260 |             std::string s;
261 |             iss >> s;
262 |             if (s[0] == '#') continue;
263 |             if (line.find("kmer") != std::string::npos) continue;
264 |             pm._state.emplace_back();
265 |             iss >> pm.state(i).level_mean
266 |                 >> pm.state(i).level_stdv
267 |                 >> pm.state(i).sd_mean
268 |                 >> pm.state(i).sd_stdv;
269 |             std::copy_n(s.begin(), Kmer_Size, pm.state(i).kmer.begin());
270 |             pm.state(i).update_sd_lambda();
271 |             pm.state(i).update_logs();
272 |             ++i;
273 |         }
274 |         if (i != pm.n_states)
275 |         {
276 |             LOG(error)
277 |                 << "unexpected number of states" << std::endl;
278 |             std::exit(EXIT_FAILURE);
279 |         }
280 |         std::sort(pm._state.begin(), pm._state.end());
281 |         for (unsigned i = 0; i < pm.n_states; ++i)
282 |         {
283 |             assert(Kmer_Type::to_int(pm.state(i).kmer) == i);
284 |         }
285 |         pm.update_statistics();
286 |         return is;
287 |     }
288 | 
289 |     // log of probability of an emission from a state
290 |     Float_Type log_pr_emission(unsigned i, const Event_Type& e) const
291 |     {
292 |         Float_Type res = state(i).log_pr_emission(e);
293 |         return res;
294 |     }
295 |     Float_Type log_pr_corrected_emission(unsigned i, const Event_Type& e) const
296 |     {
297 |         Float_Type res = state(i).log_pr_corrected_emission(e);
298 |         return res;
299 |     }
300 | 
301 | private:
302 |     std::vector< Pore_Model_State_Type > _state;
303 |     Float_Type _mean;
304 |     Float_Type _stdv;
305 |     unsigned _strand;
306 | 
307 |     void update_statistics()
308 |     {
309 |         assert(_state.size() == n_states);
310 |         std::tie(_mean, _stdv) = alg::mean_stdv_of< Float_Type >(
311 |             _state,
312 |             [] (const Pore_Model_State_Type& s) { return s.level_mean; });
313 |     }
314 | }; // class Pore_Model
315 | 
316 | template < typename Float_Type, unsigned Kmer_Size >
317 | using Pore_Model_Dict = std::map< std::string, Pore_Model< Float_Type, Kmer_Size > >;
318 | 
319 | #endif
320 | 


--------------------------------------------------------------------------------
/src/nanocall/State_Transitions.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef __STATE_TRANSITIONS_BASE_HPP
  2 | #define __STATE_TRANSITIONS_BASE_HPP
  3 | 
  4 | #include <cassert>
  5 | #include <cmath>
  6 | #include <iostream>
  7 | #include <vector>
  8 | #include <map>
  9 | 
 10 | #include "Kmer.hpp"
 11 | #include "logsumset.hpp"
 12 | #include "logger.hpp"
 13 | 
 14 | template < typename Float_Type >
 15 | struct State_Transition_Parameters
 16 | {
 17 |     Float_Type p_stay;
 18 |     Float_Type p_skip;
 19 | 
 20 |     static Float_Type& default_p_stay()
 21 |     {
 22 |         static Float_Type _default_p_stay = .09;
 23 |         return _default_p_stay;
 24 |     }
 25 |     static Float_Type& default_p_skip()
 26 |     {
 27 |         static Float_Type _default_p_skip = .28;
 28 |         return _default_p_skip;
 29 |     }
 30 | 
 31 |     State_Transition_Parameters()
 32 |         : p_stay(default_p_stay()), p_skip(default_p_skip()) {}
 33 | 
 34 |     bool is_default() const
 35 |     {
 36 |         return p_stay == default_p_stay() and p_skip == default_p_skip();
 37 |     }
 38 | 
 39 |     friend std::ostream& operator << (std::ostream& os, const State_Transition_Parameters& stp)
 40 |     {
 41 |         os << "[p_stay=" << stp.p_stay
 42 |            << " p_skip=" << stp.p_skip << "]";
 43 |         return os;
 44 |     }
 45 |     void write_tsv(std::ostream& os) const
 46 |     {
 47 |         os << std::fixed << std::setprecision(5)
 48 |            << p_stay << '\t'
 49 |            << p_skip;
 50 |     }
 51 | }; // struct State_Transition_Parameters
 52 | 
 53 | template < typename Float_Type >
 54 | struct State_Neighbours
 55 | {
 56 |     State_Neighbours() : p_rest_from(-INFINITY), p_rest_to(-INFINITY) {}
 57 |     std::vector< std::pair< unsigned, Float_Type > > from_v;
 58 |     std::vector< std::pair< unsigned, Float_Type > > to_v;
 59 |     Float_Type p_rest_from;
 60 |     Float_Type p_rest_to;
 61 | }; // struct State_Neighbours
 62 | 
 63 | template < typename Float_Type, unsigned Kmer_Size = 6 >
 64 | class State_Transitions
 65 | {
 66 | public:
 67 |     typedef Kmer< Kmer_Size > Kmer_Type;
 68 |     typedef State_Neighbours< Float_Type > State_Neighbours_Type;
 69 |     typedef State_Transition_Parameters< Float_Type > State_Transition_Parameters_Type;
 70 |     static const unsigned n_states = 1u << (2 * Kmer_Size);
 71 | 
 72 |     State_Transitions() = default;
 73 |     void clear() { _neighbours.clear(); }
 74 | 
 75 |     const State_Neighbours_Type& neighbours(unsigned i) const { return _neighbours.at(i); }
 76 |     State_Neighbours_Type& neighbours(unsigned i) { return _neighbours.at(i); }
 77 | 
 78 |     // update fields from_v, p_rest_from, p_rest_to based on to_v
 79 |     void update_fields()
 80 |     {
 81 |         for (unsigned i = 0; i < n_states; ++i)
 82 |         {
 83 |             neighbours(i).from_v.clear();
 84 |         }
 85 |         for (unsigned i = 0; i < n_states; ++i)
 86 |         {
 87 |             logsum::logsumset< Float_Type > s(false);
 88 |             for (const auto& p : neighbours(i).to_v)
 89 |             {
 90 |                 neighbours(p.first).from_v.push_back(std::make_pair(i, p.second));
 91 |                 s.add(p.second);
 92 |             }
 93 |             neighbours(i).p_rest_to = std::log(1 - std::exp(s.val()));
 94 |         }
 95 |         for (unsigned i = 0; i < n_states; ++i)
 96 |         {
 97 |             logsum::logsumset< Float_Type > s(false);
 98 |             for (const auto& p : neighbours(i).from_v)
 99 |             {
100 |                 s.add(p.second);
101 |             }
102 |             neighbours(i).p_rest_from = std::log(1 - std::exp(s.val()));
103 |         }
104 |     }
105 | 
106 |     // drop transitions with low probability
107 |     void drop_transitions(Float_Type p_cutoff)
108 |     {
109 |         Float_Type log_p_cutoff = std::log(p_cutoff);
110 |         for (unsigned i = 0; i < n_states; ++i)
111 |         {
112 |             decltype(neighbours(i).to_v) to_v;
113 |             for (const auto& p : neighbours(i).to_v)
114 |             {
115 |                 if (p.second > log_p_cutoff)
116 |                 {
117 |                     to_v.push_back(p);
118 |                 }
119 |             }
120 |             neighbours(i).to_v = std::move(to_v);
121 |         }
122 |         update_fields();
123 |     }
124 | 
125 |     static Float_Type get_trans_prob(unsigned i, unsigned j,
126 |                                      Float_Type p_stay, Float_Type p_step, Float_Type p_skip_1)
127 |     {
128 |         Float_Type p = 0;
129 |         if (i == j)
130 |         {
131 |             p += p_stay;
132 |         }
133 |         if (Kmer_Type::suffix(i, Kmer_Size - 1) == Kmer_Type::prefix(j, Kmer_Size - 1))
134 |         {
135 |             p += p_step / 4;
136 |         }
137 |         for (unsigned l = 2; l < Kmer_Size; ++l)
138 |             if (Kmer_Type::suffix(i, Kmer_Size - l) == Kmer_Type::prefix(j, Kmer_Size - l))
139 |             {
140 |                 p += pow(p_skip_1, l - 1) / (1u << (2 * l));
141 |             }
142 |         p += (pow(p_skip_1, 5) / (Float_Type(1.0) - p_skip_1)) / n_states;
143 |         return p;
144 |     }
145 | 
146 |     // recompute transition table
147 |     void compute_transitions(Float_Type p_skip_default, Float_Type p_stay, Float_Type p_cutoff,
148 |                              const std::map< unsigned, Float_Type >& p_skip_map = {})
149 |     {
150 |         _neighbours.clear();
151 |         _neighbours.reserve(n_states);
152 |         for (unsigned i = 0; i < n_states; ++i)
153 |         {
154 |             _neighbours.emplace_back();
155 |             Float_Type p_skip = p_skip_default;
156 |             if (p_skip_map.count(i))
157 |             {
158 |                 p_skip = p_skip_map.at(i);
159 |             }
160 |             Float_Type p_step = 1.0 - p_stay - p_skip;
161 |             // p_skip = sum_{i>=1} p_skip_1^i
162 |             Float_Type p_skip_1 = p_skip / (p_skip + 1.0);
163 |             LOG(debug2) << "i=" << Kmer_Type::to_string(i)
164 |                         << " p_stay=" << p_stay
165 |                         << " p_skip=" << p_skip
166 |                         << " p_step=" << p_step
167 |                         << " p_skip_1=" << p_skip_1 << std::endl;
168 |             for (unsigned j = 0; j < n_states; ++j)
169 |             {
170 |                 Float_Type p = get_trans_prob(i, j, p_stay, p_step, p_skip_1);
171 |                 if (p > p_cutoff)
172 |                 {
173 |                     neighbours(i).to_v.push_back(std::make_pair(j, std::log(p)));
174 |                 }
175 |             }
176 |         }
177 |         update_fields();
178 |     }
179 | 
180 |     // compute transition table allowing a maximum of 1 skip
181 |     void compute_transitions_fast(Float_Type p_skip_default, Float_Type p_stay,
182 |                                   const std::map< unsigned, Float_Type >& p_skip_map = {})
183 |     {
184 |         struct Default_Float_Type
185 |         {
186 |             Default_Float_Type(Float_Type _val = 0.0) : val(_val) {}
187 |             Float_Type val;
188 |         }; // struct Default_Float
189 | 
190 |         _neighbours.clear();
191 |         _neighbours.reserve(n_states);
192 |         for (unsigned i = 0; i < n_states; ++i)
193 |         {
194 |             _neighbours.emplace_back();
195 |             Float_Type p_skip = p_skip_default;
196 |             if (p_skip_map.count(i))
197 |             {
198 |                 p_skip = p_skip_map.at(i);
199 |             }
200 |             Float_Type p_step = 1.0 - p_stay - p_skip;
201 |             // p_skip = sum_{i>=1} p_skip_1^i
202 |             Float_Type p_skip_1 = p_skip / (p_skip + 1.0);
203 |             LOG(debug2) << "i=" << Kmer_Type::to_string(i)
204 |                         << " p_stay=" << p_stay
205 |                         << " p_skip=" << p_skip
206 |                         << " p_step=" << p_step
207 |                         << " p_skip_1=" << p_skip_1 << std::endl;
208 |             std::set< unsigned > to_s{i};
209 |             const auto& nl1 = Kmer_Type::neighbour_list(i, 1);
210 |             to_s.insert(nl1.begin(), nl1.end());
211 |             const auto& nl2 = Kmer_Type::neighbour_list(i, 2);
212 |             to_s.insert(nl2.begin(), nl2.end());
213 |             for (const auto& j : to_s)
214 |             {
215 |                 Float_Type p = get_trans_prob(i, j, p_stay, p_step, p_skip_1);
216 |                 neighbours(i).to_v.push_back(std::make_pair(j, std::log(p)));
217 |             }
218 |         }
219 |         update_fields();
220 |     }
221 |     void compute_transitions_fast(const State_Transition_Parameters_Type& stp)
222 |     {
223 |         compute_transitions_fast(stp.p_skip, stp.p_stay);
224 |     }
225 | 
226 |     friend std::ostream& operator << (std::ostream& os, const State_Transitions& st)
227 |     {
228 |         for (unsigned i = 0; i < n_states; ++i)
229 |         {
230 |             for (const auto& p : st.neighbours(i).to_v)
231 |             {
232 |                 os << Kmer_Type::to_string(i) << '\t' << Kmer_Type::to_string(p.first) << '\t' << p.second << std::endl;
233 |             }
234 |         }
235 |         return os;
236 |     }
237 |     friend std::istream& operator >> (std::istream& is, State_Transitions& st)
238 |     {
239 |         st._neighbours.clear();
240 |         st._neighbours.resize(n_states);
241 |         std::string k_i;
242 |         std::string k_j;
243 |         Float_Type p;
244 |         while (is >> k_i >> k_j >> p)
245 |         {
246 |             unsigned i = Kmer_Type::to_int(k_i);
247 |             unsigned j = Kmer_Type::to_int(k_j);
248 |             st.neighbours(i).to_v.push_back(std::make_pair(j, p));
249 |         }
250 |         st.update_fields();
251 |         return is;
252 |     }
253 | 
254 | private:
255 |     std::vector< State_Neighbours_Type > _neighbours;
256 | }; // class State_Transitions
257 | 
258 | #endif
259 | 


--------------------------------------------------------------------------------
/src/nanocall/Viterbi.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef __VITERBI_HPP
  2 | #define __VITERBI_HPP
  3 | 
  4 | #include <cmath>
  5 | #include <iostream>
  6 | #include <vector>
  7 | #include <set>
  8 | 
  9 | #include "Pore_Model.hpp"
 10 | #include "State_Transitions.hpp"
 11 | #include "logsumset.hpp"
 12 | #include "logger.hpp"
 13 | #include "fast5.hpp"
 14 | 
 15 | template < typename Float_Type, unsigned Kmer_Size = 6 >
 16 | class Viterbi
 17 | {
 18 | public:
 19 |     typedef Kmer< Kmer_Size > Kmer_Type;
 20 |     typedef Pore_Model< Float_Type, Kmer_Size > Pore_Model_Type;
 21 |     typedef State_Transitions< Float_Type, Kmer_Size > State_Transitions_Type;
 22 |     typedef Event< Float_Type, Kmer_Size > Event_Type;
 23 |     typedef Event_Sequence< Float_Type, Kmer_Size > Event_Sequence_Type;
 24 |     typedef logsum::logsumset< Float_Type > LogSumSet_Type;
 25 | 
 26 |     struct Matrix_Entry
 27 |     {
 28 |         Float_Type alpha; // := Pr[ MLSS producing e_1 ... e_i, with S_i == j ]
 29 |         unsigned beta;    // := previous state in the MLSS
 30 |     }; // struct Matrix_Entry
 31 | 
 32 |     static const unsigned n_states = Pore_Model_Type::n_states;
 33 | 
 34 |     unsigned n_events() const { return _n_events; }
 35 |     Float_Type path_probability() const { return _path_probability; }
 36 | 
 37 |     // i: event index
 38 |     // j: state/kmer index
 39 |     const Matrix_Entry& cell(unsigned i, unsigned j) const { return _m[i * n_states + j]; }
 40 |     Matrix_Entry& cell(unsigned i, unsigned j) { return _m[i * n_states + j]; }
 41 | 
 42 |     static unsigned& n_threads() { static unsigned _n_threads = 1; return _n_threads; }
 43 | 
 44 |     void fill(const Pore_Model_Type& pm,
 45 |               const State_Transitions_Type& st,
 46 |               Event_Sequence_Type& ev)
 47 |     {
 48 |         _n_events = ev.size();
 49 |         _m.clear();
 50 |         _m.resize(n_states * n_events());
 51 |         Float_Type log_n_states = std::log(static_cast< Float_Type >(n_states));
 52 |         //
 53 |         // alpha, beta; i == 0
 54 |         //
 55 |         {
 56 |             LOG("Viterbi", debug1) << "forward: i=0" << std::endl;
 57 |             for (unsigned j = 0; j < n_states; ++j)
 58 |             {
 59 |                 // alpha
 60 |                 cell(0, j).alpha = pm.log_pr_corrected_emission(j, ev[0]) - log_n_states;
 61 |                 // beta
 62 |                 cell(0, j).beta = n_states;
 63 |                 LOG("Viterbi", debug2)
 64 |                     << "i=0 j=" << Kmer_Type::to_string(j)
 65 |                     << " alpha=" << cell(0, j).alpha
 66 |                     << " beta=" << cell(0, j).beta << std::endl;
 67 |             }
 68 |         }
 69 |         //
 70 |         // alpha, beta; i > 0
 71 |         //
 72 |         for (unsigned i = 1; i < n_events(); ++i)
 73 |         {
 74 |             LOG("Viterbi", debug1) << "forward: i=" << i << std::endl;
 75 |             for (unsigned j = 0; j < n_states; ++j) // TODO: parallelize
 76 |             {
 77 |                 cell(i, j).alpha = -INFINITY;
 78 |                 cell(i, j).beta = n_states;
 79 |                 for (const auto& p : st.neighbours(j).from_v)
 80 |                 {
 81 |                     const unsigned& j_prev = p.first;
 82 |                     const Float_Type& log_pr_transition = p.second;
 83 |                     Float_Type v = log_pr_transition + cell(i - 1, j_prev).alpha;
 84 |                     if (v > cell(i, j).alpha)
 85 |                     {
 86 |                         cell(i, j).alpha = v;
 87 |                         cell(i, j).beta = j_prev;
 88 |                     }
 89 |                 }
 90 |                 cell(i, j).alpha += pm.log_pr_corrected_emission(j, ev[i]);
 91 |                 LOG("Viterbi", debug2)
 92 |                     << "i=" << i << " j=" << Kmer_Type::to_string(j)
 93 |                     << " alpha=" << cell(i, j).alpha
 94 |                     << " beta=" << cell(i, j).beta << std::endl;
 95 |             }
 96 |         }
 97 |         fill_state_seq(ev);
 98 |         fill_move_seq(ev);
 99 |     }
100 | 
101 |     friend std::ostream& operator << (std::ostream& os, const Viterbi& vit)
102 |     {
103 |         for (unsigned i = 0; i < vit.n_events(); ++i)
104 |         {
105 |             for (unsigned j = 0; j < vit.n_states; ++j)
106 |             {
107 |                 os << i << '\t' << j << '\t'
108 |                    << vit.cell(i, j).alpha << '\t'
109 |                    << vit.cell(i, j).beta << std::endl;
110 |             }
111 |         }
112 |         return os;
113 |     }
114 | 
115 | private:
116 |     std::vector< Matrix_Entry > _m;
117 |     Float_Type _path_probability;
118 |     unsigned _n_events;
119 | 
120 |     void fill_state_seq(Event_Sequence_Type& ev)
121 |     {
122 |         assert(Kmer_Size <= MAX_K_LEN);
123 |         Float_Type max_v = -INFINITY;
124 |         unsigned max_j = n_states;
125 |         for (unsigned j = 0; j < n_states; ++j)
126 |         {
127 |             if (cell(n_events() - 1, j).alpha > max_v)
128 |             {
129 |                 max_j = j;
130 |                 max_v = cell(n_events() - 1, j).alpha;
131 |             }
132 |         }
133 |         _path_probability = max_v;
134 |         for (unsigned i = n_events() - 1; i > 0; --i)
135 |         {
136 |             ev[i].model_state_idx = max_j;
137 |             ev[i].set_model_state(Kmer_Type::to_string(ev[i].model_state_idx));
138 |             max_j = cell(i, max_j).beta;
139 |         }
140 |         ev[0].model_state_idx = max_j;
141 |         ev[0].set_model_state(Kmer_Type::to_string(ev[0].model_state_idx));
142 |     }
143 | 
144 |     void fill_move_seq(Event_Sequence_Type& ev)
145 |     {
146 |         for (unsigned i = 0; i < n_events(); ++i)
147 |         {
148 |             ev[i].move = i > 0? Kmer_Type::min_skip(ev[i - 1].model_state_idx, ev[i].model_state_idx) : 0u;
149 |         }
150 |     }
151 | 
152 | }; // class Viterbi
153 | 
154 | #endif
155 | 


--------------------------------------------------------------------------------
/src/nanocall/compute-scaled-pore-model.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <string>
 3 | #include <tclap/CmdLine.h>
 4 | 
 5 | #include "zstr.hpp"
 6 | #include "Pore_Model.hpp"
 7 | #include "logger.hpp"
 8 | 
 9 | using namespace std;
10 | 
11 | #ifndef FLOAT_TYPE
12 | #define FLOAT_TYPE float
13 | #endif
14 | typedef Pore_Model< FLOAT_TYPE > Pore_Model_Type;
15 | typedef Pore_Model_Type::Pore_Model_Parameters_Type Pore_Model_Parameters_Type;
16 | 
17 | namespace opts
18 | {
19 |     using namespace TCLAP;
20 |     string description =
21 |         "Compute scaled pore model.";
22 |     CmdLine cmd_parser(description);
23 |     MultiArg< string > log_level("d", "log-level", "Log level.", false, "string", cmd_parser);
24 |     ValueArg< string > file_name("f", "file-name", "Fast5 file.", true, "", "file", cmd_parser);
25 |     ValueArg< unsigned > strand("s", "strand", "Strand.", false, 0, "file", cmd_parser);
26 |     ValueArg< string > output_file_name("o", "output", "Output file name.", false, "", "file", cmd_parser);
27 | } // namespace opts
28 | 
29 | void real_main()
30 | {
31 |     Pore_Model_Type m;
32 |     Pore_Model_Parameters_Type m_params;
33 |     m.load_from_fast5(fast5::File(opts::file_name), opts::strand);
34 |     m_params.load_from_fast5(fast5::File(opts::file_name), opts::strand);
35 |     m.scale(m_params);
36 |     if (not opts::output_file_name.get().empty())
37 |     {
38 |         strict_fstream::ofstream(opts::output_file_name.get()) << m;
39 |     }
40 |     else
41 |     {
42 |         cout << m;
43 |     }
44 | }
45 | 
46 | int main(int argc, char * argv[])
47 | {
48 |     opts::cmd_parser.parse(argc, argv);
49 |     logger::Logger::set_levels_from_options(opts::log_level);
50 |     real_main();
51 | }
52 | 


--------------------------------------------------------------------------------
/src/nanocall/compute-state-transitions.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <string>
 3 | #include <tclap/CmdLine.h>
 4 | 
 5 | #include "zstr.hpp"
 6 | #include "State_Transitions.hpp"
 7 | #include "logger.hpp"
 8 | 
 9 | using namespace std;
10 | 
11 | #ifndef FLOAT_TYPE
12 | #define FLOAT_TYPE float
13 | #endif
14 | typedef State_Transitions< FLOAT_TYPE > State_Transitions_Type;
15 | 
16 | namespace opts
17 | {
18 |     using namespace TCLAP;
19 |     string description =
20 |         "Compute state transition probabilities based on the overlap model, for a given pr_skip and pr_stay.";
21 |     CmdLine cmd_parser(description);
22 |     MultiArg< string > log_level("d", "log-level", "Log level.", false, "string", cmd_parser);
23 |     ValueArg< string > output_file_name("o", "output", "Output file name.", false, "", "file", cmd_parser);
24 |     ValueArg< float > p_cutoff("p", "pr-cutoff", "Minimim prob to keep.", false, 0.001, "float", cmd_parser);
25 |     ValueArg< float > p_skip("k", "pr-skip", "Pr skip.", false, 0.28, "float", cmd_parser);
26 |     ValueArg< float > p_stay("t", "pr-stay", "Pr stay.", false, 0.09, "float", cmd_parser);
27 |     SwitchArg fast("", "fast", "Use fast computation.", cmd_parser);
28 | } // namespace opts
29 | 
30 | void real_main()
31 | {
32 |     State_Transitions_Type st;
33 |     if (opts::fast)
34 |     {
35 |         st.compute_transitions_fast(opts::p_skip, opts::p_stay);
36 |     }
37 |     else
38 |     {
39 |         st.compute_transitions(opts::p_skip, opts::p_stay, opts::p_cutoff);
40 |     }
41 |     if (not opts::output_file_name.get().empty())
42 |     {
43 |         strict_fstream::ofstream(opts::output_file_name) << st;
44 |     }
45 |     else
46 |     {
47 |         cout << st;
48 |     }
49 | }
50 | 
51 | int main(int argc, char * argv[])
52 | {
53 |     opts::cmd_parser.parse(argc, argv);
54 |     logger::Logger::set_levels_from_options(opts::log_level);
55 |     real_main();
56 | }
57 | 


--------------------------------------------------------------------------------
/src/nanocall/fs_support.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef __FS_SUPPORT_HPP
 2 | #define __FS_SUPPORT_HPP
 3 | 
 4 | #include <string>
 5 | #include <vector>
 6 | 
 7 | #include <sys/types.h>
 8 | #include <dirent.h>
 9 | 
10 | // This should work in windows.
11 | // Ref:
12 | // http://stackoverflow.com/a/612176/717706
13 | 
14 | bool is_directory(const std::string& file_name)
15 | {
16 |     auto dir = opendir(file_name.c_str());
17 |     if (not dir) return false;
18 |     closedir(dir);
19 |     return true;
20 | }
21 | 
22 | std::vector< std::string > list_directory(const std::string& file_name)
23 | {
24 |     std::vector< std::string > res;
25 |     DIR* dir;
26 |     struct dirent *ent;
27 | 
28 |     dir = opendir(file_name.c_str());
29 |     if (not dir) return res;
30 |     while ((ent = readdir(dir)) != nullptr)
31 |     {
32 |         res.push_back(ent->d_name);
33 |     }
34 |     closedir(dir);
35 |     return res;
36 | }
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/src/nanocall/global_assert.hpp:
--------------------------------------------------------------------------------
 1 | //-----------------------------------------------
 2 | // Copyright 2013 Ontario Institute for Cancer Research
 3 | // Written by Matei David (mdavid@oicr.on.ca)
 4 | // Released under the GPL license
 5 | //-----------------------------------------------
 6 | 
 7 | #ifndef __GLOBAL_ASSERT_HPP
 8 | #define __GLOBAL_ASSERT_HPP
 9 | 
10 | #include <iostream>
11 | #include <string>
12 | 
13 | struct global_assert
14 | {
15 |     static std::string& prog_name()
16 |     {
17 |         static std::string _prog_name;
18 |         return _prog_name;
19 |     }
20 |     static std::string& global_msg()
21 |     {
22 |         static thread_local std::string _global_msg;
23 |         return _global_msg;
24 |     }
25 | 
26 |     static void assertion_failed(const std::string& expr, const std::string& msg,
27 |                                  const std::string& function, const std::string& file, long line)
28 |     {
29 |         std::cerr << prog_name() << ": "
30 |                   << file << ":" << line << ": "
31 |                   << function;
32 |         if (not global_msg().empty())
33 |         {
34 |             std::cerr << " [" << global_msg() << "]";
35 |         }
36 |         std::cerr << ": " << "Assertion '" << expr << "' failed";
37 |         if (not msg.empty())
38 |         {
39 |             std::cerr << ": " << msg;
40 |         }
41 |         std::cerr << std::endl;
42 |         abort();
43 |     }
44 | }; // struct global_assert
45 | 
46 | #undef ASSERT
47 | #undef ASSERT_MSG
48 | 
49 | #if defined(NDEBUG)
50 | 
51 | #define ASSERT(expr) ((void)0)
52 | #define ASSERT_MSG(expr, msg) ((void)0)
53 | 
54 | #else
55 | 
56 | #define ASSERT_MSG(expr, msg) ((expr)? ((void)0): global_assert::assertion_failed(#expr, msg, __func__, __FILE__, __LINE__))
57 | #define ASSERT(expr) ASSERT_MSG(expr, "")
58 | 
59 | #endif
60 | 
61 | #endif
62 | 


--------------------------------------------------------------------------------
/src/nanocall/list-directory.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <string>
 3 | 
 4 | #include "fs_support.hpp"
 5 | 
 6 | using namespace std;
 7 | 
 8 | int main(int argc, char* argv[])
 9 | {
10 |     if (argc != 2)
11 |     {
12 |         cerr << "use: " << argv[0] << " <directory>" << endl;
13 |         return EXIT_FAILURE;
14 |     }
15 |     string file_name = argv[1];
16 |     auto is_dir = is_directory(file_name);
17 |     if (not is_dir)
18 |     {
19 |         cerr << "not a directory: " << file_name << endl;
20 |         return EXIT_FAILURE;
21 |     }
22 |     auto l = list_directory(file_name);
23 |     for (const auto& f : l)
24 |     {
25 |         cout << f << endl;
26 |     }
27 | }
28 | 


--------------------------------------------------------------------------------
/src/nanocall/nanocall.cpp:
--------------------------------------------------------------------------------
   1 | #include <deque>
   2 | #include <iostream>
   3 | #include <string>
   4 | #include <tclap/CmdLine.h>
   5 | 
   6 | #include <ctime>
   7 | 
   8 | #include "global_assert.hpp"
   9 | #include "version.hpp"
  10 | #include "Pore_Model.hpp"
  11 | #include "Builtin_Model.hpp"
  12 | #include "State_Transitions.hpp"
  13 | #include "Event.hpp"
  14 | #include "Fast5_Summary.hpp"
  15 | #include "Viterbi.hpp"
  16 | #include "Forward_Backward.hpp"
  17 | #include "Parameter_Trainer.hpp"
  18 | #include "logger.hpp"
  19 | #include "alg.hpp"
  20 | #include "zstr.hpp"
  21 | #include "fast5.hpp"
  22 | #include "pfor.hpp"
  23 | #include "fs_support.hpp"
  24 | 
  25 | using namespace std;
  26 | 
  27 | long get_cpu_time_ms()
  28 | {
  29 |     auto t = clock();
  30 |     return (t * 1000) / CLOCKS_PER_SEC;
  31 | }
  32 | 
  33 | #ifndef FLOAT_TYPE
  34 | #define FLOAT_TYPE float
  35 | #endif
  36 | #ifndef KMER_SIZE
  37 | #define KMER_SIZE 6
  38 | #endif
  39 | typedef State_Transitions< FLOAT_TYPE, KMER_SIZE > State_Transitions_Type;
  40 | typedef State_Transition_Parameters< FLOAT_TYPE > State_Transition_Parameters_Type;
  41 | typedef Pore_Model< FLOAT_TYPE, KMER_SIZE > Pore_Model_Type;
  42 | typedef Pore_Model_Dict< FLOAT_TYPE, KMER_SIZE > Pore_Model_Dict_Type;
  43 | typedef Pore_Model_Parameters< FLOAT_TYPE > Pore_Model_Parameters_Type;
  44 | typedef Event< FLOAT_TYPE, KMER_SIZE > Event_Type;
  45 | typedef Event_Sequence< FLOAT_TYPE, KMER_SIZE > Event_Sequence_Type;
  46 | typedef Fast5_Summary< FLOAT_TYPE, KMER_SIZE > Fast5_Summary_Type;
  47 | typedef Parameter_Trainer< FLOAT_TYPE, KMER_SIZE > Parameter_Trainer_Type;
  48 | typedef Viterbi< FLOAT_TYPE, KMER_SIZE > Viterbi_Type;
  49 | 
  50 | namespace opts
  51 | {
  52 |     using namespace TCLAP;
  53 |     string description = "Call bases in Oxford Nanopore reads.";
  54 |     CmdLine cmd_parser(description, ' ', package_version);
  55 |     //
  56 |     ValueArg< string > ed_group("", "ed-group", "EventDetection group to use. (default: smallest available)", false, "", "000|001|...", cmd_parser);
  57 |     ValueArg< unsigned > chunk_size("", "chunk-size", "Thread chunk size.", false, 1, "int", cmd_parser);
  58 |     MultiArg< string > log_level("", "log", "Log level. (default: info)", false, "string", cmd_parser);
  59 |     ValueArg< string > stats_fn("", "stats", "Stats.", false, "", "file", cmd_parser);
  60 |     ValueArg< string > train_drift("", "train-drift", "Train drift parameter. (default: yes for R73, no for R9)", false, "", "0|1", cmd_parser);
  61 |     ValueArg< unsigned > trim_ed_hp_end("", "trim-ed-hp-end", "Number of events to trim after hairpin end.", false, 50, "int", cmd_parser);
  62 |     ValueArg< unsigned > trim_ed_hp_start("", "trim-ed-hp-start", "Number of events to trim before hairpin start.", false, 50, "int", cmd_parser);
  63 |     ValueArg< unsigned > trim_ed_sq_end("", "trim-ed-sq-end", "Number of events to trim before sequence end.", false, 50, "int", cmd_parser);
  64 |     ValueArg< unsigned > trim_ed_sq_start("", "trim-ed-sq-start", "Number of events to trim after sequence start.", false, 50, "int", cmd_parser);
  65 |     ValueArg< unsigned > max_ed_events("", "max-ed-events", "Maximum EventDetection events.", false, 100000, "int", cmd_parser);
  66 |     ValueArg< unsigned > min_ed_events("", "min-ed-events", "Minimum EventDetection events.", false, 10, "int", cmd_parser);
  67 |     ValueArg< unsigned > fasta_line_width("", "fasta-line-width", "Maximum fasta line width.", false, 80, "int", cmd_parser);
  68 |     //
  69 |     ValueArg< float > scaling_select_threshold("", "scaling-select-threshold", "Select best model per strand during scaling if log score better by threshold.", false, 20.0, "float", cmd_parser);
  70 |     ValueArg< float > scaling_min_progress("", "scaling-min-progress", "Minimum scaling fit progress.", false, 1.0, "float", cmd_parser);
  71 |     ValueArg< unsigned > scaling_max_rounds("", "scaling-max-rounds", "Maximum scaling rounds.", false, 10, "int", cmd_parser);
  72 |     ValueArg< unsigned > scaling_num_events("", "scaling-num-events", "Number of events used for model scaling.", false, 200, "int", cmd_parser);
  73 |     //
  74 |     SwitchArg template_only("", "1d", "Interpret entire read as 1D template only.", cmd_parser);
  75 |     SwitchArg single_strand_scaling("", "single-strand-scaling", "Train scaling parameters per strand.", cmd_parser);
  76 |     SwitchArg double_strand_scaling("", "double-strand-scaling", "Train scaling parameters per read. (default)", cmd_parser);
  77 |     SwitchArg no_train_transitions("", "no-train-transitions", "Do not train state transitions.", cmd_parser);
  78 |     SwitchArg no_train_scaling("", "no-train-scaling", "Do not train pore model scaling.", cmd_parser);
  79 |     SwitchArg train("", "train", "Enable training. (default)", cmd_parser);
  80 |     SwitchArg no_train("", "no-train", "Disable all training.", cmd_parser);
  81 |     SwitchArg basecall("", "basecall", "Enable basecalling (default).", cmd_parser);
  82 |     SwitchArg no_basecall("", "no-basecall", "Disable basecalling.", cmd_parser);
  83 |     //
  84 |     ValueArg< float > pr_skip("", "pr-skip", "Transition probability of skipping at least 1 state.", false, .3, "float", cmd_parser);
  85 |     ValueArg< float > pr_stay("", "pr-stay", "Transition probability of staying in the same state.", false, .1, "float", cmd_parser);
  86 |     ValueArg< string > trans_fn("s", "trans", "Custom initial state transitions.", false, "", "file", cmd_parser);
  87 |     ValueArg< string > model_fofn("", "model-fofn", "File of pore models.", false, "", "file", cmd_parser);
  88 |     MultiArg< string > model_fn("m", "model", "Custom pore model for strand (0=template, 1=complement, 2=both).", false, "strand:file", cmd_parser);
  89 |     //
  90 |     ValueArg< string > pore("", "pore", "Pore name, used to select builtin pore model.", false, "r9", "r73|r9", cmd_parser);
  91 |     SwitchArg write_fast5("", "write-fast5", "Write basecalls to fast5 files.", cmd_parser);
  92 |     ValueArg< string > output_fn("o", "output", "Output.", false, "", "file", cmd_parser);
  93 |     ValueArg< unsigned > num_threads("t", "threads", "Number of parallel threads.", false, 1, "int", cmd_parser);
  94 |     UnlabeledMultiArg< string > input_fn("inputs", "Inputs: directories, fast5 files, or files of fast5 file names (use \"-\" to read fofn from stdin).", true, "path", cmd_parser);
  95 | } // namespace opts
  96 | 
  97 | void init_models(Pore_Model_Dict_Type& models)
  98 | {
  99 |     auto parse_model_name = [] (const string& s) {
 100 |         if (s.size() < 3
 101 |             or (s[0] != '0' and s[0] != '1' and s[0] != '2')
 102 |             or s[1] != ':')
 103 |         {
 104 |             LOG(error) << "could not parse model name: \"" << s << "\"; format should be \"[0|1|2]:<file>\"" << endl;
 105 |             exit(EXIT_FAILURE);
 106 |         }
 107 |         unsigned st = s[0] - '0';
 108 |         return make_pair(st, s.substr(2));
 109 |     };
 110 | 
 111 |     map< unsigned, list< string > > model_list;
 112 |     if (not opts::model_fn.get().empty())
 113 |     {
 114 |         for (const auto& s : opts::model_fn.get())
 115 |         {
 116 |             auto p = parse_model_name(s);
 117 |             model_list[p.first].push_back(p.second);
 118 |         }
 119 |     }
 120 |     if (not opts::model_fofn.get().empty())
 121 |     {
 122 |         zstr::ifstream ifs(opts::model_fofn);
 123 |         string s;
 124 |         while (getline(ifs, s))
 125 |         {
 126 |             auto p = parse_model_name(s);
 127 |             model_list[p.first].push_back(p.second);
 128 |         }
 129 |     }
 130 |     if (model_list[2].empty() and (model_list[0].empty() != model_list[1].empty()))
 131 |     {
 132 |         LOG(error) << "models were specified only for strand " << (int)model_list[0].empty()
 133 |                    << "! give models for both strands, or for neither." << endl;
 134 |         exit(EXIT_FAILURE);
 135 |     }
 136 |     if (not (model_list[0].empty() and model_list[1].empty() and model_list[2].empty()))
 137 |     {
 138 |         for (unsigned st = 0; st < 3; ++st)
 139 |         {
 140 |             for (const auto& e : model_list[st])
 141 |             {
 142 |                 Pore_Model_Type pm;
 143 |                 string pm_name = e;
 144 |                 zstr::ifstream(e) >> pm;
 145 |                 pm.strand() = st;
 146 |                 models[pm_name] = move(pm);
 147 |                 LOG(info) << "loaded module [" << pm_name
 148 |                           << "] for strand [" << st
 149 |                           << "] statistics [mean=" << pm.mean()
 150 |                           << ", stdv=" << pm.stdv() << "]" << endl;
 151 |             }
 152 |         }
 153 |     }
 154 |     else
 155 |     {
 156 |         // use built-in models
 157 |         for (unsigned i = 0; i < Builtin_Model::num; ++i)
 158 |         {
 159 |             Pore_Model_Type pm;
 160 |             string pm_name = Builtin_Model::names[i];
 161 |             if (pm_name.compare(0, opts::pore.get().size() + 1, opts::pore.get() + ".")) continue;
 162 |             pm.load_from_vector(Builtin_Model::init_lists[i]);
 163 |             pm.strand() = Builtin_Model::strands[i];
 164 |             models[Builtin_Model::names[i]] = move(pm);
 165 |             LOG(info)
 166 |                 << "loaded builtin module [" << Builtin_Model::names[i]
 167 |                 << "] for strand [" << Builtin_Model::strands[i]
 168 |                 << "] statistics [mean=" << pm.mean()
 169 |                 << ", stdv=" << pm.stdv() << "]" << endl;
 170 |         }
 171 |         if (models.empty())
 172 |         {
 173 |             LOG(error)
 174 |                 << "no builtin models found for pore [" << opts::pore.get() << "]" << endl;
 175 |             exit(EXIT_FAILURE);
 176 |         }
 177 |     }
 178 | } // init_models
 179 | 
 180 | void init_transitions(State_Transitions_Type& transitions)
 181 | {
 182 |     if (not opts::trans_fn.get().empty())
 183 |     {
 184 |         zstr::ifstream(opts::trans_fn) >> transitions;
 185 |         LOG(info) << "loaded state transitions from [" << opts::trans_fn.get() << "]" << endl;
 186 |     }
 187 |     else
 188 |     {
 189 |         transitions.compute_transitions_fast(opts::pr_skip, opts::pr_stay);
 190 |         LOG(info) << "init_state_transitions pr_skip=[" << opts::pr_skip
 191 |                   << "], pr_stay=[" << opts::pr_stay << "]" << endl;
 192 |     }
 193 | } // init_transitions
 194 | 
 195 | // Parse command line arguments. For each of them:
 196 | // - if it is a directory, find all fast5 files in it, ignore non-fast5 files.
 197 | // - if it is a file, check that it is indeed a fast5 file.
 198 | void init_files(list< string >& files)
 199 | {
 200 |     for (const auto& f : opts::input_fn)
 201 |     {
 202 |         if (is_directory(f))
 203 |         {
 204 |             auto l = list_directory(f);
 205 |             for (const auto& g : l)
 206 |             {
 207 |                 string f2 = f + (f[f.size() - 1] != '/'? "/" : "") + g;
 208 |                 if (is_directory(f2))
 209 |                 {
 210 |                     LOG(info) << "ignoring subdirectory [" << f2 << "]" << endl;
 211 |                 }
 212 |                 else if (fast5::File::is_valid_file(f2))
 213 |                 {
 214 |                     files.push_back(f2);
 215 |                     LOG(info) << "adding input file [" << f2 << "]" << endl;
 216 |                 }
 217 |                 else
 218 |                 {
 219 |                     LOG(info) << "ignoring file [" << f2 << "]" << endl;
 220 |                 }
 221 |             }
 222 |         }
 223 |         else // not a directory
 224 |         {
 225 |             if (f != "-" and fast5::File::is_valid_file(f))
 226 |             {
 227 |                 files.push_back(f);
 228 |                 LOG(info) << "adding input file [" << f << "]" << endl;
 229 |             }
 230 |             else // not fast5, interpret as fofn
 231 |             {
 232 |                 LOG(info) << "interpreting [" << f << "] as fofn" << endl;
 233 |                 istream* is_p = nullptr;
 234 |                 strict_fstream::ifstream ifs;
 235 |                 if (f == "-")
 236 |                 {
 237 |                     is_p = &cin;
 238 |                 }
 239 |                 else
 240 |                 {
 241 |                     ifs.open(f);
 242 |                     is_p = &ifs;
 243 |                 }
 244 |                 string g;
 245 |                 while (getline(*is_p, g))
 246 |                 {
 247 |                     if (fast5::File::is_valid_file(g))
 248 |                     {
 249 |                         files.push_back(g);
 250 |                         LOG(info) << "adding input file [" << g << "]" << endl;
 251 |                     }
 252 |                 }
 253 |             }
 254 |         }
 255 |     }
 256 |     if (files.empty())
 257 |     {
 258 |         LOG(error) << "no fast5 files to process" << endl;
 259 |         exit(EXIT_FAILURE);
 260 |     }
 261 | } // init_files
 262 | 
 263 | void init_reads(const Pore_Model_Dict_Type& models,
 264 |                 const list< string >& files,
 265 |                 deque< Fast5_Summary_Type >& reads)
 266 | {
 267 |     for (const auto& f : files)
 268 |     {
 269 |         Fast5_Summary_Type s(f, models, opts::double_strand_scaling);
 270 |         LOG(info) << "summary: " << s << endl;
 271 |         reads.emplace_back(move(s));
 272 |     }
 273 | } // init_reads
 274 | 
 275 | void train_reads(const Pore_Model_Dict_Type& models,
 276 |                  const State_Transitions_Type& default_transitions,
 277 |                  deque< Fast5_Summary_Type >& reads)
 278 | {
 279 |     auto time_start_ms = get_cpu_time_ms();
 280 |     Parameter_Trainer_Type::init();
 281 |     unsigned crt_idx = 0;
 282 |     pfor::pfor< unsigned >(
 283 |         opts::num_threads,
 284 |         opts::chunk_size,
 285 |         // get_item
 286 |         [&] (unsigned& i) {
 287 |             if (crt_idx >= reads.size()) return false;
 288 |             i = crt_idx++;
 289 |             return true;
 290 |         },
 291 |         // process item
 292 |         [&] (unsigned& i) {
 293 |             Fast5_Summary_Type& read_summary = reads[i];
 294 |             if (read_summary.num_ed_events == 0) return;
 295 |             global_assert::global_msg() = read_summary.read_id;
 296 |             read_summary.load_events();
 297 |             //
 298 |             // create per-strand list of models to try
 299 |             //
 300 |             array< list< string >, 2 > model_list;
 301 |             for (unsigned st = 0; st < 2; ++st)
 302 |             {
 303 |                 // if not enough events, ignore strand
 304 |                 if (read_summary.events(st).size() < opts::min_ed_events) continue;
 305 |                 // create list of models to try
 306 |                 if (not read_summary.preferred_model[st][st].empty())
 307 |                 {
 308 |                     // if we have a preferred model, use that
 309 |                     model_list[st].push_back(read_summary.preferred_model[st][st]);
 310 |                 }
 311 |                 else
 312 |                 {
 313 |                     // no preferred model, try all that apply to this strand
 314 |                     for (const auto& p : models)
 315 |                     {
 316 |                         if (p.second.strand() == st or p.second.strand() == 2)
 317 |                         {
 318 |                             model_list[st].push_back(p.first);
 319 |                         }
 320 |                     }
 321 |                 }
 322 |                 ASSERT(not model_list.empty());
 323 |             }
 324 |             //
 325 |             // create per-strand list of event sequences on which to train
 326 |             //
 327 |             array< vector< Event_Sequence_Type >, 2 > train_event_seqs;
 328 |             for (unsigned st = 0; st < 2; ++st)
 329 |             {
 330 |                 // if not enough events, ignore strand
 331 |                 if (read_summary.events(st).size() < opts::min_ed_events) continue;
 332 |                 // create 2 event sequences on which to train
 333 |                 unsigned num_train_events = min((size_t)opts::scaling_num_events.get(), read_summary.events(st).size());
 334 |                 train_event_seqs[st].emplace_back(
 335 |                     read_summary.events(st).begin(), read_summary.events(st).begin() + num_train_events / 2);
 336 |                 train_event_seqs[st].emplace_back(
 337 |                     read_summary.events(st).end() - num_train_events / 2, read_summary.events(st).end());
 338 |             }
 339 |             //
 340 |             // branch on whether pore models should be scaled together
 341 |             //
 342 |             if (read_summary.scale_strands_together)
 343 |             {
 344 |                 // prepare vector of event sequences
 345 |                 vector< pair< const Event_Sequence_Type*, unsigned > > train_event_seq_ptrs;
 346 |                 for (unsigned st = 0; st < 2; ++st)
 347 |                 {
 348 |                     for (const auto& events : train_event_seqs[st])
 349 |                     {
 350 |                         train_event_seq_ptrs.push_back(make_pair(&events, st));
 351 |                     }
 352 |                 }
 353 |                 // track model fit
 354 |                 // key = pore model name; value = fit
 355 |                 map< array< string, 2 >, FLOAT_TYPE > model_fit;
 356 |                 for (const auto& m_name_0 : model_list[0])
 357 |                 {
 358 |                     for (const auto& m_name_1 : model_list[1])
 359 |                     {
 360 |                         array< string, 2 > m_name_key = {{ m_name_0, m_name_1 }};
 361 |                         string m_name = m_name_0 + "+" + m_name_1;
 362 |                         unsigned round = 0;
 363 |                         auto& crt_pm_params = read_summary.pm_params_m.at(m_name_key);
 364 |                         auto& crt_st_params = read_summary.st_params_m.at(m_name_key);
 365 |                         auto& crt_fit = model_fit[m_name_key];
 366 |                         crt_fit = -INFINITY;
 367 |                         while (true)
 368 |                         {
 369 |                             Pore_Model_Parameters_Type old_pm_params(crt_pm_params);
 370 |                             std::array< State_Transition_Parameters_Type, 2 > old_st_params(crt_st_params);
 371 |                             auto old_fit = crt_fit;
 372 |                             bool done;
 373 | 
 374 |                             Parameter_Trainer_Type::train_one_round(
 375 |                                 train_event_seq_ptrs,
 376 |                                 {{ &models.at(m_name_0), &models.at(m_name_1) }},
 377 |                                 default_transitions,
 378 |                                 old_pm_params, old_st_params,
 379 |                                 crt_pm_params, crt_st_params, crt_fit, done,
 380 |                                 not opts::no_train_scaling, not opts::no_train_transitions);
 381 | 
 382 |                             LOG(debug)
 383 |                                 << "scaling_round read [" << read_summary.read_id
 384 |                                 << "] strand [" << 2
 385 |                                 << "] model [" << m_name
 386 |                                 << "] old_pm_params [" << old_pm_params
 387 |                                 << "] old_st_params [" << old_st_params[0] << "," << old_st_params[1]
 388 |                                 << "] old_fit [" << old_fit
 389 |                                 << "] crt_pm_params [" << crt_pm_params
 390 |                                 << "] crt_st_params [" << crt_st_params[0] << "," << crt_st_params[1]
 391 |                                 << "] crt_fit [" << crt_fit
 392 |                                 << "] round [" << round << "]" << endl;
 393 | 
 394 |                             if (done)
 395 |                             {
 396 |                                 // singularity detected; stop
 397 |                                 break;
 398 |                             }
 399 | 
 400 |                             if (crt_fit < old_fit)
 401 |                             {
 402 |                                 LOG(info) << "scaling_regression read [" << read_summary.read_id
 403 |                                           << "] strand [" << 2
 404 |                                           << "] model [" << m_name
 405 |                                           << "] old_params [" << old_pm_params
 406 |                                           << "] old_st_params [" << old_st_params[0] << "," << old_st_params[1]
 407 |                                           << "] old_fit [" << old_fit
 408 |                                           << "] crt_pm_params [" << crt_pm_params
 409 |                                           << "] crt_st_params [" << crt_st_params[0] << "," << crt_st_params[1]
 410 |                                           << "] crt_fit [" << crt_fit
 411 |                                           << "] round [" << round << "]" << endl;
 412 |                                 crt_pm_params = old_pm_params;
 413 |                                 crt_st_params = old_st_params;
 414 |                                 crt_fit = old_fit;
 415 |                                 break;
 416 |                             }
 417 | 
 418 |                             ++round;
 419 |                             // stop condition
 420 |                             if (round >= 2u * opts::scaling_max_rounds
 421 |                                 or (round > 1 and crt_fit < old_fit + opts::scaling_min_progress))
 422 |                             {
 423 |                                 break;
 424 |                             }
 425 | 
 426 |                         }; // while true
 427 |                         LOG(info)
 428 |                             << "scaling_result read [" << read_summary.read_id
 429 |                             << "] strand [" << 2
 430 |                             << "] model [" << m_name
 431 |                             << "] pm_params [" << crt_pm_params
 432 |                             << "] st_params [" << crt_st_params[0] << "," << crt_st_params[1]
 433 |                             << "] fit [" << crt_fit
 434 |                             << "] rounds [" << round << "]" << endl;
 435 |                     } // for m_name[1]
 436 |                 } // for m_name[0]
 437 |                 if (opts::scaling_select_threshold.get() < INFINITY)
 438 |                 {
 439 |                     auto it_max = alg::max_of(
 440 |                         model_fit,
 441 |                         [] (const decltype(model_fit)::value_type& p) { return p.second; });
 442 |                     // check maximum is unique
 443 |                     if (alg::all_of(
 444 |                             model_fit,
 445 |                             [&] (const decltype(model_fit)::value_type& p) {
 446 |                                 return &p == &*it_max
 447 |                                     or p.second + opts::scaling_select_threshold.get() < it_max->second;
 448 |                             }))
 449 |                     {
 450 |                         const auto& m_name_0 = it_max->first[0];
 451 |                         const auto& m_name_1 = it_max->first[1];
 452 |                         auto m_name = m_name_0 + '+' + m_name_1;
 453 |                         read_summary.preferred_model[2][0] = m_name_0;
 454 |                         read_summary.preferred_model[2][1] = m_name_1;
 455 |                         LOG(info)
 456 |                             << "selected_model read [" << read_summary.read_id
 457 |                             << "] strand [2] model [" << m_name << "]" << endl;
 458 |                     }
 459 |                 }
 460 |             }
 461 |             else // not scale_strands_together
 462 |             {
 463 |                 for (unsigned st = 0; st < 2; ++st)
 464 |                 {
 465 |                     // if not enough events, ignore strand
 466 |                     if (read_summary.events(st).size() < opts::min_ed_events) continue;
 467 |                     // prepare vector of event sequences
 468 |                     vector< pair< const Event_Sequence_Type*, unsigned > > train_event_seq_ptrs;
 469 |                     for (const auto& events : train_event_seqs[st])
 470 |                     {
 471 |                         train_event_seq_ptrs.push_back(make_pair(&events, st));
 472 |                     }
 473 |                     map< string, FLOAT_TYPE > model_fit;
 474 |                     for (const auto& m_name : model_list[st])
 475 |                     {
 476 |                         array< string, 2 > m_name_key;
 477 |                         m_name_key[st] = m_name;
 478 |                         unsigned round = 0;
 479 |                         auto& crt_pm_params = read_summary.pm_params_m.at(m_name_key);
 480 |                         auto& crt_st_params = read_summary.st_params_m.at(m_name_key);
 481 |                         auto& crt_fit = model_fit[m_name];
 482 |                         crt_fit = -INFINITY;
 483 |                         while (true)
 484 |                         {
 485 |                             Pore_Model_Parameters_Type old_pm_params(crt_pm_params);
 486 |                             array< State_Transition_Parameters_Type, 2 > old_st_params(crt_st_params);
 487 |                             auto old_fit = crt_fit;
 488 |                             bool done;
 489 | 
 490 |                             Parameter_Trainer_Type::train_one_round(
 491 |                                 train_event_seq_ptrs,
 492 |                                 {{ &models.at(m_name), &models.at(m_name) }},
 493 |                                 default_transitions,
 494 |                                 old_pm_params, old_st_params,
 495 |                                 crt_pm_params, crt_st_params, crt_fit, done,
 496 |                                 not opts::no_train_scaling, not opts::no_train_transitions);
 497 | 
 498 |                             LOG(debug)
 499 |                                 << "scaling_round read [" << read_summary.read_id
 500 |                                 << "] strand [" << st
 501 |                                 << "] model [" << m_name
 502 |                                 << "] old_pm_params [" << old_pm_params
 503 |                                 << "] old_st_params [" << old_st_params[st]
 504 |                                 << "] old_fit [" << old_fit
 505 |                                 << "] crt_pm_params [" << crt_pm_params
 506 |                                 << "] crt_st_params [" << crt_st_params[st]
 507 |                                 << "] crt_fit [" << crt_fit
 508 |                                 << "] round [" << round << "]" << endl;
 509 | 
 510 |                             if (done)
 511 |                             {
 512 |                                 // singularity detected; stop
 513 |                                 break;
 514 |                             }
 515 | 
 516 |                             if (crt_fit < old_fit)
 517 |                             {
 518 |                                 LOG(info) << "scaling_regression read [" << read_summary.read_id
 519 |                                           << "] strand [" << st
 520 |                                           << "] model [" << m_name
 521 |                                           << "] old_pm_params [" << old_pm_params
 522 |                                           << "] old_st_params [" << old_st_params[st]
 523 |                                           << "] old_fit [" << old_fit
 524 |                                           << "] crt_pm_params [" << crt_pm_params
 525 |                                           << "] crt_st_params [" << crt_st_params[st]
 526 |                                           << "] crt_fit [" << crt_fit
 527 |                                           << "] round [" << round << "]" << endl;
 528 |                                 crt_pm_params = old_pm_params;
 529 |                                 crt_st_params = old_st_params;
 530 |                                 crt_fit = old_fit;
 531 |                                 break;
 532 |                             }
 533 | 
 534 |                             ++round;
 535 |                             // stop condition
 536 |                             if (round >= opts::scaling_max_rounds
 537 |                                 or (round > 1 and crt_fit < old_fit + opts::scaling_min_progress))
 538 |                             {
 539 |                                 break;
 540 |                             }
 541 | 
 542 |                         }; // while true
 543 |                         LOG(info)
 544 |                             << "scaling_result read [" << read_summary.read_id
 545 |                             << "] strand [" << st
 546 |                             << "] model [" << m_name
 547 |                             << "] pm_params [" << crt_pm_params
 548 |                             << "] st_params [" << crt_st_params[st]
 549 |                             << "] fit [" << crt_fit
 550 |                             << "] rounds [" << round << "]" << endl;
 551 |                     } // for m_name
 552 |                     if (opts::scaling_select_threshold.get() < INFINITY)
 553 |                     {
 554 |                         auto it_max = alg::max_of(
 555 |                             model_fit,
 556 |                             [] (const decltype(model_fit)::value_type& p) { return p.second; });
 557 |                         if (alg::all_of(
 558 |                                 model_fit,
 559 |                                 [&] (const decltype(model_fit)::value_type& p) {
 560 |                                     return &p == &*it_max
 561 |                                         or p.second + opts::scaling_select_threshold.get() < it_max->second;
 562 |                                 }))
 563 |                         {
 564 |                             read_summary.preferred_model[st][st] = it_max->first;
 565 |                             LOG(info)
 566 |                                 << "selected_model read [" << read_summary.read_id
 567 |                                 << "] strand [" << st
 568 |                                 << "] model [" << it_max->first << "]" << endl;
 569 |                         }
 570 |                     }
 571 |                 } // for st
 572 |             } // if not scale_strands_together
 573 |             read_summary.drop_events();
 574 |         }, // process_item
 575 |         // progress_report
 576 |         [&] (unsigned items, unsigned seconds) {
 577 |             clog << "Processed " << setw(6) << right << items << " reads in "
 578 |                  << setw(6) << right << seconds << " seconds\r";
 579 |         }); // pfor
 580 |     auto time_end_ms = get_cpu_time_ms();
 581 |     LOG(info) << "training user_cpu_secs=" << (time_end_ms - time_start_ms)/1000 << endl;
 582 | } // train_reads
 583 | 
 584 | void write_fasta(ostream& os, const string& name, const string& seq)
 585 | {
 586 |     os << ">" << name << endl;
 587 |     for (unsigned pos = 0; pos < seq.size(); pos += opts::fasta_line_width)
 588 |     {
 589 |         os << seq.substr(pos, opts::fasta_line_width) << endl;
 590 |     }
 591 | } // write_fasta
 592 | 
 593 | void basecall_reads(const Pore_Model_Dict_Type& models,
 594 |                     const State_Transitions_Type& default_transitions,
 595 |                     deque< Fast5_Summary_Type >& reads)
 596 | {
 597 |     auto time_start_ms = get_cpu_time_ms();
 598 |     strict_fstream::ofstream ofs;
 599 |     ostream* os_p = nullptr;
 600 |     if (not opts::output_fn.get().empty())
 601 |     {
 602 |         ofs.open(opts::output_fn);
 603 |         os_p = &ofs;
 604 |     }
 605 |     else
 606 |     {
 607 |         os_p = &cout;
 608 |     }
 609 | 
 610 |     unsigned crt_idx = 0;
 611 |     pfor::pfor< unsigned, ostringstream >(
 612 |         opts::num_threads,
 613 |         opts::chunk_size,
 614 |         // get_item
 615 |         [&] (unsigned& i) {
 616 |             if (crt_idx >= reads.size()) return false;
 617 |             i = crt_idx++;
 618 |             return true;
 619 |         },
 620 |         // process_item
 621 |         [&] (unsigned& i, ostringstream& oss) {
 622 |             Fast5_Summary_Type& read_summary = reads[i];
 623 |             if (read_summary.num_ed_events == 0) return;
 624 |             global_assert::global_msg() = read_summary.read_id;
 625 |             read_summary.load_events();
 626 | 
 627 |             // compute read statistics used to check scaling
 628 |             array< pair< FLOAT_TYPE, FLOAT_TYPE >, 2 > r_stats;
 629 |             for (unsigned st = 0; st < 2; ++st)
 630 |             {
 631 |                 // if not enough events, ignore strand
 632 |                 if (read_summary.events(st).size() < opts::min_ed_events) continue;
 633 |                 r_stats[st] = alg::mean_stdv_of< FLOAT_TYPE >(
 634 |                     read_summary.events(st),
 635 |                     [] (const Event_Type& ev) { return ev.mean; });
 636 |                 LOG(debug)
 637 |                     << "mean_stdv read [" << read_summary.read_id
 638 |                     << "] strand [" << st
 639 |                     << "] ev_mean=[" << r_stats[st].first
 640 |                     << "] ev_stdv=[" << r_stats[st].second << "]" << endl;
 641 |             }
 642 | 
 643 |             // basecalling functor
 644 |             // returns: (path_prob, base_seq)
 645 |             auto basecall_strand = [&] (unsigned st, string m_name,
 646 |                                         const Pore_Model_Parameters_Type& pm_params,
 647 |                                         const State_Transition_Parameters_Type& st_params) {
 648 |                 // scale model
 649 |                 Pore_Model_Type pm(models.at(m_name));
 650 |                 pm.scale(pm_params);
 651 |                 State_Transitions_Type custom_transitions;
 652 |                 const State_Transitions_Type* transitions_ptr;
 653 |                 if (not st_params.is_default())
 654 |                 {
 655 |                     custom_transitions.compute_transitions_fast(st_params);
 656 |                     transitions_ptr = &custom_transitions;
 657 |                 }
 658 |                 else
 659 |                 {
 660 |                     transitions_ptr = &default_transitions;
 661 |                 }
 662 |                 LOG(info)
 663 |                     << "basecalling read [" << read_summary.read_id
 664 |                     << "] strand [" << st
 665 |                     << "] model [" << m_name
 666 |                     << "] pm_params [" << pm_params
 667 |                     << "] st_params [" << st_params << "]" << endl;
 668 |                 LOG(debug)
 669 |                     << "mean_stdv read [" << read_summary.read_id
 670 |                     << "] strand [" << st
 671 |                     << "] model_mean [" << pm.mean()
 672 |                     << "] model_stdv [" << pm.stdv() << "]" << endl;
 673 |                 if (abs(r_stats[st].first - pm.mean()) > 5.0)
 674 |                 {
 675 |                     LOG(warning)
 676 |                         << "means_apart read [" << read_summary.read_id
 677 |                         << "] strand [" << st
 678 |                         << "] model [" << m_name
 679 |                         << "] parameters [" << pm_params
 680 |                         << "] model_mean=[" << pm.mean()
 681 |                         << "] events_mean=[" << r_stats[st].first
 682 |                         << "]" << endl;
 683 |                 }
 684 |                 // correct drift
 685 |                 Event_Sequence_Type corrected_events = read_summary.events(st);
 686 |                 corrected_events.apply_drift_correction(pm_params.drift);
 687 |                 Viterbi_Type vit;
 688 |                 vit.fill(pm, *transitions_ptr, corrected_events);
 689 |                 return std::make_tuple(vit.path_probability(), std::move(corrected_events));
 690 |             };
 691 | 
 692 |             if (read_summary.scale_strands_together)
 693 |             {
 694 |                 // create list of models to try
 695 |                 list< array< string, 2 > > model_sublist;
 696 |                 if (not read_summary.preferred_model[2][0].empty())
 697 |                 {
 698 |                     // if we have a preferred model, use that
 699 |                     model_sublist.push_back(read_summary.preferred_model[2]);
 700 |                 }
 701 |                 else
 702 |                 {
 703 |                     // no preferred model, try all for which we have scaling parameters
 704 |                     for (const auto& p : read_summary.pm_params_m)
 705 |                     {
 706 |                         if (p.first[0].empty() or p.first[1].empty()) continue;
 707 |                         model_sublist.push_back(p.first);
 708 |                     }
 709 |                 }
 710 |                 // basecall using applicable models
 711 |                 deque< tuple< FLOAT_TYPE,
 712 |                               FLOAT_TYPE, FLOAT_TYPE,
 713 |                               string, string,
 714 |                               Event_Sequence_Type, Event_Sequence_Type > > results;
 715 |                 for (const auto& m_name : model_sublist)
 716 |                 {
 717 |                     array< tuple< FLOAT_TYPE, Event_Sequence_Type >, 2 > part_results;
 718 |                     for (unsigned st = 0; st < 2; ++st)
 719 |                     {
 720 |                         part_results[st] = basecall_strand(
 721 |                             st, m_name[st],
 722 |                             read_summary.pm_params_m.at(m_name),
 723 |                             read_summary.st_params_m.at(m_name)[st]);
 724 |                     }
 725 |                     results.emplace_back(get<0>(part_results[0]) + get<0>(part_results[1]),
 726 |                                          get<0>(part_results[0]),
 727 |                                          get<0>(part_results[1]),
 728 |                                          string(m_name[0]),
 729 |                                          string(m_name[1]),
 730 |                                          std::move(get<1>(part_results[0])),
 731 |                                          std::move(get<1>(part_results[1])));
 732 |                 }
 733 |                 // sort results by first component (log path probability)
 734 |                 sort(results.begin(),
 735 |                      results.end(),
 736 |                      [] (const decltype(results)::value_type& lhs, const decltype(results)::value_type& rhs) {
 737 |                          return get<0>(lhs) < get<0>(rhs);
 738 |                      });
 739 |                 array< FLOAT_TYPE, 2 > best_log_path_prob{{ get<1>(results.back()), get<2>(results.back()) }};
 740 |                 array< string, 2 > best_m_name{{ get<3>(results.back()), get<4>(results.back()) }};
 741 |                 array< const Event_Sequence_Type*, 2 > event_seq_ptr = {
 742 |                     &get<5>(results.back()),
 743 |                     &get<6>(results.back())
 744 |                 };
 745 |                 array< string, 2 > base_seq = {
 746 |                     get<5>(results.back()).get_base_seq(),
 747 |                     get<6>(results.back()).get_base_seq()
 748 |                 };
 749 |                 string best_m_name_str = best_m_name[0] + '+' + best_m_name[1];
 750 |                 auto& best_pm_params = read_summary.pm_params_m.at(best_m_name);
 751 |                 auto& best_st_params = read_summary.st_params_m.at(best_m_name);
 752 |                 for (unsigned st = 0; st < 2; ++st)
 753 |                 {
 754 |                     LOG(info)
 755 |                         << "best_model read [" << read_summary.read_id
 756 |                         << "] strand [" << st
 757 |                         << "] model [" << best_m_name[st]
 758 |                         << "] pm_params [" << best_pm_params
 759 |                         << "] st_params [" << best_st_params[st]
 760 |                         << "] log_path_prob [" << best_log_path_prob[st] << "]" << endl;
 761 |                     read_summary.preferred_model[st][st] = best_m_name[st];
 762 |                     read_summary.pm_params_m[read_summary.preferred_model[st]] = best_pm_params;
 763 |                     read_summary.st_params_m[read_summary.preferred_model[st]][st] = best_st_params[st];
 764 |                     string seq_name;
 765 |                     {
 766 |                         ostringstream tmp;
 767 |                         tmp << read_summary.read_id << ":" << read_summary.base_file_name << ":" << st;
 768 |                         seq_name = tmp.str();
 769 |                     }
 770 |                     if (opts::write_fast5)
 771 |                     {
 772 |                         read_summary.add_basecall_seq(seq_name, st, base_seq[st]);
 773 |                         read_summary.add_basecall_events(st, *event_seq_ptr[st]);
 774 |                         read_summary.add_basecall_model(st, models.at(best_m_name[st]));
 775 |                         read_summary.add_basecall_model_params(st, best_pm_params);
 776 |                     }
 777 |                     else
 778 |                     {
 779 |                         write_fasta(oss, seq_name, base_seq[st]);
 780 |                     }
 781 |                 }
 782 |             }
 783 |             else // not scale_strands_together
 784 |             {
 785 |                 for (unsigned st = 0; st < 2; ++st)
 786 |                 {
 787 |                     // if not enough events, ignore strand
 788 |                     if (read_summary.events(st).size() < opts::min_ed_events) continue;
 789 |                     // create list of models to try
 790 |                     list< array< string, 2 > > model_sublist;
 791 |                     if (not read_summary.preferred_model[st][st].empty())
 792 |                     {
 793 |                         // if we have a preferred model, use that
 794 |                         model_sublist.push_back(read_summary.preferred_model[st]);
 795 |                     }
 796 |                     else
 797 |                     {
 798 |                         // no preferred model, try all for which we have scaling
 799 |                         for (const auto& p : read_summary.pm_params_m)
 800 |                         {
 801 |                             if (not p.first[st].empty() and p.first[1 - st].empty())
 802 |                             {
 803 |                                 model_sublist.push_back(p.first);
 804 |                             }
 805 |                         }
 806 |                     }
 807 |                     // deque of results
 808 |                     deque< tuple< FLOAT_TYPE, string, Event_Sequence_Type > > results;
 809 |                     for (const auto& m_name : model_sublist)
 810 |                     {
 811 |                         auto r = basecall_strand(
 812 |                             st, m_name[st],
 813 |                             read_summary.pm_params_m.at(m_name),
 814 |                             read_summary.st_params_m.at(m_name)[st]);
 815 |                         results.emplace_back(get<0>(r),
 816 |                                              string(m_name[st]),
 817 |                                              std::move(get<1>(r)));
 818 |                     }
 819 |                     sort(results.begin(),
 820 |                          results.end(),
 821 |                          [] (const decltype(results)::value_type& lhs, const decltype(results)::value_type& rhs) {
 822 |                              return get<0>(lhs) < get<0>(rhs);
 823 |                          });
 824 |                     const string& best_m_name = get<1>(results.back());
 825 |                     const Event_Sequence_Type& event_seq = get<2>(results.back());
 826 |                     string base_seq = event_seq.get_base_seq();
 827 |                     array< string, 2 > best_m_key;
 828 |                     best_m_key[st] = best_m_name;
 829 |                     LOG(info)
 830 |                         << "best_model read [" << read_summary.read_id
 831 |                         << "] strand [" << st
 832 |                         << "] model [" << best_m_name
 833 |                         << "] pm_params [" << read_summary.pm_params_m.at(best_m_key)
 834 |                         << "] st_params [" << read_summary.st_params_m.at(best_m_key)[st]
 835 |                         << "] log_path_prob [" << get<0>(results.back()) << "]" << endl;
 836 |                     read_summary.preferred_model[st][st] = best_m_name;
 837 |                     string seq_name;
 838 |                     {
 839 |                         ostringstream tmp;
 840 |                         tmp << read_summary.read_id << ":" << read_summary.base_file_name << ":" << st;
 841 |                         seq_name = tmp.str();
 842 |                     }
 843 |                     if (opts::write_fast5)
 844 |                     {
 845 |                         read_summary.add_basecall_seq(seq_name, st, base_seq);
 846 |                         read_summary.add_basecall_events(st, event_seq);
 847 |                         read_summary.add_basecall_model(st, models.at(best_m_name));
 848 |                         read_summary.add_basecall_model_params(st, read_summary.pm_params_m.at(best_m_key));
 849 |                     }
 850 |                     else
 851 |                     {
 852 |                         write_fasta(oss, seq_name, base_seq);
 853 |                     }
 854 |                 } // for st
 855 |             }
 856 |             read_summary.drop_events();
 857 |         },
 858 |         // output_chunk
 859 |         [&] (ostringstream& oss) {
 860 |             *os_p << oss.str();
 861 |         },
 862 |         // progress_report
 863 |         [&] (unsigned items, unsigned seconds) {
 864 |             clog << "Processed " << setw(6) << right << items << " reads in "
 865 |                  << setw(6) << right << seconds << " seconds\r";
 866 |         }); // pfor
 867 |     auto time_end_ms = get_cpu_time_ms();
 868 |     LOG(info) << "basecalling user_cpu_secs=" << (time_end_ms - time_start_ms)/1000 << endl;
 869 | } // basecall_reads
 870 | 
 871 | int real_main()
 872 | {
 873 |     Pore_Model_Dict_Type models;
 874 |     State_Transitions_Type default_transitions;
 875 |     deque< Fast5_Summary_Type > reads;
 876 |     list< string > files;
 877 |     // initialize structs
 878 |     init_models(models);
 879 |     init_transitions(default_transitions);
 880 |     init_files(files);
 881 |     init_reads(models, files, reads);
 882 |     if (opts::train)
 883 |     {
 884 |         // do some training
 885 |         train_reads(models, default_transitions, reads);
 886 |     }
 887 |     if (opts::basecall)
 888 |     {
 889 |         // basecall reads
 890 |         basecall_reads(models, default_transitions, reads);
 891 |     }
 892 |     // print stats
 893 |     if (not opts::stats_fn.get().empty())
 894 |     {
 895 |         strict_fstream::ofstream ofs(opts::stats_fn);
 896 |         Fast5_Summary_Type::write_tsv_header(ofs);
 897 |         ofs << endl;
 898 |         for (const auto& s : reads)
 899 |         {
 900 |             s.write_tsv(ofs);
 901 |             ofs << endl;
 902 |         }
 903 |     }
 904 |     assert(fast5::File::get_object_count() == 0);
 905 |     return EXIT_SUCCESS;
 906 | }
 907 | 
 908 | int main(int argc, char * argv[])
 909 | {
 910 |     opts::cmd_parser.parse(argc, argv);
 911 |     logger::Logger::set_default_level(logger::level::info);
 912 |     logger::Logger::set_levels_from_options(opts::log_level);
 913 |     LOG(info) << "program: " << opts::cmd_parser.getProgramName() << endl;
 914 |     LOG(info) << "version: " << opts::cmd_parser.getVersion() << endl;
 915 |     LOG(info) << "args: " << opts::cmd_parser.getOrigArgv() << endl;
 916 |     LOG(info) << "num_threads=" << opts::num_threads.get() << endl;
 917 | #ifndef H5_HAVE_THREADSAFE
 918 |     if (opts::num_threads > 1)
 919 |     {
 920 |         LOG(warning) << "enabled multi-threading with non-threadsafe HDF5: using experimental locking" << endl;
 921 |     }
 922 | #endif
 923 |     State_Transition_Parameters_Type::default_p_stay() = opts::pr_stay;
 924 |     State_Transition_Parameters_Type::default_p_skip() = opts::pr_skip;
 925 |     Fast5_Summary_Type::min_ed_events() = opts::min_ed_events;
 926 |     Fast5_Summary_Type::max_ed_events() = opts::max_ed_events;
 927 |     Fast5_Summary_Type::eventdetection_group() = opts::ed_group;
 928 |     Fast5_Summary_Type::template_only() = opts::template_only;
 929 |     Fast5_Summary_Type::trim_margins() = {{ opts::trim_ed_sq_start, opts::trim_ed_sq_end, opts::trim_ed_hp_start, opts::trim_ed_hp_end }};
 930 |     LOG (info) << "eventdetection_group=" << (Fast5_Summary_Type::eventdetection_group().empty()
 931 |                                               ? string("smallest")
 932 |                                               : Fast5_Summary_Type::eventdetection_group()) << endl;
 933 |     //
 934 |     // set pore-related options
 935 |     //
 936 |     if (not opts::train_drift.get().empty()
 937 |         and opts::train_drift.get() != "0"
 938 |         and opts::train_drift.get() != "1")
 939 |     {
 940 |         LOG(error) << "train-drift not understdood: " << opts::train_drift.get() << endl;
 941 |         return EXIT_FAILURE;
 942 |     }
 943 |     if (opts::pore.get() == "r9")
 944 |     {
 945 |         Fast5_Summary_Type::abasic_level_top_percent() = 1.0;
 946 |         Fast5_Summary_Type::abasic_level_top_offset() = 0.0;
 947 |         Fast5_Summary_Type::hairpin_island_window_size() = 10;
 948 |         Fast5_Summary_Type::hairpin_island_window_load() = 5;
 949 |         if (opts::train_drift.get().empty())
 950 |         {
 951 |             opts::train_drift.get() = "0";
 952 |         }
 953 |     }
 954 |     else if (opts::pore.get() == "r73")
 955 |     {
 956 |         Fast5_Summary_Type::abasic_level_top_percent() = 1.0;
 957 |         Fast5_Summary_Type::abasic_level_top_offset() = 5.0;
 958 |         Fast5_Summary_Type::hairpin_island_window_size() = 5;
 959 |         Fast5_Summary_Type::hairpin_island_window_load() = 5;
 960 |         if (opts::train_drift.get().empty())
 961 |         {
 962 |             opts::train_drift.get() = "1";
 963 |         }
 964 |     }
 965 |     else
 966 |     {
 967 |         LOG(error) << "unknown pore type: " << opts::pore.get() << endl;
 968 |         return EXIT_FAILURE;
 969 |     }
 970 |     Parameter_Trainer_Type::pm_train_drift() = opts::train_drift.get() == "1";
 971 |     LOG(info)
 972 |         << "ed_event_trimming: "
 973 |         << " sq_start=" << Fast5_Summary_Type::trim_margins()[0]
 974 |         << " sq_end=" << Fast5_Summary_Type::trim_margins()[1]
 975 |         << " hp_start=" << Fast5_Summary_Type::trim_margins()[2]
 976 |         << " hp_end=" << Fast5_Summary_Type::trim_margins()[3] << endl;
 977 |     if (not opts::template_only.get())
 978 |     {
 979 |         LOG(info)
 980 |             << "hairpin_detection:"
 981 |             << " abasic_level_top_percent=" << Fast5_Summary_Type::abasic_level_top_percent()
 982 |             << " abasic_level_top_offset=" << Fast5_Summary_Type::abasic_level_top_offset()
 983 |             << " hairpin_island_window_size=" << Fast5_Summary_Type::hairpin_island_window_size()
 984 |             << " hairpin_island_window_load=" << Fast5_Summary_Type::hairpin_island_window_load()
 985 |             << endl;
 986 |     }
 987 |     else
 988 |     {
 989 |         LOG(info)
 990 |             << "hairpin_detection: disabled" << endl;
 991 |     }
 992 |     //
 993 |     // set training option
 994 |     //
 995 |     if (opts::train and opts::no_train)
 996 |     {
 997 |         LOG(error)
 998 |             << "either --train or --no-train may be used, but not both" << endl;
 999 |         return EXIT_FAILURE;
1000 |     }
1001 |     else if (not opts::train and not opts::no_train)
1002 |     {
1003 |         // by default, enable training
1004 |         opts::train.set(true);
1005 |     }
1006 |     ASSERT(opts::train != opts::no_train);
1007 |     //
1008 |     // set basecalling option
1009 |     //
1010 |     if (opts::basecall and opts::no_basecall)
1011 |     {
1012 |         LOG(error)
1013 |             << "either --basecall or --no-basecall may be used, but not both" << endl;
1014 |         return EXIT_FAILURE;
1015 |     }
1016 |     else if (not opts::basecall and not opts::no_basecall)
1017 |     {
1018 |         // by default, enable basecalling
1019 |         opts::basecall.set(true);
1020 |     }
1021 |     ASSERT(opts::basecall != opts::no_basecall);
1022 |     //
1023 |     // set single/double strand scaling option
1024 |     //
1025 |     if (opts::train and not opts::no_train_scaling)
1026 |     {
1027 |         if (opts::single_strand_scaling and opts::double_strand_scaling)
1028 |         {
1029 |             LOG(error)
1030 |                 << "either --single-strand-scaling or --double-strand-scaling may be used, but not both" << endl;
1031 |             return EXIT_FAILURE;
1032 |         }
1033 |         else if (not opts::single_strand_scaling and not opts::double_strand_scaling)
1034 |         {
1035 |             // by default, do double strand scaling
1036 |             opts::double_strand_scaling.set(true);
1037 |         }
1038 |     }
1039 |     //
1040 |     // check other options
1041 |     //
1042 |     if (opts::scaling_select_threshold.get() < 0.0)
1043 |     {
1044 |         LOG(error)
1045 |             << "invalid scaling_select_threshold: " << opts::scaling_select_threshold.get() << endl;
1046 |         return EXIT_FAILURE;
1047 |     }
1048 |     if (opts::scaling_min_progress < 0.0)
1049 |     {
1050 |         LOG(error)
1051 |             << "invalid scaling_min_progress: " << opts::scaling_min_progress.get() << endl;
1052 |         return EXIT_FAILURE;
1053 |     }
1054 |     if (not opts::output_fn.get().empty() and opts::write_fast5)
1055 |     {
1056 |         LOG(error)
1057 |             << "output may be written to fast5 files or to a single output file, but not both" << endl;
1058 |         return EXIT_FAILURE;
1059 |     }
1060 |     //
1061 |     // print training options
1062 |     //
1063 |     LOG(info) << "train=" << opts::train.get() << endl;
1064 |     if (opts::train)
1065 |     {
1066 |         LOG(info) << "train_scaling=" << not opts::no_train_scaling.get() << endl;
1067 |         LOG(info) << "train_transitions=" << not opts::no_train_transitions.get() << endl;
1068 |         if (not opts::no_train_scaling)
1069 |         {
1070 |             LOG(info) << "double_strands_scaling=" << opts::double_strand_scaling.get() << endl;
1071 |             LOG(info) << "scaling_num_events=" << opts::scaling_num_events.get() << endl;
1072 |             LOG(info) << "scaling_max_rounds=" << opts::scaling_max_rounds.get() << endl;
1073 |             LOG(info) << "scaling_min_progress=" << opts::scaling_min_progress.get() << endl;
1074 |             LOG(info) << "scaling_select_threshold=" << opts::scaling_select_threshold.get() << endl;
1075 |             LOG(info) << "train_drift=" << opts::train_drift.get() << endl;
1076 |         }
1077 |     }
1078 |     LOG(info) << "basecall=" << opts::basecall.get() << endl;
1079 |     return real_main();
1080 | }
1081 | 


--------------------------------------------------------------------------------
/src/nanocall/run-fwbw.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <string>
  3 | #include <tclap/CmdLine.h>
  4 | 
  5 | #include "Pore_Model.hpp"
  6 | #include "State_Transitions.hpp"
  7 | #include "Event.hpp"
  8 | #include "Forward_Backward.hpp"
  9 | #include "Forward_Backward_Custom.hpp"
 10 | #include "logger.hpp"
 11 | #include "zstr.hpp"
 12 | 
 13 | using namespace std;
 14 | 
 15 | #ifndef FLOAT_TYPE
 16 | #define FLOAT_TYPE float
 17 | #endif
 18 | #ifndef KMER_SIZE
 19 | #define KMER_SIZE 6
 20 | #endif
 21 | typedef State_Transitions< FLOAT_TYPE, KMER_SIZE > State_Transitions_Type;
 22 | typedef Pore_Model< FLOAT_TYPE, KMER_SIZE > Pore_Model_Type;
 23 | typedef Event< FLOAT_TYPE, KMER_SIZE> Event_Type;
 24 | typedef Event_Sequence< FLOAT_TYPE, KMER_SIZE > Event_Sequence_Type;
 25 | typedef Forward_Backward< FLOAT_TYPE, KMER_SIZE > Forward_Backward_Type;
 26 | typedef Forward_Backward_Custom< FLOAT_TYPE, KMER_SIZE > Forward_Backward_Custom_Type;
 27 | 
 28 | namespace opts
 29 | {
 30 |     using namespace TCLAP;
 31 |     string description =
 32 |         "Given a scaled pore model, a state trasition table, and a sequence of events, "
 33 |         "compute the state distribution conditioned on the prefix event sequence";
 34 |     CmdLine cmd_parser(description);
 35 |     MultiArg< string > log_level("d", "log-level", "Log level.", false, "string", cmd_parser);
 36 |     ValueArg< string > pm_file_name("p", "pore-model", "Scaled pore model file name.", true, "", "file", cmd_parser);
 37 |     ValueArg< string > st_file_name("s", "state-transitions", "State transitions file name.", true, "", "file", cmd_parser);
 38 |     ValueArg< string > ev_file_name("e", "events", "Events file name.", true, "", "file", cmd_parser);
 39 |     ValueArg< string > output_file_name("o", "output", "Output file name.", false, "", "file", cmd_parser);
 40 |     SwitchArg custom_fwbw("", "custom-fwbw", "Use custom fwbw.", cmd_parser);
 41 | } // namespace opts
 42 | 
 43 | void real_main()
 44 | {
 45 |     Pore_Model_Type pm;
 46 |     //Pore_Model_Parameters<> params;
 47 |     State_Transitions_Type st;
 48 |     Event_Sequence_Type ev;
 49 |     zstr::ifstream(opts::pm_file_name) >> pm;
 50 |     zstr::ifstream(opts::st_file_name) >> st;
 51 |     {
 52 |         zstr::ifstream ifs(opts::ev_file_name);
 53 |         Event_Type e;
 54 |         while (ifs >> e)
 55 |         {
 56 |             ev.push_back(e);
 57 |         }
 58 |     }
 59 | 
 60 |     Forward_Backward_Type fwbw;
 61 |     Forward_Backward_Custom_Type fwbw_custom;
 62 |     if (not opts::custom_fwbw)
 63 |     {
 64 |         fwbw.fill(pm, st, ev);
 65 |     }
 66 |     else
 67 |     {
 68 |         fwbw_custom.fill(pm, st, ev);
 69 |     }
 70 | 
 71 |     // print all kmers with posterior >= .1 for the middle event
 72 |     multiset< pair< FLOAT_TYPE, unsigned > > s;
 73 |     for (unsigned j = 0; j < pm.n_states; ++j)
 74 |     {
 75 |         FLOAT_TYPE v = exp(not opts::custom_fwbw
 76 |                            ? fwbw.log_posterior(ev.size() / 2, j)
 77 |                            : fwbw_custom.log_posterior(ev.size() / 2, j));
 78 |         if (v >= .1)
 79 |         {
 80 |             s.insert(make_pair(v, j));
 81 |         }
 82 |     }
 83 |     while (not s.empty())
 84 |     {
 85 |         auto it = prev(s.end());
 86 |         cout << Forward_Backward_Type::Kmer_Type::to_string(it->second) << '\t' << it->first << endl;
 87 |         s.erase(it);
 88 |     }
 89 | 
 90 |     if (not opts::output_file_name.get().empty())
 91 |     {
 92 |         strict_fstream::ofstream(opts::output_file_name) << fwbw;
 93 |     }
 94 | }
 95 | 
 96 | int main(int argc, char * argv[])
 97 | {
 98 |     opts::cmd_parser.parse(argc, argv);
 99 |     logger::Logger::set_levels_from_options(opts::log_level);
100 |     real_main();
101 | }
102 | 


--------------------------------------------------------------------------------
/src/nanocall/run-viterbi.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <string>
 3 | #include <tclap/CmdLine.h>
 4 | 
 5 | #include "Pore_Model.hpp"
 6 | #include "State_Transitions.hpp"
 7 | #include "Event.hpp"
 8 | #include "Viterbi.hpp"
 9 | #include "logger.hpp"
10 | #include "zstr.hpp"
11 | 
12 | using namespace std;
13 | 
14 | #ifndef FLOAT_TYPE
15 | #define FLOAT_TYPE float
16 | #endif
17 | #ifndef KMER_SIZE
18 | #define KMER_SIZE 6
19 | #endif
20 | typedef State_Transitions< FLOAT_TYPE, KMER_SIZE > State_Transitions_Type;
21 | typedef Pore_Model< FLOAT_TYPE, KMER_SIZE > Pore_Model_Type;
22 | typedef Event< FLOAT_TYPE, KMER_SIZE > Event_Type;
23 | typedef Event_Sequence< FLOAT_TYPE, KMER_SIZE > Event_Sequence_Type;
24 | typedef Viterbi< FLOAT_TYPE, KMER_SIZE > Viterbi_Type;
25 | 
26 | namespace opts
27 | {
28 |     using namespace TCLAP;
29 |     string description =
30 |         "Run Viterbi on given input";
31 |     CmdLine cmd_parser(description);
32 |     MultiArg< string > log_level("d", "log-level", "Log level.", false, "string", cmd_parser);
33 |     ValueArg< string > pm_file_name("p", "pore-model", "Scaled pore model file name.", true, "", "file", cmd_parser);
34 |     ValueArg< string > st_file_name("s", "state-transitions", "State transitions file name.", true, "", "file", cmd_parser);
35 |     ValueArg< string > ev_file_name("e", "events", "Events file name.", true, "", "file", cmd_parser);
36 | } // namespace opts
37 | 
38 | void real_main()
39 | {
40 |     Pore_Model_Type pm;
41 |     State_Transitions_Type st;
42 |     Event_Sequence_Type ev;
43 |     zstr::ifstream(opts::pm_file_name) >> pm;
44 |     zstr::ifstream(opts::st_file_name) >> st;
45 |     {
46 |         zstr::ifstream ifs(opts::ev_file_name);
47 |         Event_Type e;
48 |         while (ifs >> e)
49 |         {
50 |             ev.push_back(e);
51 |         }
52 |     }
53 | 
54 |     Viterbi_Type vit;
55 |     vit.fill(pm, st, ev);
56 |     cout << ev.get_base_seq() << std::endl;
57 | }
58 | 
59 | int main(int argc, char * argv[])
60 | {
61 |     opts::cmd_parser.parse(argc, argv);
62 |     logger::Logger::set_levels_from_options(opts::log_level);
63 |     real_main();
64 | }
65 | 


--------------------------------------------------------------------------------
/src/version/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | message(STATUS "Processing: ${CMAKE_CURRENT_SOURCE_DIR}")
2 | 
3 | add_library(version STATIC version.cpp)
4 | add_dependencies(version package_version)
5 | 


--------------------------------------------------------------------------------
/src/version/version.cpp:
--------------------------------------------------------------------------------
1 | #include "version.hpp"
2 | #include "package_version.h"
3 | 
4 | char const * const package_version = PACKAGE_VERSION;
5 | 


--------------------------------------------------------------------------------
/src/version/version.hpp:
--------------------------------------------------------------------------------
1 | #ifndef __VERSION_HPP
2 | #define __VERSION_HPP
3 | 
4 | extern char const * const package_version;
5 | 
6 | #endif
7 | 


--------------------------------------------------------------------------------