├── .VERSION.in ├── .gitignore ├── .gitmodules ├── .travis.yml ├── .version_files ├── Dockerfile ├── Dockerfile.slim.in ├── HACKING.org ├── LICENSE ├── README.org ├── VERSION ├── context └── nanocall │ └── .gitignore ├── script ├── build-slim-docker-image └── get-lddtree └── src ├── .gitignore ├── CMakeLists.txt ├── builtin_models ├── .gitignore ├── builtin_model_init_lists.inl ├── builtin_model_names.inl ├── builtin_model_num.inl ├── builtin_model_strands.inl ├── make-builtin-model-initializers ├── r73.c.p1.006.ont.model ├── r73.c.p2.006.ont.model └── r73.t.006.ont.model ├── cmake └── FindHDF5.cmake ├── get-dir-version ├── nanocall ├── Builtin_Model.cpp ├── Builtin_Model.hpp ├── CMakeLists.txt ├── Event.hpp ├── Fast5_Summary.hpp ├── Forward_Backward.hpp ├── Forward_Backward_Custom.hpp ├── Kmer.hpp ├── Parameter_Trainer.hpp ├── Pore_Model.hpp ├── State_Transitions.hpp ├── Viterbi.hpp ├── compute-scaled-pore-model.cpp ├── compute-state-transitions.cpp ├── fs_support.hpp ├── global_assert.hpp ├── list-directory.cpp ├── nanocall.cpp ├── run-fwbw.cpp └── run-viterbi.cpp └── version ├── CMakeLists.txt ├── version.cpp └── version.hpp /.VERSION.in: -------------------------------------------------------------------------------- 1 | ${VERSION} 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build* 2 | /local* 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/fast5"] 2 | path = src/fast5 3 | url = https://github.com/mateidavid/fast5.git 4 | [submodule "src/tclap"] 5 | path = src/tclap 6 | url = https://github.com/mateidavid/tclap.git 7 | [submodule "src/hpptools"] 8 | path = src/hpptools 9 | url = https://github.com/mateidavid/hpptools.git 10 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # travis.yml for github.com/mateidavid/nanocall 2 | 3 | sudo: required 4 | 5 | services: docker 6 | 7 | before_install: 8 | - sudo apt-get update -y 9 | - sudo apt-get install -y -o Dpkg::Options::="--force-confnew" docker-engine 10 | 11 | install: script/build-slim-docker-image 12 | 13 | before_script: docker images --all --no-trunc 14 | 15 | script: docker run --rm nanocall 16 | -------------------------------------------------------------------------------- /.version_files: -------------------------------------------------------------------------------- 1 | VERSION 2 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stable 2 | MAINTAINER Matei David 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | 5 | # install prerequisites 6 | RUN for i in 1 2 3; do \ 7 | apt-get update \ 8 | && break; sleep 1; \ 9 | done && \ 10 | for i in 1 2 3; do \ 11 | apt-get install -y \ 12 | build-essential \ 13 | cmake \ 14 | libhdf5-dev \ 15 | && break; sleep 1; \ 16 | done 17 | 18 | # if necessary, specify compiler 19 | #RUN apt-get install -y g++-4.9 g++-5 g++-6 20 | #ENV CC=gcc-4.9 21 | #ENV CXX=g++-4.9 22 | 23 | # add source 24 | ADD . 
/src/ 25 | 26 | # build and install 27 | RUN mkdir -p /src/build && \ 28 | cd /src/build && \ 29 | cmake ../src && \ 30 | make && \ 31 | make install 32 | 33 | VOLUME ["/data"] 34 | WORKDIR /data 35 | ENTRYPOINT ["/usr/local/bin/nanocall"] 36 | CMD ["--version"] 37 | -------------------------------------------------------------------------------- /Dockerfile.slim.in: -------------------------------------------------------------------------------- 1 | FROM debian:stable 2 | MAINTAINER Matei David 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | 5 | ADD lddtree.tgz / 6 | 7 | # use host timezone 8 | ENV TZ=${TZ} 9 | RUN ln -snf /usr/share/zoneinfo/${TZ} /etc/localtime && echo ${TZ} > /etc/timezone 10 | 11 | # use host id 12 | RUN groupadd --gid ${GROUP_ID} ${GROUP_NAME} 13 | RUN useradd --create-home --uid ${USER_ID} --gid ${GROUP_ID} ${USER_NAME} 14 | USER ${USER_NAME} 15 | 16 | VOLUME ["/data"] 17 | WORKDIR /data 18 | ENTRYPOINT ["/usr/local/bin/nanocall"] 19 | CMD ["--version"] 20 | -------------------------------------------------------------------------------- /HACKING.org: -------------------------------------------------------------------------------- 1 | # -*- mode:org; mode:visual-line; coding:utf-8; -*- 2 | 3 | **** Change build type 4 | 5 | The default build type is =Release= (optimizations, no assertions). If any crashes are experienced, the first step in addressing them is to redo the run with a =Test= (optimizations, assertions) or =Debug= (no optimizations, assertions) build type. This is achieved using, e.g. =-DCMAKE_BUILD_TYPE=Test=. 6 | 7 | **** Using other packaged tools 8 | 9 | *Note*: The various tools are only built in =Test= or =Debug= build types. 
10 | 11 | #+BEGIN_EXAMPLE 12 | FAST5_FILE=$SIMPSONLAB/data/nanopore/ecoli/sqk006/pass/LomanLabz_PC_Ecoli_K12_MG1655_20150924_MAP006_1_5005_1_ch9_file72_strand.fast5 13 | nanocall ${FAST5_FILE} > ${FAST5_FILE}.fa 14 | compute-state-transitions -p .001 -t .1 -k .1 >transitions.tsv 15 | compute-scaled-pore-model -f $FAST5_FILE >model.tsv 16 | get_events $FAST5_FILE | egrep -v '^(#|mean)' | tawk '{print $1,$3,$2,$4}' >events.tsv 17 | run-viterbi -d info -p model.tsv -s transitions.tsv -e events.tsv | { echo ">$(basename $FAST5_FILE)"; cat; } >out.fa 18 | run-viterbi -d debug -p model.tsv -s transitions.tsv -e <(awk 'NR>=100 && NR<200' events.tsv) |& tee log 19 | run-fwbw -d info -p model.tsv -s transitions.tsv -e <(awk 'NR>=100 && NR<200' events.tsv) -o fwbw.tsv 20 | #+END_EXAMPLE 21 | 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Matei David, Ontario Institute for Cancer Research 4 | Copyright (c) 2015 Jared Simpson, Ontario Institute for Cancer Research 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | # -*- mode:org; mode:visual-line; coding:utf-8; -*- 2 | 3 | ** Nanocall: An Oxford Nanopore Basecaller 4 | 5 | [[http://travis-ci.org/mateidavid/nanocall][http://travis-ci.org/mateidavid/nanocall.svg?branch=master]] [[https://tldrlegal.com/license/mit-license][http://img.shields.io/:license-mit-blue.svg]] 6 | 7 | *** Introduction 8 | 9 | Nanocall is an alternative, open source, MIT licensed, basecaller for Oxford Nanopore Technologies (ONT) sequencing data. Published in [[https://doi.org/10.1093/bioinformatics/btw569][Bioinformatics, 2016]]. 10 | 11 | For the official ONT basecaller, see [[https://metrichor.com/s/][Metrichor]]. 12 | 13 | **** Usefulness of Nanocall on recent ONT sequencing data 14 | 15 | To understand the usefulness of Nanocall compared to Metrichor, some background is in order. 16 | 17 | *Before summer 2016*: Before the summer of 2016, and the release of the R9 sequencing pore: 18 | 19 | - Metrichor was the only available basecaller for ONT data. 20 | - Metrichor's source code was closed. 21 | - Metrichor was only available as a cloud service. 22 | 23 | This state of affairs prompted us to develop Nanocall as an open-source local basecaller alternative to Metrichor. 24 | 25 | *After summer 2016*: The summer of 2016 has brought along several significant developments from ONT: 26 | 27 | - A new sequencing pore R9 was released: ([[https://nanoporetech.com/about-us/news/update-new-r9-nanopore-faster-more-accurate-sequencing-and-new-ten-minute-preparation][ONT Press Release, May 2016]]). 
28 | - The Metrichor source code was opened (under a development license). 29 | - ONT provided an official option for local basecalling: ([[https://nanoporetech.com/about-us/news/local-basecalling-now-available-enabling-minion-usage-field][ONT Press Release, Aug 2016]]). 30 | 31 | As a result, Nanocall's usefulness is now limited to: 32 | 33 | - a platform for developing new basecalling ideas, and 34 | - situations where, for various reasons, you do not have access to the official ONT basecaller(s). 35 | 36 | If you want to use Nanocall on R9 data, Nanocall does support it directly, but its accuracy is significantly lower than that of Metrichor (unlike the case of R7.3, where the two had similar accuracy). The reason for the discrepancy is that Metrichor on R9 uses a more elaborate RNN-based approach, compared to the simple HMM-based one in Nanocall. 37 | 38 | **** Levels of ONT sequencing data 39 | 40 | Most people are only used to dealing with DNA bases. However, to understand where Nanocall fits in, we observe that there are 3 levels of ONT sequencing data: 41 | 42 | - Raw samples. These are direct (picoamp) current measurements, taken at preset intervals as the DNA molecule is threaded through the pore. This data is passed through the USB cable from the MinION to the controlling laptop running MinKNOW. These are stored in =fast5= files at paths such as =/Raw/Reads/Read_29/Signal=. 43 | 44 | - Events. Each event is an aggregation of multiple consecutive raw samples, (ideally) corresponding to a certain DNA context found in the pore. The process of computing events from raw samples is referred to as /event detection/. These are stored in =fast5= files at paths such as =/Analyses/EventDetection_000/Reads/Read_29/Events=. 45 | 46 | - DNA bases. These are the usual, finished product. The process of computing DNA bases from events is referred to as /basecalling/. These are stored in =fast5= files at paths such as =/Analyses/Basecall_2D_000/BaseCalled_2D/Fastq=. 
47 | 48 | On R7.3, event detection was performed locally by MinKNOW, and events were passed on to, and used by, Metrichor. Since Nanocall was developed as an alternative local basecaller for R7.3 data, /Nanocall is designed to work with events, not with raw samples/. 49 | 50 | On (at least some versions of) R9, Metrichor would entirely redo the event detection directly from raw samples, disregarding any event detection done locally by MinKNOW. As such, it is more common with R9 (than with R7.3) to see =fast5= files without events. Nanocall cannot be run directly on such files. To use Nanocall on R9 data, you must either configure MinKNOW to perform local event detection, or pass the files through Metrichor to use its event detection. 51 | 52 | *** Installation 53 | 54 | Nanocall can be built from source in a classical UNIX environment, or directly under [[https://www.docker.com/what-docker][Docker]]. The Docker build might run under Windows, though this is not tested. 55 | 56 | **** Under a Classical UNIX Environment 57 | 58 | Nanocall uses =cmake= for configuration and =make= for building. The prerequisites needed for building are =zlib= and =hdf5=. On UNIX systems, =hdf5= can be optionally built as a submodule. 59 | Example build: 60 | 61 | #+BEGIN_EXAMPLE 62 | mkdir /some/source/dir && cd /some/source/dir 63 | git clone --recursive https://github.com/mateidavid/nanocall.git 64 | cd nanocall 65 | mkdir build && cd build 66 | cmake ../src [-DCMAKE_INSTALL_PREFIX=/some/install/dir] [-DBUILD_HDF5=1] [-DHDF5_ROOT=/path/to/hdf5] 67 | make 68 | make install 69 | /some/install/dir/bin/nanocall --version 70 | #+END_EXAMPLE 71 | 72 | *Notes*: 73 | 74 | - The default install prefix is =/usr/local=. 75 | 76 | - Setting =BUILD_HDF5= will cause =hdf5= to be downloaded and built as a submodule. 77 | 78 | - Setting =HDF5_ROOT= is only necessary if a copy of =hdf5= is installed in a non-standard location. This is not needed when =BUILD_HDF5= is used. 
79 | 80 | **** Under Docker 81 | 82 | To avoid dealing with prerequisites, Nanocall can be conveniently built under Docker. The installation and configuration of Docker itself is outside of the scope of this document. 83 | 84 | ***** Simple "fat" build 85 | 86 | The simplest way to run Nanocall under Docker is: 87 | 88 | #+BEGIN_EXAMPLE 89 | docker build -t nanocall https://github.com/mateidavid/nanocall.git 90 | docker run --rm nanocall --version 91 | docker run --rm -u $(id -u):$(id -g) -v /path/to/data:/data nanocall -t 4 . >output.fa 92 | #+END_EXAMPLE 93 | 94 | However, there are several problems with this build: 95 | 96 | - The docker image is "fat", in that it contains all the build time dependencies of Nanocall, which are not needed at run time. 97 | 98 | - Without using =-u=, the image will create files with a UID of 0 on the mounted volumes of the host. To remove them, you will have to use =sudo rm= or =sudo chown=. 99 | 100 | - The timezone inside the image might be different from the host. This might confuse programs which depend on comparing modification times, most notably =make=. 101 | 102 | ***** Alternate "slim" build 103 | 104 | To alleviate the problems mentioned above, you can build a "slim" Docker image as follows: 105 | 106 | #+BEGIN_EXAMPLE 107 | git clone --recursive --depth 1 https://github.com/mateidavid/nanocall.git 108 | nanocall/script/build-slim-docker-image 109 | docker run --rm nanocall --version 110 | docker run --rm -v /path/to/data:/data nanocall -t 4 . 
>output.fa 111 | #+END_EXAMPLE 112 | 113 | *** Usage Examples 114 | 115 | #+BEGIN_EXAMPLE 116 | # Check version 117 | nanocall --version 118 | 119 | # Check command line parameters 120 | nanocall --help 121 | 122 | # Run on single file, save output and log 123 | nanocall /path/to/file.fast5 >output.fa 2>log 124 | 125 | # Run on directory, using 24 threads, discard log 126 | nanocall -t 24 /path/to/data >output.fa 2>/dev/null 127 | 128 | # Run on file-of-file-names 129 | nanocall /path/to/files.fofn >output.fa 130 | 131 | # Run Docker build on directory, using 4 threads 132 | # Note: -u is not needed with the "slim" build 133 | docker run --rm -u $(id -u):$(id -g) -v /path/to/data:/data nanocall -t 4 . >output.fa 134 | #+END_EXAMPLE 135 | 136 | *** License 137 | 138 | Released under the [[file:LICENSE][MIT license]]. 139 | 140 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.7.4 2 | -------------------------------------------------------------------------------- /context/nanocall/.gitignore: -------------------------------------------------------------------------------- 1 | /Dockerfile 2 | /lddtree.tgz 3 | -------------------------------------------------------------------------------- /script/build-slim-docker-image: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eEu 3 | trap 'echo "$0: line $LINENO: exit code $?" 
>&2' ERR 4 | log () { echo "$@" >&2; } 5 | crash () { log "error: $@"; exit 1; } 6 | 7 | _prog_path=$(readlink -e "${BASH_SOURCE[0]}") 8 | _prog_name=$(basename "$_prog_path") 9 | _prog_dir=$(dirname "$_prog_path") 10 | _prog_args=() 11 | 12 | export TZ=$(cat /etc/timezone) 13 | export USER_ID=$(id -u) 14 | export USER_NAME=$(id -un) 15 | export GROUP_ID=$(id -g) 16 | export GROUP_NAME=$(id -gn) 17 | ROOT_DIR=$(cd "$_prog_dir"/..; pwd -P) 18 | 19 | # build default fat image 20 | docker build -t nanocall:build "$ROOT_DIR" 21 | 22 | # extract lddtree for nanocall 23 | mkdir -p "$ROOT_DIR/build-slim-image" 24 | docker run --rm -v "$ROOT_DIR":/data --entrypoint=/bin/bash nanocall:build -c 'apt-get install -y pax-utils >/dev/null 2>&1 && /data/script/get-lddtree /usr/local/bin/nanocall' >"$ROOT_DIR"/build-slim-image/lddtree.tgz 25 | 26 | # remove fat image 27 | docker rmi nanocall:build 28 | 29 | # create slim Dockerfile 30 | envsubst <"$ROOT_DIR/Dockerfile.slim.in" >"$ROOT_DIR/build-slim-image/Dockerfile" 31 | 32 | # build slim image 33 | docker build -t nanocall "$ROOT_DIR/build-slim-image" 34 | -------------------------------------------------------------------------------- /script/get-lddtree: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eEu 3 | trap 'echo "$0: line $LINENO: exit code $?" 
>&2' ERR 4 | log () { echo "$@" >&2; } 5 | crash () { log "error: $@"; exit 1; } 6 | 7 | _prog_path=$(readlink -e "${BASH_SOURCE[0]}") 8 | _prog_name=$(basename "$_prog_path") 9 | _prog_dir=$(dirname "$_prog_path") 10 | _prog_args=() 11 | 12 | lddtree -a -l "$@" | 13 | sort | 14 | uniq | 15 | tar -chvzf - --files-from - 16 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | /optional* 2 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | message(STATUS "Processing: ${CMAKE_CURRENT_SOURCE_DIR}") 2 | 3 | # set build type 4 | if(NOT CMAKE_BUILD_TYPE) 5 | set(CMAKE_BUILD_TYPE "Release" CACHE STRING 6 | "Choose the type of build, options are: Debug Test Release GProf GProfRel." 7 | FORCE) 8 | endif() 9 | message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") 10 | if(NOT ${CMAKE_BUILD_TYPE} AND NOT ${CMAKE_BUILD_TYPE} STREQUAL "Release") 11 | set(CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Enable verbose make output.") 12 | message(STATUS "Enabling verbose make output.") 13 | endif() 14 | 15 | # general cmake settings 16 | cmake_minimum_required(VERSION 2.8.12) 17 | project(NANOCALL C CXX) 18 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") 19 | include(CheckLibraryExists) 20 | include(CheckIncludeFileCXX) 21 | include(ExternalProject) 22 | 23 | # force out-of-source build 24 | if(${PROJECT_BINARY_DIR} STREQUAL ${PROJECT_SOURCE_DIR}) 25 | message(FATAL_ERROR "In-source build not supported.") 26 | endif() 27 | # check that "nanocall.cpp" exists; if not, assume the source dir is wrong 28 | if(NOT EXISTS "${PROJECT_SOURCE_DIR}/nanocall/nanocall.cpp") 29 | message(FATAL_ERROR "${PROJECT_SOURCE_DIR}: source files not found") 30 | endif() 31 | 32 | # set project-related variables 33 | 
set(PACKAGE_BUGREPORT "mdavid@oicr.on.ca") 34 | set(PACKAGE_URL "https://github.com/jts/nanocall") 35 | set(PACKAGE "${PROJECT_NAME}") 36 | set(PACKAGE_NAME "${PROJECT_NAME}") 37 | set(PACKAGE_TARNAME "${PROJECT_NAME}") 38 | 39 | # directory where to build optional submodules 40 | set(OPTIONAL_SUBMODULE_PREFIX 41 | ${PROJECT_SOURCE_DIR}/optional 42 | CACHE INTERNAL "Directory for installing optional submodules") 43 | 44 | # header and source directories 45 | set(SUBDIRS 46 | nanocall version 47 | CACHE INTERNAL "Subdirectories to descend into") 48 | set(HEADER_SUBDIRS 49 | builtin_models fast5/src hpptools/include tclap/include 50 | nanocall version 51 | CACHE INTERNAL "Subdirectories containing header files") 52 | 53 | ### Resolve external dependencies 54 | # 55 | # prefer static libraries 56 | #set(CMAKE_FIND_LIBRARY_SUFFIXES .a .so) 57 | 58 | # bake-in RPATH to prevent library search problems 59 | set(CMAKE_INSTALL_RPATH_USE_LINK_PATH true) 60 | set(CMAKE_BUILD_WITH_INSTALL_RPATH true) 61 | set(CMAKE_LIBRARY_PATH $ENV{LD_LIBRARY_PATH}) 62 | 63 | # find zlib 64 | # => ZLIB_INCLUDE_DIRS, ZLIB_LIBRARIES 65 | find_package(ZLIB REQUIRED) 66 | 67 | # find HDF5 68 | # => HDF5_INCLUDE_DIRS, HDF5_LIBRARIES 69 | if(BUILD_HDF5) 70 | message(STATUS "Building HDF5 in: ${OPTIONAL_SUBMODULE_PREFIX}") 71 | if(UNIX) 72 | # use local copy if available 73 | if(EXISTS /tmp/hdf5-1.8.16.tar.bz2) 74 | set(HDF5_URL /tmp/hdf5-1.8.16.tar.bz2) 75 | else() 76 | set(HDF5_URL http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.8.16/src/hdf5-1.8.16.tar.bz2) 77 | endif() 78 | ExternalProject_Add(hdf5 79 | PREFIX ${OPTIONAL_SUBMODULE_PREFIX} 80 | URL ${HDF5_URL} 81 | URL_MD5 79c1593573ebddf734eee8d43ecfe483 82 | CONFIGURE_COMMAND ./configure --prefix=${OPTIONAL_SUBMODULE_PREFIX} --disable-hl --disable-shared --enable-threadsafe 83 | BUILD_IN_SOURCE 1 84 | BUILD_COMMAND make 85 | INSTALL_COMMAND make install 86 | ) 87 | set(HDF5_INCLUDE_DIRS ${OPTIONAL_SUBMODULE_PREFIX}/include) 88 | 
set(HDF5_LIBRARIES ${OPTIONAL_SUBMODULE_PREFIX}/lib/libhdf5.a) 89 | elseif(WIN32) 90 | message(FATAL_ERROR "Building HDF5 not supported on Windows") 91 | endif() 92 | else() 93 | find_package(HDF5 1.8.11 REQUIRED) 94 | endif() 95 | 96 | # Problem: 97 | # 98 | # By default, if using target_link_libraries(... ${HDF5_LIBRARIES}), cmake 99 | # attempts to use the short name "-lhdf5" as an argument to ld, *without* setting 100 | # "-L" appropriately. It's unclear why, or if this is related to LD_LIBRARY_PATH. 101 | # The problem arises if an older libhdf5 is found instead. The solution below 102 | # uses an IMPORTED library. In this case, cmake will use the full path during 103 | # linking. 104 | # 105 | # Also, setting up RPATH appropriately using cmake is... elusive. If baking-in of 106 | # linker paths is needed, be sure to use LD_RUN_PATH during make; this will be 107 | # honoured by g++. 108 | # 109 | # Refs: 110 | # https://cmake.org/pipermail/cmake/2013-December/056655.html 111 | # https://cmake.org/Wiki/CMake/Tutorials/Exporting_and_Importing_Targets 112 | # 113 | add_library(libhdf5 UNKNOWN IMPORTED) 114 | set_property(TARGET libhdf5 PROPERTY IMPORTED_LOCATION ${HDF5_LIBRARIES}) 115 | #get_filename_component(HDF5_LIBRARIES_DIR ${HDF5_LIBRARIES} DIRECTORY) 116 | #set(CMAKE_INSTALL_RPATH ${HDF5_LIBRARIES_DIR}) 117 | 118 | message(STATUS "HDF5_INCLUDE_DIRS=${HDF5_INCLUDE_DIRS}") 119 | message(STATUS "HDF5_LIBRARIES=${HDF5_LIBRARIES}") 120 | 121 | # message(STATUS "CMAKE_LIBRARY_PATH=${CMAKE_LIBRARY_PATH}") 122 | # message(STATUS "CMAKE_SYSTEM_LIBRARY_PATH=${CMAKE_SYSTEM_LIBRARY_PATH}") 123 | # message(STATUS "CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES=${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") 124 | # message(STATUS "CMAKE_INSTALL_RPATH=${CMAKE_INSTALL_RPATH}") 125 | # message(STATUS "CMAKE_INSTALL_RPATH_USE_LINK_PATH=${CMAKE_INSTALL_RPATH_USE_LINK_PATH}") 126 | # message(STATUS "CMAKE_BUILD_WITH_INSTALL_RPATH=${CMAKE_BUILD_WITH_INSTALL_RPATH}") 127 | # 
message(STATUS "CMAKE_SKIP_BUILD_RPATH=${CMAKE_SKIP_BUILD_RPATH}") 128 | 129 | ### Prepare default compilation flags 130 | # 131 | # headers 132 | include_directories(SYSTEM 133 | ${ZLIB_INCLUDE_DIRS} 134 | ${HDF5_INCLUDE_DIRS} 135 | ) 136 | include_directories( 137 | ${PROJECT_BINARY_DIR} 138 | ${HEADER_SUBDIRS} 139 | ) 140 | get_directory_property(include_directories INCLUDE_DIRECTORIES) 141 | message(STATUS "INCLUDE_DIRECTORIES='${include_directories}'") 142 | 143 | ### general compile flags 144 | set(EXTRA_FLAGS "-std=c++11 -pthread -Wall -Wextra -pedantic") 145 | 146 | # compiler-specific flags 147 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") 148 | set(EXTRA_FLAGS "${EXTRA_FLAGS} -fmax-errors=1") 149 | elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 150 | set(EXTRA_FLAGS "${EXTRA_FLAGS} -ferror-limit=1") 151 | endif() 152 | 153 | # consolidate compile flags 154 | set(CMAKE_CXX_FLAGS "$ENV{CXXFLAGS} ${EXTRA_FLAGS}") 155 | message(STATUS "CMAKE_CXX_FLAGS='${CMAKE_CXX_FLAGS}'") 156 | 157 | ### build-specific compile flags 158 | set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g3 -ggdb -fno-inline -fno-eliminate-unused-debug-types") 159 | set(CMAKE_CXX_FLAGS_TEST "-O3 -g3 -fno-eliminate-unused-debug-types") 160 | set(CMAKE_CXX_FLAGS_TEST_O2 "-O2 -g3 -fno-eliminate-unused-debug-types") 161 | set(CMAKE_CXX_FLAGS_TEST_O1 "-O1 -g3 -fno-eliminate-unused-debug-types") 162 | set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -DDISABLE_ASSERTS") 163 | set(CMAKE_CXX_FLAGS_GPROF "-O3 -g3 -pg") 164 | set(CMAKE_CXX_FLAGS_GPROFREL "-O3 -DNDEBUG -DDISABLE_ASSERTS -pg") 165 | 166 | # link flags 167 | #set(CMAKE_EXE_LINKER_FLAGS "-Wl,-rpath=$ENV{LD_RUN_PATH} -Wl,--as-needed") 168 | if(APPLE) 169 | set(CMAKE_EXE_LINKER_FLAGS "-Wl") 170 | else() 171 | set(CMAKE_EXE_LINKER_FLAGS "-Wl,--as-needed") 172 | endif() 173 | message(STATUS "CMAKE_EXE_LINKER_FLAGS='${CMAKE_EXE_LINKER_FLAGS}'") 174 | 175 | ### This target updates the package version 176 | # 177 | add_custom_target(package_version 178 | 
${PROJECT_SOURCE_DIR}/get-dir-version -v -d ${PROJECT_SOURCE_DIR}/.. -r ${PROJECT_SOURCE_DIR}/.. -o ${PROJECT_BINARY_DIR}/package_version.h -t PACKAGE_VERSION 179 | ) 180 | 181 | ### Enable testing targets 182 | #enable_testing() 183 | 184 | ### Descend into subdirectories 185 | # 186 | foreach(dir ${SUBDIRS}) 187 | add_subdirectory(${dir}) 188 | endforeach() 189 | -------------------------------------------------------------------------------- /src/builtin_models/.gitignore: -------------------------------------------------------------------------------- 1 | *.model 2 | -------------------------------------------------------------------------------- /src/builtin_models/builtin_model_names.inl: -------------------------------------------------------------------------------- 1 | { 2 | "r73.t.006.ont.model" 3 | , 4 | "r73.c.p1.006.ont.model" 5 | , 6 | "r73.c.p2.006.ont.model" 7 | , 8 | "r9.t.007.ont.model" 9 | , 10 | "r9.c.p1.007.ont.model" 11 | , 12 | "r9.c.p2.007.ont.model" 13 | } 14 | -------------------------------------------------------------------------------- /src/builtin_models/builtin_model_num.inl: -------------------------------------------------------------------------------- 1 | 6 2 | -------------------------------------------------------------------------------- /src/builtin_models/builtin_model_strands.inl: -------------------------------------------------------------------------------- 1 | { 2 | 0 3 | , 4 | 1 5 | , 6 | 1 7 | , 8 | 0 9 | , 10 | 1 11 | , 12 | 1 13 | } 14 | -------------------------------------------------------------------------------- /src/builtin_models/make-builtin-model-initializers: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import re 6 | import sys 7 | 8 | description = ''' 9 | Transform the given model files into C++ initializer list suitable for inclusion during preprocessing. 
10 | ''' 11 | default_files = [ 12 | '0:r73.t.006.ont.model', 13 | '1:r73.c.p1.006.ont.model', 14 | '1:r73.c.p2.006.ont.model', 15 | '0:r9.t.007.ont.model', 16 | '1:r9.c.p1.007.ont.model', 17 | '1:r9.c.p2.007.ont.model'] 18 | parser = argparse.ArgumentParser(description=description, epilog='') 19 | parser.add_argument('input', nargs='*', default=default_files, help='Input models, in the form :') 20 | args = parser.parse_args() 21 | 22 | m_strands = list() 23 | m_names = list() 24 | 25 | for fn in args.input: 26 | if fn[0] not in '012' or fn[1] != ':': 27 | print('error parsing model name [' + fn + ']: expecting :', file=sys.stderr) 28 | sys.exit(1) 29 | m_strands.append(int(fn[0])) 30 | m_names.append(fn[2:]) 31 | 32 | f_out = open('builtin_model_num.inl', 'w') 33 | print(str(len(m_names)), file=f_out) 34 | f_out.close() 35 | 36 | f_out = open('builtin_model_names.inl', 'w') 37 | print('{', file=f_out) 38 | for i in range(len(m_names)): 39 | if i > 0: 40 | print(' ,', file=f_out) 41 | print(' "' + m_names[i] + '"', file=f_out) 42 | print('}', file=f_out) 43 | f_out.close() 44 | 45 | f_out = open('builtin_model_strands.inl', 'w') 46 | print('{', file=f_out) 47 | for i in range(len(m_strands)): 48 | if i > 0: 49 | print(' ,', file=f_out) 50 | print(' ' + str(m_strands[i]), file=f_out) 51 | print('}', file=f_out) 52 | f_out.close() 53 | 54 | f_out = open('builtin_model_init_lists.inl', 'w') 55 | print('{', file=f_out) 56 | for i in range(len(m_names)): 57 | if i > 0: 58 | print(' ,', file=f_out) 59 | print(' {', file=f_out) 60 | f = open(m_names[i]) 61 | first_line = True 62 | for line in f: 63 | l = line.strip().split() 64 | if len(l) < 5 or re.search(r'[^ACGT]', l[0].upper()) != None: 65 | continue 66 | if not first_line: 67 | print(' ,', file=f_out) 68 | first_line = False 69 | print(' ' + ', '.join(l[1:5]), file=f_out) 70 | print(' }', file=f_out) 71 | print('}', file=f_out) 72 | f_out.close() 73 | 74 | 
-------------------------------------------------------------------------------- /src/cmake/FindHDF5.cmake: -------------------------------------------------------------------------------- 1 | # Find HDF5 (https://www.hdfgroup.org/) 2 | # Uses hint: 3 | # HDF5_ROOT 4 | # Sets: 5 | # HDF5_FOUND 6 | # HDF5_INCLUDE_DIRS 7 | # HDF5_LIBRARIES 8 | # Saves: 9 | # HDF5_ROOT 10 | # HDF5_INCLUDE_DIRS_CACHED 11 | # HDF5_LIBRARIES_CACHED 12 | 13 | if(NOT "${OLD_HDF5_ROOT}" OR NOT "${HDF5_ROOT}" STREQUAL "${OLD_HDF5_ROOT}") 14 | message(STATUS "Detecting HDF5: redetecing with new HDF5_ROOT=${HDF5_ROOT} (OLD_HDF5_ROOT=${OLD_HDF5_ROOT}).") 15 | unset(HDF5_INCLUDE_DIRS_CACHED CACHE) 16 | unset(HDF5_LIBRARIES_CACHED CACHE) 17 | else() 18 | message(STATUS "Detecting HDF5: HDF5_ROOT=${HDF5_ROOT} is not new; using cached paths.") 19 | message(STATUS "HDF5_INCLUDE_DIRS_CACHED=${HDF5_INCLUDE_DIRS_CACHED}") 20 | message(STATUS "HDF5_LIBRARIES_CACHED=${HDF5_LIBRARIES_CACHED}") 21 | endif() 22 | set(OLD_HDF5_ROOT ${HDF5_ROOT} CACHE INTERNAL "Last used value of HDF5_ROOT") 23 | 24 | # find headers 25 | find_path(HDF5_INCLUDE_DIRS_CACHED H5pubconf.h PATHS ${HDF5_ROOT}/include NO_DEFAULT_PATH) 26 | find_path(HDF5_INCLUDE_DIRS_CACHED H5pubconf.h PATH_SUFFIXES hdf5 hdf5/serial) 27 | if(HDF5_INCLUDE_DIRS_CACHED) 28 | execute_process( 29 | COMMAND grep H5_VERSION ${HDF5_INCLUDE_DIRS_CACHED}/H5pubconf.h 30 | COMMAND awk "{print \$3}" 31 | COMMAND tr -d "\"\n" 32 | OUTPUT_VARIABLE HDF5_INCLUDE_DIRS_VERSION 33 | ) 34 | message(STATUS "Found HDF5 headers version ${HDF5_INCLUDE_DIRS_VERSION} in: ${HDF5_INCLUDE_DIRS_CACHED}") 35 | endif() 36 | 37 | # find library 38 | find_library(HDF5_LIBRARIES_CACHED hdf5 PATHS ${HDF5_ROOT}/lib ${HDF5_ROOT}/lib64 NO_DEFAULT_PATH) 39 | find_library(HDF5_LIBRARIES_CACHED hdf5 PATH_SUFFIXES hdf5 hdf5/serial) 40 | 41 | include(FindPackageHandleStandardArgs) 42 | find_package_handle_standard_args(HDF5 43 | REQUIRED_VARS HDF5_INCLUDE_DIRS_CACHED HDF5_LIBRARIES_CACHED 44 | 
VERSION_VAR HDF5_INCLUDE_DIRS_VERSION 45 | #"HDF5 library (https://www.hdfgroup.org/) not found. Specify location with -DHDF5_ROOT=" 46 | ) 47 | mark_as_advanced(HDF5_INCLUDE_DIRS_CACHED HDF5_LIBRARIES_CACHED) 48 | 49 | if(HDF5_FOUND) 50 | set(HDF5_INCLUDE_DIRS ${HDF5_INCLUDE_DIRS_CACHED}) 51 | set(HDF5_LIBRARIES ${HDF5_LIBRARIES_CACHED}) 52 | endif() 53 | -------------------------------------------------------------------------------- /src/get-dir-version: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | trap 'echo "exit code $?: LINENO=$LINENO BASH_LINENO=\"${BASH_LINENO[@]}\" FUNCNAME=\"${FUNCNAME[@]}\"" >&2' ERR 3 | set -eEu -o pipefail 4 | prog_name=$(basename "$0") 5 | 6 | usage() { 7 | cat <] [-c ] [-f ] [-r ] [-o ] [-t ] 9 | 10 | Determine source tree version using the first method that works: 11 | 1. Change directory to , and run to describe its version. 12 | 2. If exists, read version from that file. 13 | 3. If the directory ends with "-[v].[.]", use the 14 | suffix as version. 15 | 4. Failing all else, set version to "unknown". 16 | Write the version as a single CPP define rule, defining the symbol . Send 17 | the output to . If exists and already contains the correct 18 | definition, do not update it. 19 | 20 | The defaults are: 21 | : . 22 | : git describe --always --dirty --tags 23 | : VERSION 24 | : .. 25 | : stdout 26 | : VERSION 27 | 28 | Other options: 29 | -v : verbose messages 30 | -h : this help 31 | EOF 32 | } 33 | 34 | log() { 35 | ! [ $verbose ] || echo "$@" >&2 36 | } 37 | 38 | dir= 39 | version_command="git describe --always --dirty --tags" 40 | version_file="VERSION" 41 | project_root=.. 
output_file=
define_tag="VERSION"
verbose=

# Parse command-line options; each one overrides the matching default above.
OPTIND=1
while getopts "d:c:f:r:o:t:vh" OPT "$@"; do
    case $OPT in
        d)
            dir=$OPTARG
            ;;
        c)
            version_command=$OPTARG
            ;;
        f)
            version_file=$OPTARG
            ;;
        r)
            project_root=$OPTARG
            ;;
        o)
            output_file=$OPTARG
            ;;
        t)
            define_tag=$OPTARG
            ;;
        v)
            verbose=1
            ;;
        h)
            usage
            exit 0
            ;;
        *)
            usage >&2
            exit 1
            ;;
    esac
done
shift $((OPTIND - 1))

! [ "$dir" ] || cd "$dir"

# Try each version source in turn; the first subshell that succeeds wins:
#   1. the version command, 2. the version file,
#   3. a "-[v]N.N[.N]" suffix of the project root folder name, 4. "unknown".
# The dots in the regex are escaped so that only literal '.' separators match
# (an unescaped '.' would accept any character, e.g. "proj-1x2").
version=$(
    (eval "$version_command" 2>/dev/null && log "got version from: $version_command") ||
    (cat "$version_file" 2>/dev/null && log "got version from file: $version_file") ||
    ([ -d "$project_root" ] &&
        name=$(basename "$(cd "$project_root"; pwd -P)") &&
        [[ "$name" =~ -v?[0-9]+\.[0-9]+(\.[0-9]+)?$ ]] &&
        echo "${name##*-}" &&
        log "got version from project root folder") ||
    (echo "unknown" && log "did not find version"))
# drop one leading "v", if present
version=${version#v}
log "found version=$version"

# If the output file already defines the tag with this exact version, leave it
# untouched (the file is not rewritten).
if [ -r "$output_file" ]; then
    existing_version=$(awk -v tag="$define_tag" '$1=="#define"&&$2==tag {print $3}' <"$output_file" |
                       sed 's/^"//;s/"$//')
    if [ "$existing_version" ]; then
        log "found existing_version=$existing_version"
        if [ "$version" = "$existing_version" ]; then
            log "version up to date"
            exit 0
        fi
    fi
fi

# Emit the define to the output file, or to stdout when no file was given.
echo "#define ${define_tag} \"$version\"" |
if [ "$output_file" ]; then
    cat >"$output_file"
else
    cat
fi
message(STATUS "Processing: ${CMAKE_CURRENT_SOURCE_DIR}")

# main basecaller executable
add_executable(nanocall
    nanocall.cpp
    Builtin_Model.cpp
    )
target_link_libraries(nanocall
    version
    libhdf5
    ${CMAKE_DL_LIBS}
    ${ZLIB_LIBRARIES}
    )
install(TARGETS nanocall RUNTIME DESTINATION bin)

# Helper/diagnostic executables are built for every build type except Release.
# CMAKE_BUILD_TYPE is quoted: when it is empty, the unquoted form expands to
# `if(NOT STREQUAL "Release")`, which is a configure-time error.
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Release")
    add_executable(compute-state-transitions compute-state-transitions.cpp)

    add_executable(compute-scaled-pore-model compute-scaled-pore-model.cpp)
    target_link_libraries(compute-scaled-pore-model libhdf5 ${CMAKE_DL_LIBS} ${ZLIB_LIBRARIES})

    add_executable(run-fwbw run-fwbw.cpp)
    target_link_libraries(run-fwbw ${ZLIB_LIBRARIES})

    add_executable(run-viterbi run-viterbi.cpp)
    target_link_libraries(run-viterbi ${ZLIB_LIBRARIES})

    add_executable(list-directory list-directory.cpp)
endif()
/// One event: summary statistics of a stretch of signal, plus the
/// model state (kmer) and move assigned to it.
///
/// The log_* members cache std::log() of the corresponding fields; call
/// update_logs() after changing mean, corrected_mean or stdv.
template < typename Float_Type, unsigned Kmer_Size >
class Event
{
public:
    Float_Type mean;
    Float_Type corrected_mean;
    Float_Type stdv;
    Float_Type start;
    Float_Type length;
    Float_Type log_mean;
    Float_Type log_corrected_mean;
    Float_Type log_stdv;
    //Float_Type log_start;
    //
    Float_Type orig_mean;
    Float_Type p_model_state;
    std::array< char, Kmer_Size > model_state;
    unsigned model_state_idx;
    int move;

    /// Recompute the cached logarithms.
    /// Requires mean > 0 and corrected_mean > 0 (asserted).
    /// A stdv of exactly 0 is replaced by 0.01 so that its log is finite.
    void update_logs()
    {
        assert(mean > 0);
        log_mean = std::log(mean);
        assert(corrected_mean > 0);
        log_corrected_mean = std::log(corrected_mean);
        if (stdv == 0.0)
        {
            stdv = 0.01;
        }
        log_stdv = std::log(stdv);
        //log_start = std::log(start);
    }

    /// Copy the kmer string s into model_state; s must have exactly Kmer_Size characters.
    void set_model_state(const std::string& s)
    {
        assert(s.size() == Kmer_Size);
        std::copy_n(s.begin(), Kmer_Size, model_state.begin());
    }

    /// Write mean, stdv, start and length, tab-separated.
    friend std::ostream & operator << (std::ostream& os, const Event& ev)
    {
        os << ev.mean << '\t'
           << ev.stdv << '\t'
           << ev.start << '\t'
           << ev.length;
        return os;
    }

    /// Read mean, stdv, start and length; corrected_mean is set to mean and
    /// the cached logarithms are refreshed.
    friend std::istream & operator >> (std::istream& is, Event& ev)
    {
        is >> ev.mean
           >> ev.stdv
           >> ev.start
           >> ev.length;
        ev.corrected_mean = ev.mean;
        ev.update_logs();
        return is;
    }
}; // class Event

/// A vector of events with helpers that operate on the whole sequence.
template < typename Float_Type, unsigned Kmer_Size >
struct Event_Sequence
    : std::vector< Event< Float_Type, Kmer_Size > >
{
    typedef std::vector< Event< Float_Type, Kmer_Size > > Base;
    using Base::Base;

    /// Subtract drift * start from every corrected mean and refresh its log.
    /// NOTE(review): the log is NaN/-inf if a corrected mean becomes <= 0;
    /// unlike update_logs() this is not asserted here.
    void apply_drift_correction(Float_Type drift)
    {
        for (auto& e : *this)
        {
            e.corrected_mean -= drift * e.start;
            e.log_corrected_mean = std::log(e.corrected_mean);
        }
    }

    /// Reconstruct the base sequence from the per-event model states and moves.
    ///
    /// The first event contributes its whole kmer; each following event
    /// contributes its last min(move, Kmer_Size) bases. A negative move
    /// converts to a large unsigned value and is therefore capped at Kmer_Size.
    /// Returns an empty string for an empty sequence (previously this read
    /// element 0 of an empty vector).
    std::string get_base_seq() const
    {
        std::string res;
        const Base& v = *this;
        if (v.empty())
        {
            return res;
        }
        res.assign(v[0].model_state.begin(), v[0].model_state.end());
        for (unsigned i = 1; i < v.size(); ++i)
        {
            const unsigned a = std::min(static_cast< unsigned >(v[i].move), static_cast< unsigned >(Kmer_Size));
            const unsigned b = Kmer_Size - a;
            // consecutive kmers must agree on the overlapping bases
            assert(std::string(v[i - 1].model_state.begin() + a, v[i - 1].model_state.end())
                   == std::string(v[i].model_state.begin(), v[i].model_state.begin() + b));
            res.append(v[i].model_state.begin() + b, v[i].model_state.end());
        }
        return res;
    }
}; // struct Event_Sequence
std::string base_file_name; 32 | std::string read_id; 33 | std::string bc_grp; 34 | std::array< std::array< std::string, 2 >, 3 > preferred_model; 35 | std::map< std::array< std::string, 2 >, Pore_Model_Parameters_Type > pm_params_m; 36 | std::map< std::array< std::string, 2 >, std::array< State_Transition_Parameters_Type, 2 > > st_params_m; 37 | std::array< unsigned, 4 > strand_bounds; 38 | std::array< Float_Type, 2 > time_length; 39 | unsigned num_ed_events; 40 | Float_Type sampling_rate; 41 | Float_Type abasic_level; 42 | bool valid; 43 | bool scale_strands_together; 44 | 45 | // from fast5 file 46 | std::unique_ptr< std::vector< fast5::EventDetection_Event_Entry > > ed_events_ptr; 47 | // filtered 48 | std::array< std::unique_ptr< Event_Sequence_Type >, 2 > events_ptr; 49 | //std::array< Event_Sequence_Type, 2 > events; 50 | 51 | const std::vector< fast5::EventDetection_Event_Entry >& ed_events() const 52 | { 53 | assert(ed_events_ptr); 54 | return *ed_events_ptr; 55 | } 56 | std::vector< fast5::EventDetection_Event_Entry >& ed_events() 57 | { 58 | assert(ed_events_ptr); 59 | return *ed_events_ptr; 60 | } 61 | const Event_Sequence_Type& events(unsigned st) const 62 | { 63 | assert(st < 2); 64 | assert(events_ptr[st]); 65 | return *events_ptr[st]; 66 | } 67 | Event_Sequence_Type& events(unsigned st) 68 | { 69 | assert(st < 2); 70 | assert(events_ptr[st]); 71 | return *events_ptr[st]; 72 | } 73 | 74 | static unsigned& min_ed_events() 75 | { 76 | static unsigned _min_ed_events = 10; 77 | return _min_ed_events; 78 | } 79 | 80 | static unsigned& max_ed_events() 81 | { 82 | static unsigned _max_ed_events = 100000; 83 | return _max_ed_events; 84 | } 85 | 86 | static std::string& eventdetection_group() 87 | { 88 | static std::string _eventdetection_group = "000"; 89 | return _eventdetection_group; 90 | } 91 | 92 | // percent of top events to ignore 93 | static double& abasic_level_top_percent() 94 | { 95 | static double _abasic_level_top_percent = 1.0; 96 | return 
_abasic_level_top_percent; 97 | } 98 | 99 | // what to add to top level 100 | static double& abasic_level_top_offset() 101 | { 102 | static double _abasic_level_top_offset = 0.0; 103 | return _abasic_level_top_offset; 104 | } 105 | 106 | // window size to consider for hairpin detection 107 | static unsigned& hairpin_island_window_size() 108 | { 109 | static unsigned _hairpin_island_window_size = 10; 110 | return _hairpin_island_window_size; 111 | } 112 | 113 | // window load to consider for hairpin detection 114 | static unsigned& hairpin_island_window_load() 115 | { 116 | static unsigned _hairpin_island_window_load = 5; 117 | return _hairpin_island_window_load; 118 | } 119 | 120 | // if set, do not split strands 121 | static unsigned& template_only() 122 | { 123 | static unsigned _template_only = 0; 124 | return _template_only; 125 | } 126 | 127 | // trim margins: after start, before end, before hairpin start, after hairpin end 128 | static std::array< unsigned, 4 >& trim_margins() 129 | { 130 | static std::array< unsigned, 4 > _trim_margins = {{ 50u, 50u, 50u, 50u }}; 131 | return _trim_margins; 132 | } 133 | 134 | Fast5_Summary() : valid(false) {} 135 | Fast5_Summary(const std::string fn, const Pore_Model_Dict_Type& models, bool sst) 136 | : valid(false) { summarize(fn, models, sst); } 137 | 138 | void summarize(const std::string& fn, const Pore_Model_Dict_Type& models, bool sst) 139 | { 140 | valid = true; 141 | // initialize fields 142 | file_name = fn; 143 | auto pos = file_name.find_last_of('/'); 144 | base_file_name = (pos != std::string::npos? 
file_name.substr(pos + 1) : file_name); 145 | if (base_file_name.substr(base_file_name.size() - 6) == ".fast5") 146 | { 147 | base_file_name.resize(base_file_name.size() - 6); 148 | } 149 | read_id = base_file_name; 150 | strand_bounds = {{ 0, 0, 0, 0 }}; 151 | time_length = {{ 0.0, 0.0 }}; 152 | num_ed_events = 0; 153 | abasic_level = 0.0; 154 | fast5::File f; 155 | do 156 | { 157 | try 158 | { 159 | // open file 160 | f.open(file_name); // can throw 161 | // get sampling rate 162 | if (not f.have_sampling_rate()) 163 | { 164 | LOG("Fast5_Summary", info) << file_name << ": missing sampling rate" << std::endl; 165 | break; 166 | } 167 | sampling_rate = f.get_sampling_rate(); // can throw 168 | if (sampling_rate < 1000.0 or sampling_rate > 10000.0) 169 | { 170 | LOG("Fast5_Summary", warning) << file_name << ": unexpected sampling rate: " << sampling_rate << std::endl; 171 | break; 172 | } 173 | // get ed event params and ed events 174 | if (not f.have_eventdetection_events(eventdetection_group())) 175 | { 176 | LOG("Fast5_Summary", info) << file_name << ": missing eventdetection events" << std::endl; 177 | break; 178 | } 179 | auto ed_params = f.get_eventdetection_event_params(eventdetection_group()); // can throw 180 | if (not ed_params.read_id.empty()) 181 | { 182 | read_id = ed_params.read_id; 183 | } 184 | load_ed_events(&f); // also sets num_ed_events 185 | if (num_ed_events < trim_margins()[0] + trim_margins()[1] + min_ed_events()) 186 | { 187 | LOG("Fast5_Summary", info) 188 | << file_name << ": not enough eventdetection events: " << num_ed_events << std::endl; 189 | num_ed_events = 0; 190 | break; 191 | } 192 | // get abasic level 193 | abasic_level = detect_abasic_level(); 194 | if (abasic_level <= 1.0) 195 | { 196 | LOG("Fast5_Summary", info) 197 | << file_name << ": abasic level too low: " << abasic_level << std::endl; 198 | num_ed_events = 0; 199 | break; 200 | } 201 | // detect strands 202 | strand_bounds = {{ trim_margins()[0], num_ed_events - 
trim_margins()[1], 0, 0 }}; 203 | if (not template_only()) detect_strands(); 204 | if (strand_bounds[1] <= strand_bounds[0]) 205 | { 206 | LOG("Fast5_Summary", info) << file_name << ": no template strand detected" << std::endl; 207 | num_ed_events = 0; 208 | break; 209 | } 210 | scale_strands_together = (sst 211 | and strand_bounds[1] - strand_bounds[0] >= min_ed_events() 212 | and strand_bounds[3] - strand_bounds[2] >= min_ed_events()); 213 | // compute time lengths 214 | load_events(&f); 215 | for (unsigned st = 0; st < 2; ++st) 216 | { 217 | if (events(st).size() < min_ed_events()) continue; 218 | time_length[st] = events(st).rbegin()->start + events(st).rbegin()->length; 219 | } 220 | // 221 | // compute initial model scalings 222 | // 223 | if (scale_strands_together) 224 | { 225 | auto r0 = alg::mean_stdv_of< Float_Type >( 226 | events(0), 227 | [] (const Event_Type& ev) { return ev.mean; }); 228 | auto r1 = alg::mean_stdv_of< Float_Type >( 229 | events(1), 230 | [] (const Event_Type& ev) { return ev.mean; }); 231 | for (const auto& p0 : models) 232 | if (p0.second.strand() == 0 or p0.second.strand() == 2) 233 | for (const auto& p1 : models) 234 | if (p1.second.strand() == 1 or p1.second.strand() == 2) 235 | { 236 | std::array< std::string, 2 > m_name = {{ p0.first, p1.first }}; 237 | Pore_Model_Parameters_Type pm_params; 238 | pm_params.scale = (r0.second / p0.second.stdv() 239 | + r1.second / p1.second.stdv()) / 2; 240 | pm_params.shift = (r0.first - pm_params.scale * p0.second.mean() 241 | + r1.first - pm_params.scale * p1.second.mean()) / 2; 242 | LOG("Fast5_Summary", debug) 243 | << "initial_scaling read [" << read_id 244 | << "] strand [2] model [" << m_name[0] << "+" << m_name[1] 245 | << "] pm_params [" << pm_params << "]" << std::endl; 246 | pm_params_m[m_name] = std::move(pm_params); 247 | st_params_m[m_name][0] = State_Transition_Parameters_Type(); 248 | st_params_m[m_name][1] = State_Transition_Parameters_Type(); 249 | } 250 | } 251 | else // not 
scale_strands_together 252 | { 253 | for (unsigned st = 0; st < 2; ++st) 254 | { 255 | if (events(st).size() < min_ed_events()) continue; 256 | auto r = alg::mean_stdv_of< Float_Type >( 257 | events(st), 258 | [] (const Event_Type& ev) { return ev.mean; }); 259 | for (const auto& p : models) 260 | { 261 | if (p.second.strand() == st or p.second.strand() == 2) 262 | { 263 | std::array< std::string, 2 > m_name; 264 | m_name[st] = p.first; 265 | Pore_Model_Parameters_Type pm_params; 266 | pm_params.scale = r.second / p.second.stdv(); 267 | pm_params.shift = r.first - pm_params.scale * p.second.mean(); 268 | LOG("Fast5_Summary", debug) 269 | << "initial_scaling read [" << read_id 270 | << "] strand [" << st 271 | << "] model [" << m_name[st] 272 | << "] pm_params [" << pm_params << "]" << std::endl; 273 | pm_params_m[m_name] = std::move(pm_params); 274 | st_params_m[m_name][st] = State_Transition_Parameters_Type(); 275 | } 276 | } 277 | } 278 | } 279 | // detect basecall group to write 280 | auto bc_grp_l = f.get_basecall_group_list(); 281 | static const std::string bc_grp_prefix("Nanocall_"); 282 | std::set< std::string > used_tags; 283 | for (const auto& bc_grp : bc_grp_l) 284 | { 285 | if (bc_grp.size() <= bc_grp_prefix.size()) continue; 286 | auto p = std::mismatch(bc_grp_prefix.begin(), 287 | bc_grp_prefix.end(), 288 | bc_grp.begin()); 289 | if (p.first != bc_grp_prefix.end()) continue; 290 | std::string tag(p.second, bc_grp.end()); 291 | std::clog << "found basecall group: " << tag << std::endl; 292 | used_tags.emplace(std::move(tag)); 293 | } 294 | for (unsigned i = 0; i < 1000; ++i) 295 | { 296 | std::ostringstream tmp; 297 | tmp << std::setw(3) << std::setfill('0') << i; 298 | if (not used_tags.count(tmp.str())) 299 | { 300 | bc_grp = bc_grp_prefix + tmp.str(); 301 | break; 302 | } 303 | } 304 | if (bc_grp.empty()) 305 | { 306 | LOG(error) 307 | << "no available basecall tag" << std::endl; 308 | std::exit(EXIT_FAILURE); 309 | } 310 | } 311 | catch 
(hdf5_tools::Exception& e) 312 | { 313 | LOG(warning) << file_name << ": HDF5 error: " << e.what() << std::endl; 314 | num_ed_events = 0; 315 | } 316 | } while (false); 317 | drop_events(); 318 | ed_events_ptr.reset(); 319 | } // summarize 320 | 321 | void load_events(fast5::File* f_p = nullptr) 322 | { 323 | assert(valid); 324 | drop_events(); 325 | if (num_ed_events == 0) 326 | { 327 | return; 328 | } 329 | bool must_load_ed_events = not ed_events_ptr; 330 | if (must_load_ed_events) 331 | { 332 | #ifndef H5_HAVE_THREADSAFE 333 | static std::mutex fast5_mutex; 334 | std::lock_guard< std::mutex > fast5_lock(fast5_mutex); 335 | #endif 336 | bool must_open_file = not f_p; 337 | if (must_open_file) 338 | { 339 | f_p = new fast5::File(file_name); 340 | } 341 | assert(f_p->is_open()); 342 | load_ed_events(f_p); 343 | if (must_open_file) 344 | { 345 | delete f_p; 346 | } 347 | } 348 | for (unsigned st = 0; st < 2; ++st) 349 | { 350 | events_ptr[st] = typename decltype(events_ptr)::value_type(new typename decltype(events_ptr)::value_type::element_type ()); 351 | for (unsigned j = strand_bounds[2 * st]; j < strand_bounds[2 * st + 1]; ++j) 352 | { 353 | if (filter_ed_event(ed_events()[j], abasic_level)) 354 | { 355 | Event_Type e; 356 | e.mean = ed_events()[j].mean; 357 | e.corrected_mean = e.mean; 358 | e.stdv = ed_events()[j].stdv; 359 | e.start = (ed_events()[j].start - ed_events()[strand_bounds[scale_strands_together? 
0 : 2 * st]].start) / sampling_rate; 360 | e.length = ed_events()[j].length / sampling_rate; 361 | e.update_logs(); 362 | events(st).emplace_back(std::move(e)); 363 | } 364 | } 365 | } 366 | if (must_load_ed_events) 367 | { 368 | ed_events_ptr.reset(); 369 | } 370 | } 371 | void drop_events() 372 | { 373 | for (unsigned st = 0; st < 2; ++st) 374 | { 375 | events_ptr[st].reset(); 376 | } 377 | } 378 | 379 | void add_basecall_seq(const std::string& name, unsigned st, const std::string& seq, int default_qual = 33) const 380 | { 381 | try 382 | { 383 | // open file 384 | fast5::File f(file_name, true); // can throw 385 | // write seq 386 | f.add_basecall_seq(st, bc_grp, name, seq, default_qual); 387 | } 388 | catch (hdf5_tools::Exception& e) 389 | { 390 | LOG(warning) << file_name << ": HDF5 error: " << e.what() << std::endl; 391 | } 392 | } 393 | 394 | void add_basecall_events(unsigned st, const Event_Sequence_Type& ev) const 395 | { 396 | try 397 | { 398 | // open file 399 | fast5::File f(file_name, true); // can throw 400 | // write seq 401 | f.add_basecall_events(st, bc_grp, ev); 402 | } 403 | catch (hdf5_tools::Exception& e) 404 | { 405 | LOG(warning) << file_name << ": HDF5 error: " << e.what() << std::endl; 406 | } 407 | } 408 | 409 | void add_basecall_model(unsigned st, const Pore_Model_Type& model) const 410 | { 411 | try 412 | { 413 | // open file 414 | fast5::File f(file_name, true); // can throw 415 | // write model params 416 | f.add_basecall_model(st, bc_grp, model.get_state_vector()); 417 | } 418 | catch (hdf5_tools::Exception& e) 419 | { 420 | LOG(warning) << file_name << ": HDF5 error: " << e.what() << std::endl; 421 | } 422 | } 423 | 424 | void add_basecall_model_params(unsigned st, const Pore_Model_Parameters_Type& params) const 425 | { 426 | try 427 | { 428 | // open file 429 | fast5::File f(file_name, true); // can throw 430 | // write model params 431 | f.add_basecall_model_params(st, bc_grp, params); 432 | } 433 | catch (hdf5_tools::Exception& e) 
434 | { 435 | LOG(warning) << file_name << ": HDF5 error: " << e.what() << std::endl; 436 | } 437 | } 438 | 439 | friend std::ostream& operator << (std::ostream& os, const Fast5_Summary& fs) 440 | { 441 | os << "[base_file_name=" << fs.base_file_name << " valid=" << fs.valid; 442 | if (fs.valid) 443 | { 444 | os << " num_ed_events=" << fs.num_ed_events; 445 | if (fs.num_ed_events > 0) 446 | { 447 | os << " read_id=" << fs.read_id 448 | << " abasic_level=" << fs.abasic_level 449 | << " strand_bounds=[" << fs.strand_bounds[0] << "," 450 | << fs.strand_bounds[1] << "," 451 | << fs.strand_bounds[2] << "," 452 | << fs.strand_bounds[3] 453 | << "] time_length=[" << fs.time_length[0] << "," << fs.time_length[1] << "]"; 454 | } 455 | } 456 | os << "]"; 457 | return os; 458 | } 459 | 460 | static void write_tsv_header(std::ostream& os) 461 | { 462 | os << "file_name" << "\tread_name" << "\tnum_ed_events" << "\tabasic_level" 463 | << "\ttemplate_start_idx" << "\ttemplate_end_idx" 464 | << "\tcomplement_start_idx" << "\tcomplement_end_idx"; 465 | for (unsigned st = 0; st < 2; ++st) 466 | { 467 | os << "\tn" << st << "_model_name" 468 | << "\tn" << st << "_scale" 469 | << "\tn" << st << "_shift" 470 | << "\tn" << st << "_drift" 471 | << "\tn" << st << "_var" 472 | << "\tn" << st << "_scale_sd" 473 | << "\tn" << st << "_var_sd" 474 | << "\tn" << st << "_p_stay" 475 | << "\tn" << st << "_p_skip"; 476 | } 477 | } 478 | 479 | void write_tsv(std::ostream& os) const 480 | { 481 | os << base_file_name << '\t' << read_id << '\t' << num_ed_events << '\t' << abasic_level 482 | << '\t' << strand_bounds[0] << '\t' << strand_bounds[1] 483 | << '\t' << strand_bounds[2] << '\t' << strand_bounds[3]; 484 | for (unsigned st = 0; st < 2; ++st) 485 | { 486 | os << '\t'; 487 | if (not preferred_model[st][st].empty()) 488 | { 489 | os << preferred_model[st][st] << '\t'; 490 | pm_params_m.at(preferred_model[st]).write_tsv(os); 491 | os << '\t'; 492 | 
st_params_m.at(preferred_model[st])[st].write_tsv(os); 493 | } 494 | else 495 | { 496 | os << ".\t"; 497 | Pore_Model_Parameters_Type().write_tsv(os); 498 | os << '\t'; 499 | State_Transition_Parameters_Type().write_tsv(os); 500 | } 501 | } 502 | } 503 | 504 | private: 505 | void load_ed_events(fast5::File* f_p) 506 | { 507 | ed_events_ptr = decltype(ed_events_ptr)( 508 | new typename decltype(ed_events_ptr)::element_type( 509 | f_p->get_eventdetection_events(eventdetection_group()))); 510 | if (num_ed_events == 0) 511 | { 512 | if (ed_events().size() > max_ed_events()) 513 | { 514 | LOG("Fast5_Summary", info) 515 | << file_name << ": using only " << max_ed_events() 516 | << " of " << ed_events().size() << " events" << std::endl; 517 | num_ed_events = max_ed_events(); 518 | } 519 | else 520 | { 521 | num_ed_events = ed_events().size(); 522 | } 523 | } 524 | ed_events().resize(num_ed_events); 525 | } 526 | 527 | // crude detection of abasic level 528 | Float_Type detect_abasic_level() 529 | { 530 | // 531 | // exclude top abasic_level_top_percent() levels 532 | // add abasic_level_top_offset() 533 | // 534 | std::vector< Float_Type > s; 535 | s.resize(ed_events().size()); 536 | unsigned i; 537 | for (i = 0; i < ed_events().size(); ++i) 538 | { 539 | s[i] = ed_events()[i].mean; 540 | } 541 | std::sort(s.begin(), s.end()); 542 | return s[(double)s.size() * (1.0 - abasic_level_top_percent() / 100.0)] + abasic_level_top_offset(); 543 | } // detect_abasic_level() 544 | 545 | std::vector< std::pair< unsigned, unsigned > > find_islands_5_consec() const 546 | { 547 | // 548 | // find islands of >= 5 consecutive events at high level 549 | // 550 | std::vector< std::pair< unsigned, unsigned > > islands; 551 | unsigned i = 0; 552 | while (i < ed_events().size()) 553 | { 554 | if (ed_events()[i].mean >= abasic_level) 555 | { 556 | unsigned j = i + 1; 557 | while (j < ed_events().size() and ed_events()[j].mean >= abasic_level) ++j; 558 | if (j - i >= 5) 559 | { 560 | 
islands.push_back(std::make_pair(i, j)); 561 | LOG("Fast5_Summary", debug) << "abasic_island [" << i << "," << j << "]" << std::endl; 562 | } 563 | i = j + 1; 564 | } 565 | else 566 | { 567 | ++i; 568 | } 569 | } 570 | return islands; 571 | } 572 | 573 | std::vector< std::pair< unsigned, unsigned > > find_islands_5_of_10_consec() const 574 | { 575 | // 576 | // find islands of >= 5/10 consecutive events at high level 577 | // 578 | std::vector< std::pair< unsigned, unsigned > > islands; 579 | unsigned i = 0; 580 | unsigned window_start = 0; 581 | unsigned window_count = 0; 582 | while (i < ed_events().size()) 583 | { 584 | if (ed_events()[i].mean >= abasic_level) 585 | { 586 | while (window_start + 10 <= i) 587 | { 588 | if (ed_events()[window_start].mean >= abasic_level) 589 | { 590 | --window_count; 591 | } 592 | ++window_start; 593 | } 594 | while (window_start < i and ed_events()[window_start].mean < abasic_level) 595 | { 596 | ++window_start; 597 | } 598 | assert(i < window_start + 10); 599 | ++window_count; 600 | if (window_count >= 5) 601 | { 602 | islands.push_back(std::make_pair(window_start, i)); 603 | LOG("Fast5_Summary", debug) << "abasic_island [" << window_start << "," << i << "]" << std::endl; 604 | window_start = i + 1; 605 | window_count = 0; 606 | } 607 | } 608 | ++i; 609 | } 610 | return islands; 611 | } 612 | 613 | // crude detection of hairpin islands 614 | // look for >= hairping_window_load/hairpin_window_size consecutive events at high level 615 | std::vector< std::pair< unsigned, unsigned > > find_hairpin_islands() const 616 | { 617 | std::vector< std::pair< unsigned, unsigned > > islands; 618 | unsigned i = 0; 619 | unsigned window_start = 0; 620 | unsigned window_count = 0; 621 | while (i < ed_events().size()) 622 | { 623 | if (ed_events()[i].mean >= abasic_level) 624 | { 625 | while (window_start + hairpin_island_window_size() <= i) 626 | { 627 | if (ed_events()[window_start].mean >= abasic_level) 628 | { 629 | --window_count; 630 | } 
631 | ++window_start; 632 | } 633 | while (window_start < i and ed_events()[window_start].mean < abasic_level) 634 | { 635 | ++window_start; 636 | } 637 | assert(i < window_start + hairpin_island_window_size()); 638 | ++window_count; 639 | if (window_count >= hairpin_island_window_load()) 640 | { 641 | islands.push_back(std::make_pair(window_start, i)); 642 | LOG("Fast5_Summary", debug) << "abasic_island [" << window_start << "," << i << "]" << std::endl; 643 | window_start = i + 1; 644 | window_count = 0; 645 | } 646 | } 647 | ++i; 648 | } 649 | return islands; 650 | } // find_hairpin_islands() 651 | 652 | // crude detection of strands in event sequence 653 | void detect_strands() 654 | { 655 | LOG("Fast5_Summary", debug) 656 | << "num_events=" << ed_events().size() 657 | << " abasic_level=" << abasic_level << std::endl; 658 | // 659 | // find islands of consecutive events at high level 660 | // 661 | auto islands = find_islands_5_consec(); //find_hairpin_islands(); 662 | // 663 | // merge islands within 50bp of each other 664 | // 665 | for (unsigned i = 1; i < islands.size(); ++i) 666 | { 667 | if (islands[i - 1].second + std::max(trim_margins()[2], trim_margins()[3]) >= islands[i].first) 668 | { 669 | LOG("Fast5_Summary", debug) << "merge_islands " 670 | << "[" << islands[i - 1].first << "," << islands[i - 1].second << "] with " 671 | << "[" << islands[i].first << "," << islands[i].second << "]" << std::endl; 672 | islands[i - 1].second = islands[i].second; 673 | islands.erase(islands.begin() + i); 674 | i = 0; 675 | } 676 | } 677 | LOG("Fast5_Summary", debug) 678 | << "final_islands: " << alg::os_join( 679 | islands, " ", 680 | [] (const std::pair< unsigned, unsigned >& p) { 681 | std::ostringstream tmp; 682 | tmp << "[" << p.first << "," << p.second << "]"; 683 | return tmp.str(); 684 | }) << std::endl; 685 | if (islands.empty()) 686 | { 687 | LOG("Fast5_Summary", info) 688 | << "template_only read_id=[" << read_id << "]" << std::endl; 689 | return; 690 | } 
691 | // 692 | // pick island closest to the middle of the event sequence 693 | // 694 | auto dist_to_middle = [&] (const std::pair< unsigned, unsigned >& p) { 695 | return std::min((unsigned)std::abs((long)p.first - (long)ed_events().size() / 2), 696 | (unsigned)std::abs((long)p.second - (long)ed_events().size() / 2)); 697 | }; 698 | auto it = alg::min_of(islands, dist_to_middle); 699 | // check island is in the middle third; if not, intepret it as template only 700 | if (dist_to_middle(*it) > ed_events().size() / 6) 701 | { 702 | LOG("Fast5_Summary", info) 703 | << "drop_read read_id=[" << read_id 704 | << "] islands=[" << alg::os_join( 705 | islands, " ", 706 | [] (const std::pair< unsigned, unsigned >& p) { 707 | std::ostringstream tmp; 708 | tmp << "[" << p.first << "," << p.second << "]"; 709 | return tmp.str(); 710 | }) 711 | << "]" << std::endl; 712 | return; 713 | } 714 | else 715 | { 716 | LOG("Fast5_Summary", debug) 717 | << "hairpin_island [" << it->first << "," << it->second << "]" << std::endl; 718 | strand_bounds[0] = trim_margins()[0]; 719 | if (islands[0].first < trim_margins()[0] + trim_margins()[2]) 720 | { 721 | strand_bounds[0] = std::max(strand_bounds[0], islands[0].second); 722 | } 723 | strand_bounds[1] = it->first - trim_margins()[2]; 724 | strand_bounds[2] = it->first + trim_margins()[3]; 725 | strand_bounds[3] = ed_events().size() - trim_margins()[1]; 726 | if (islands[islands.size() - 1].second > ed_events().size() - (trim_margins()[3] + trim_margins()[1])) 727 | { 728 | strand_bounds[3] = std::min(strand_bounds[3], islands[islands.size() - 1].first); 729 | } 730 | } 731 | } // detect_strands() 732 | 733 | // crude filtering of eventdetection events 734 | static bool filter_ed_event(const fast5::EventDetection_Event_Entry& e, Float_Type abasic_level) 735 | { 736 | if (e.mean >= abasic_level) 737 | { 738 | return false; 739 | } 740 | if (e.stdv > 4.0) 741 | { 742 | return false; 743 | } 744 | return true; 745 | } // filter_ed_event() 746 
| }; // struct Fast5_Summary 747 | 748 | #endif 749 | -------------------------------------------------------------------------------- /src/nanocall/Forward_Backward.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __FORWARD_BACKWARD_HPP 2 | #define __FORWARD_BACKWARD_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "Pore_Model.hpp" 10 | #include "State_Transitions.hpp" 11 | #include "logsumset.hpp" 12 | #include "logger.hpp" 13 | 14 | template < typename Float_Type, unsigned Kmer_Size = 6 > 15 | class Forward_Backward 16 | { 17 | public: 18 | typedef Kmer< Kmer_Size > Kmer_Type; 19 | typedef Pore_Model< Float_Type, Kmer_Size > Pore_Model_Type; 20 | typedef State_Transitions< Float_Type, Kmer_Size > State_Transitions_Type; 21 | typedef Event< Float_Type, Kmer_Size > Event_Type; 22 | typedef Event_Sequence< Float_Type, Kmer_Size > Event_Sequence_Type; 23 | typedef logsum::logsumset< Float_Type > LogSumSet_Type; 24 | 25 | struct Matrix_Entry 26 | { 27 | Float_Type alpha; // := Pr[ E_1 ... E_i, S_i = j ] 28 | Float_Type beta; // := Pr[ E_{i+1} ... 
E_n | S_i = j ] 29 | }; // struct Matrix_Entry 30 | 31 | static const unsigned n_states = Pore_Model_Type::n_states; 32 | 33 | void clear() { _m.clear(); } 34 | unsigned n_events() const { return _m.size() / n_states; } 35 | 36 | // i: event index 37 | // j: state/kmer index 38 | const Matrix_Entry& cell(unsigned i, unsigned j) const { return _m[i * n_states + j]; } 39 | Matrix_Entry& cell(unsigned i, unsigned j) { return _m[i * n_states + j]; } 40 | 41 | Float_Type log_posterior(unsigned i, unsigned j) const { return cell(i, j).alpha + cell(i, j).beta - _log_pr_data; } 42 | Float_Type log_pr_data() const { return _log_pr_data; } 43 | 44 | static unsigned& n_threads() { static unsigned _n_threads = 1; return _n_threads; } 45 | 46 | void fill(const Pore_Model_Type& pm, 47 | const State_Transitions_Type& st, 48 | const Event_Sequence_Type& ev) 49 | { 50 | clear(); 51 | unsigned n_events = ev.size(); 52 | _m.resize(n_states * n_events); 53 | Float_Type log_n_states = std::log(static_cast< Float_Type >(n_states)); 54 | LogSumSet_Type s(false); 55 | // 56 | // forward: alpha, i == 0 57 | // 58 | { 59 | unsigned i = 0; 60 | LOG("Forward_Backward", debug1) << "forward: i=" << i << std::endl; 61 | for (unsigned j = 0; j < n_states; ++j) 62 | { 63 | cell(i, j).alpha = pm.log_pr_corrected_emission(j, ev[0]) - log_n_states; 64 | LOG("Forward_Backward", debug2) 65 | << "i=" << i << " j=" << j << " kmer_j=" << Kmer_Type::to_string(j) 66 | << " alpha=" << cell(i, j).alpha << std::endl; 67 | } 68 | } 69 | // 70 | // forward: alpha, i > 0 71 | // 72 | for (unsigned i = 1; i < ev.size(); ++i) 73 | { 74 | LOG("Forward_Backward", debug1) << "forward: i=" << i << std::endl; 75 | for (unsigned j = 0; j < n_states; ++j) 76 | { 77 | s.clear(); 78 | for (const auto& p : st.neighbours(j).from_v) 79 | { 80 | const unsigned& j_prev = p.first; 81 | const Float_Type& log_pr_transition = p.second; 82 | s.add(log_pr_transition + cell(i - 1, j_prev).alpha); 83 | } 84 | cell(i, j).alpha = 
pm.log_pr_corrected_emission(j, ev[i]) + s.val(); 85 | LOG("Forward_Backward", debug2) 86 | << "i=" << i << " j=" << j << " kmer_j=" << Kmer_Type::to_string(j) 87 | << " alpha=" << cell(i, j).alpha << std::endl; 88 | } 89 | } 90 | // 91 | // backward: beta, i == n-1 92 | // 93 | { 94 | unsigned i = ev.size() - 1; 95 | LOG("Forward_Backward", debug1) << "backward: i=" << i << std::endl; 96 | for (unsigned j = 0; j < n_states; ++j) 97 | { 98 | cell(i, j).beta = 0; /* log(1): no events remain after the last one */ 99 | LOG("Forward_Backward", debug2) 100 | << "i=" << i << " j=" << j << " kmer_j=" << Kmer_Type::to_string(j) 101 | << " beta=" << cell(i, j).beta << std::endl; 102 | } 103 | } 104 | // 105 | // backward: beta, i < n-1 106 | // 107 | for (unsigned ip1 = ev.size() - 1; ip1 > 0; --ip1) 108 | { 109 | unsigned i = ip1 - 1; 110 | LOG("Forward_Backward", debug1) << "backward: i=" << i << std::endl; 111 | for (unsigned j = 0; j < n_states; ++j) 112 | { 113 | s.clear(); 114 | for (const auto& p : st.neighbours(j).to_v) 115 | { 116 | const unsigned& j_next = p.first; 117 | const Float_Type& log_pr_transition = p.second; 118 | s.add(log_pr_transition + pm.log_pr_corrected_emission(j_next, ev[ip1]) + cell(ip1, j_next).beta); 119 | } 120 | cell(i, j).beta = s.val(); /* assigned, not accumulated: beta_i(j) is exactly this log-sum and must not depend on what the cell held before */ 121 | LOG("Forward_Backward", debug2) 122 | << "i=" << i << " j=" << j << " kmer_j=" << Kmer_Type::to_string(j) 123 | << " beta=" << cell(i, j).beta << std::endl; 124 | } 125 | } 126 | // 127 | // pr_data 128 | // 129 | s.clear(); 130 | for (unsigned j = 0; j < n_states; ++j) 131 | { 132 | s.add(cell(ev.size() - 1, j).alpha); 133 | } 134 | _log_pr_data = s.val(); 135 | } 136 | /* Write one tab-separated row (i, j, alpha, beta) per cell, in event-major order. */ 137 | friend std::ostream& operator << (std::ostream& os, const Forward_Backward& fwbw) 138 | { 139 | for (unsigned i = 0; i < fwbw.n_events(); ++i) 140 | { 141 | for (unsigned j = 0; j < fwbw.n_states; ++j) 142 | { 143 | os << i << '\t' << j << '\t' 144 | << fwbw.cell(i, j).alpha << '\t' 145 | << fwbw.cell(i, j).beta << std::endl; 146 | } 147 | } 148 | return os; 149 | } 150 | 151 
| private: 152 | std::vector< Matrix_Entry > _m; /* n_events * n_states cells, event-major (see cell()) */ 153 | Float_Type _log_pr_data; /* written at the end of fill() */ 154 | }; // class Forward_Backward 155 | 156 | #endif 157 | -------------------------------------------------------------------------------- /src/nanocall/Forward_Backward_Custom.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __FORWARD_BACKWARD_CUSTOM_HPP 2 | #define __FORWARD_BACKWARD_CUSTOM_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "Pore_Model.hpp" 10 | #include "State_Transitions.hpp" 11 | #include "logsumset.hpp" 12 | #include "logger.hpp" 13 | /* Forward-backward variant whose forward pass renormalizes per event; each cell stores alpha, beta and gamma (see Matrix_Entry). */ 14 | template < typename Float_Type, unsigned Kmer_Size = 6 > 15 | class Forward_Backward_Custom 16 | { 17 | public: 18 | typedef Kmer< Kmer_Size > Kmer_Type; 19 | typedef Pore_Model< Float_Type, Kmer_Size > Pore_Model_Type; 20 | typedef State_Transitions< Float_Type, Kmer_Size > State_Transitions_Type; 21 | typedef Event< Float_Type, Kmer_Size > Event_Type; 22 | typedef Event_Sequence< Float_Type, Kmer_Size > Event_Sequence_Type; 23 | typedef logsum::logsumset< Float_Type > LogSumSet_Type; 24 | /* One matrix cell for a single (event, state) pair. */ 25 | struct Matrix_Entry 26 | { 27 | Float_Type alpha; // := Pr[ S_i = j | e_1 ... e_{i-1} ] 28 | Float_Type beta; // := Pr[ S_i = j | e_1 ... 
e_n ] 30 | }; // struct Matrix_Entry 31 | /* number of k-mer states per event, as defined by the pore model type */ 32 | static const unsigned n_states = Pore_Model_Type::n_states; 33 | /* cells are stored event-major: cell (i, j) lives at index i * n_states + j */ 34 | void clear() { _m.clear(); } 35 | unsigned n_events() const { return _m.size() / n_states; } 36 | 37 | // i: event index 38 | // j: state/kmer index 39 | const Matrix_Entry& cell(unsigned i, unsigned j) const { return _m[i * n_states + j]; } 40 | Matrix_Entry& cell(unsigned i, unsigned j) { return _m[i * n_states + j]; } 41 | /* gamma is returned as-is as the log posterior of state j at event i */ 42 | Float_Type log_posterior(unsigned i, unsigned j) const { return cell(i, j).gamma; } 43 | /* mutable process-wide setting; NOTE(review): not read anywhere in this class */ 44 | static unsigned& n_threads() { static unsigned _n_threads = 1; return _n_threads; } 45 | /* Recompute alpha, beta and gamma for every event of ev under pore model pm and transitions st. NOTE(review): ev[0] is read unconditionally, so ev is presumably expected to be non-empty -- confirm with callers. */ 46 | void fill(const Pore_Model_Type& pm, 47 | const State_Transitions_Type& st, 48 | const Event_Sequence_Type& ev) 49 | { 50 | clear(); 51 | unsigned n_events = ev.size(); 52 | _m.resize(n_states * n_events); 53 | Float_Type log_n_states = std::log(static_cast< Float_Type >(n_states)); 54 | LogSumSet_Type s1(false); 55 | LogSumSet_Type s2(false); 56 | // 57 | // forward: alpha, beta; i == 0 58 | // 59 | { 60 | for (unsigned j = 0; j < n_states; ++j) 61 | { 62 | // alpha 63 | cell(0, j).alpha = - log_n_states; /* log(1 / n_states): uniform start */ 64 | // beta 65 | cell(0, j).beta = pm.log_pr_emission(j, ev[0]) + cell(0, j).alpha; 66 | s1.add(cell(0, j).beta); 67 | } 68 | Float_Type denom = s1.val(); 69 | LOG("Forward_Backward_Custom", debug1) << "i=0 beta_denom=" << denom << std::endl; 70 | for (unsigned j = 0; j < n_states; ++j) 71 | { 72 | cell(0, j).beta -= denom; /* subtract the log-sum over all states of event 0 */ 73 | LOG("Forward_Backward_Custom", debug2) 74 | << "i=0 j=" << Kmer_Type::to_string(j) 75 | << " alpha=" << cell(0, j).alpha 76 | << " beta=" << cell(0, j).beta << std::endl; 77 | } 78 | } 79 | // 80 | // forward: alpha, beta; i > 0 81 | // 82 | for (unsigned i = 1; i < ev.size(); ++i) 83 | { 84 | LOG("Forward_Backward_Custom", debug1) << "forward: i=" << i << std::endl; 85 | s1.clear(); 86 | for (unsigned j = 0; j < n_states; ++j) // TODO: parallelize 87 | { 88 | // alpha 89 | s2.clear(); 90 | for (const auto& p : 
st.neighbours(j).from_v) 91 | { 92 | const unsigned& j_prev = p.first; 93 | const Float_Type& log_pr_transition = p.second; 94 | s2.add(log_pr_transition + cell(i - 1, j_prev).beta); 95 | } 96 | cell(i, j).alpha = s2.val(); /* log-sum over predecessors of (transition + previous event's beta) */ 97 | // beta 98 | cell(i, j).beta = pm.log_pr_emission(j, ev[i]) + cell(i, j).alpha; 99 | s1.add(cell(i, j).beta); 100 | } 101 | Float_Type denom = s1.val(); 102 | LOG("Forward_Backward_Custom", debug1) << "i=" << i << " beta_denom=" << denom << std::endl; 103 | for (unsigned j = 0; j < n_states; ++j) 104 | { 105 | cell(i, j).beta -= denom; /* subtract the log-sum over all states of event i */ 106 | LOG("Forward_Backward_Custom", debug2) 107 | << "i=" << i << " j=" << Kmer_Type::to_string(j) 108 | << " alpha=" << cell(i, j).alpha 109 | << " beta=" << cell(i, j).beta << std::endl; 110 | } 111 | } 112 | // 113 | // backward, gamma; i == n-1 114 | // 115 | for (unsigned j = 0; j < n_states; ++j) 116 | { 117 | cell(n_events - 1, j).gamma = cell(n_events - 1, j).beta; 118 | } 119 | // 120 | // backward, gamma; i < n-1 121 | // 122 | for (unsigned ip1 = ev.size() - 1; ip1 > 0; --ip1) 123 | { 124 | unsigned i = ip1 - 1; 125 | LOG("Forward_Backward_Custom", debug1) << "backward: i=" << i << std::endl; 126 | for (unsigned j = 0; j < n_states; ++j) // TODO: parallelize 127 | { 128 | cell(i, j).gamma = cell(i, j).beta; 129 | s2.clear(); 130 | for (const auto& p : st.neighbours(j).to_v) 131 | { 132 | const unsigned& j_next = p.first; 133 | const Float_Type& log_pr_transition = p.second; 134 | s2.add(log_pr_transition + cell(ip1, j_next).gamma - cell(ip1, j_next).alpha); 135 | } 136 | cell(i, j).gamma += s2.val(); /* gamma_i(j) = beta_i(j) + log-sum over successors of (transition + gamma - alpha at event i+1) */ 137 | LOG("Forward_Backward_Custom", debug2) 138 | << "i=" << i << " j=" << Kmer_Type::to_string(j) 139 | << " gamma=" << cell(i, j).gamma << std::endl; 140 | } 141 | } 142 | } 143 | /* Write one tab-separated row (i, j, alpha, beta, gamma) per cell, in event-major order. */ 144 | friend std::ostream& operator << (std::ostream& os, const Forward_Backward_Custom& fwbw) 145 | { 146 | for (unsigned i = 0; i < fwbw.n_events(); ++i) 147 | { 148 | for (unsigned j = 0; j < fwbw.n_states; ++j) 149 | 
{ 150 | os << i << '\t' << j << '\t' 151 | << fwbw.cell(i, j).alpha << '\t' 152 | << fwbw.cell(i, j).beta << '\t' 153 | << fwbw.cell(i, j).gamma << std::endl; 154 | } 155 | } 156 | return os; 157 | } 158 | 159 | private: 160 | std::vector< Matrix_Entry > _m; 161 | }; // class Forward_Backward_Custom 162 | 163 | #endif 164 | -------------------------------------------------------------------------------- /src/nanocall/Kmer.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __KMER_HPP 2 | #define __KMER_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | /* Static helpers for k-mers packed two bits per base (A=0, C=1, G=2, T=3), with the first base in the highest bits. */ 8 | template < unsigned Kmer_Size > 9 | class Kmer 10 | { 11 | public: 12 | static const unsigned n_states = (1u << (2 * Kmer_Size)); 13 | static size_t to_int(const std::string& s) /* pack the bases of s into an integer, two bits per character */ 14 | { 15 | static std::array< int8_t, 256 > base_to_int; 16 | static bool table_initialized = false; 17 | if (not table_initialized) 18 | { 19 | for (unsigned i = 0; i < 256; ++i) 20 | { 21 | base_to_int[i] = -1; 22 | } 23 | base_to_int['A'] = 0; 24 | base_to_int['C'] = 1; 25 | base_to_int['G'] = 2; 26 | base_to_int['T'] = 3; 27 | table_initialized = true; 28 | } 29 | size_t res = 0; 30 | for (size_t i = 0; i < s.size(); ++i) 31 | { 32 | res <<= 2; 33 | res += base_to_int[static_cast< unsigned char >(s[i])]; /* index by byte value 0..255: where char is signed, a negative char cast straight to unsigned would be far outside the 256-entry table. NOTE(review): characters other than A/C/G/T map to -1, which is added to res as-is. */ 34 | } 35 | return res; 36 | } 37 | static size_t to_int(const std::array< char, Kmer_Size >& a) 38 | { 39 | return to_int(std::string(a.begin(), a.end())); 40 | } 41 | static std::string to_string(size_t k) /* inverse of to_int for a Kmer_Size-long k-mer */ 42 | { 43 | static const std::string int_to_base("ACGT"); 44 | std::string res; 45 | for (size_t j = 0; j < Kmer_Size; ++j) 46 | { 47 | res += int_to_base[(k >> (2 * (Kmer_Size - j - 1))) & 0x3]; 48 | } 49 | return res; 50 | } 51 | static unsigned min_skip(unsigned k1, unsigned k2) /* 0 if equal; otherwise Kmer_Size minus the longest k < Kmer_Size for which the last k bases of k1 equal the first k bases of k2 */ 52 | { 53 | if (k1 == k2) 54 | { 55 | return 0; 56 | } 57 | else 58 | { 59 | for (unsigned k = Kmer_Size - 1; k > 0; --k) 60 | { 61 | if ((k1 & ((1u << (2 * k)) - 1)) == (k2 >> (2 * (Kmer_Size - k)))) 62 | { 63 
| return Kmer_Size - k; 64 | } 65 | } 66 | return Kmer_Size; 67 | } 68 | } 69 | static unsigned prefix(unsigned i, unsigned k) 70 | { 71 | return i >> (2 * (Kmer_Size - k)); 72 | } 73 | static unsigned suffix(unsigned i, unsigned k) 74 | { 75 | return i & ((1u << (2 * k)) - 1); 76 | } 77 | 78 | /* 79 | * Precompute, for every kmer i, the maximum k such that suffix(i, k) == prefix(i, k) 80 | */ 81 | static unsigned max_self_overlap(unsigned i) 82 | { 83 | assert(i < n_states); 84 | std::array< unsigned, n_states > _max_self_overlap; 85 | bool _inited = false; 86 | if (not _inited) 87 | { 88 | static std::mutex _mutex; 89 | { 90 | std::lock_guard< std::mutex > _lock(_mutex); 91 | if (not _inited) // recheck 92 | { 93 | for (unsigned i = 0; i < n_states; ++i) 94 | { 95 | _max_self_overlap[i] = 0; 96 | for (unsigned k = Kmer_Size - 1; k >= 1; --k) 97 | { 98 | if (suffix(i, k) == prefix(i, k)) 99 | { 100 | _max_self_overlap[i] = k; 101 | break; 102 | } 103 | } 104 | } 105 | _inited = true; 106 | } 107 | } 108 | } 109 | return _max_self_overlap[i]; 110 | } 111 | 112 | /* 113 | * Precompute neighbours at distance 1 and 2. 
114 | */ 115 | static const std::vector< unsigned >& neighbour_list(unsigned i, unsigned d) 116 | { 117 | assert(i < n_states); 118 | assert(d == 1 or d == 2); 119 | static std::array< std::array< std::vector< unsigned >, 2 >, n_states > _neighbour_list; /* one entry per k-mer: sized by n_states so the table fits any Kmer_Size (n_states is 4096 for the default Kmer_Size of 6) */ 120 | static bool _inited = false; 121 | if (not _inited) 122 | { 123 | static std::mutex _mutex; 124 | { 125 | std::lock_guard< std::mutex > _lock(_mutex); 126 | if (not _inited) // recheck 127 | { 128 | for (unsigned i = 0; i < n_states; ++i) 129 | { 130 | _neighbour_list[i][0].clear(); 131 | _neighbour_list[i][1].clear(); 132 | for (unsigned b1 = 0; b1 < 4; ++b1) 133 | { 134 | unsigned i1 = (suffix(i, Kmer_Size - 1) << 2) + b1; /* drop the first base of i, append base b1 */ 135 | _neighbour_list[i][0].push_back(i1); 136 | for (unsigned b2 = 0; b2 < 4; ++b2) 137 | { 138 | unsigned i2 = (suffix(i1, Kmer_Size - 1) << 2) + b2; /* same shift applied once more: two bases appended */ 139 | _neighbour_list[i][1].push_back(i2); 140 | } 141 | } 142 | } 143 | _inited = true; 144 | } 145 | } 146 | } 147 | return _neighbour_list[i][d - 1]; /* slot 0 holds the 4 one-step successors, slot 1 the 16 two-step successors */ 148 | } 149 | }; // class Kmer 150 | 151 | #endif 152 | -------------------------------------------------------------------------------- /src/nanocall/Parameter_Trainer.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __PARAMETER_TRAINER 2 | #define __PARAMETER_TRAINER 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "global_assert.hpp" 9 | #include "Pore_Model.hpp" 10 | #include "State_Transitions.hpp" 11 | #include "Forward_Backward.hpp" 12 | #include "logsumset.hpp" 13 | #include "logger.hpp" 14 | 15 | template < typename Float_Type, unsigned Kmer_Size = 6 > 16 | struct Parameter_Trainer 17 | { 18 | typedef Kmer< Kmer_Size > Kmer_Type; 19 | typedef Pore_Model< Float_Type, Kmer_Size > Pore_Model_Type; 20 | typedef Pore_Model_Parameters< Float_Type > Pore_Model_Parameters_Type; 21 | typedef State_Transitions< Float_Type, Kmer_Size > State_Transitions_Type; 22 | typedef State_Transition_Parameters< Float_Type > 
State_Transition_Parameters_Type; 23 | typedef Event< Float_Type, Kmer_Size > Event_Type; 24 | typedef Event_Sequence< Float_Type, Kmer_Size > Event_Sequence_Type; 25 | typedef Forward_Backward< Float_Type, Kmer_Size > Forward_Backward_Type; 26 | typedef logsum::logsumset< Float_Type > LogSumSet_Type; 27 | 28 | static const unsigned n_states = Pore_Model_Type::n_states; 29 | /* Rebuild the list returned by st_train_kmers(): the k-mers that train_st_params() iterates over. */ 30 | static void init() 31 | { 32 | // pick states i s.t. i has self-overlap 0, 33 | // and all its 1-step neighbours have self-overlap <=1 34 | st_train_kmers().clear(); 35 | for (unsigned i = 0; i < n_states; ++i) 36 | { 37 | if (Kmer_Type::max_self_overlap(i) > 0) 38 | { 39 | continue; 40 | } 41 | bool all_good = true; 42 | for (unsigned b1 = 0; b1 < 4; ++b1) 43 | { 44 | unsigned j = (Kmer_Type::suffix(i, Kmer_Size - 1) << 2) + b1; /* 1-step neighbour: drop the first base of i, append base b1 */ 45 | if (Kmer_Type::max_self_overlap(j) > 1) 46 | { 47 | all_good = false; 48 | break; 49 | } 50 | } 51 | if (all_good) 52 | { 53 | st_train_kmers().push_back(i); 54 | } 55 | } 56 | LOG(info) << "using [" << st_train_kmers().size() << "] states for state trainsition training" << std::endl; 57 | } 58 | /* Process-wide list of k-mer indices; cleared and refilled by init(). */ 59 | static std::vector< unsigned >& st_train_kmers() 60 | { 61 | static std::vector< unsigned > _st_train_kmers; 62 | return _st_train_kmers; 63 | } 64 | /* Process-wide setting, initially 1; train_pm_params() fits the drift term only when it is non-zero. */ 65 | static unsigned& pm_train_drift() 66 | { 67 | static unsigned _pm_train_drift = 1; 68 | return _pm_train_drift; 69 | } 70 | 71 | /** 72 | * Struct used for training rounds. 
73 | * @event_seq_ptr_v Vector of pairs, first: an event sequence, second: strand from which it comes 74 | * @model_ptr_v Pointers to unscaled pore models (per strand) 75 | * @default_transitions_ptr Default state transitions 76 | * @pm_params_ptr Pore model scaling parameters (common to both strands) 77 | * @st_params_ptr_v State transition parameters (per strand). The members listed under "output" are written by fill_train_data(). 78 | */ 79 | struct Train_Data 80 | { 81 | // input 82 | std::vector< std::pair< const Event_Sequence_Type*, unsigned > > event_seq_ptr_v; 83 | std::array< const Pore_Model_Type*, 2 > model_ptr_v; 84 | const State_Transitions_Type* default_transitions_ptr; 85 | const Pore_Model_Parameters_Type* pm_params_ptr; 86 | std::array< const State_Transition_Parameters_Type*, 2 > st_params_ptr_v; 87 | // output 88 | std::array< Pore_Model_Type, 2 > scaled_model_v; /* per strand: copy of the input model after scale(*pm_params_ptr) */ 89 | std::array< State_Transitions_Type, 2 > custom_transitions_v; /* per strand: computed only when that strand's st params are not the default */ 90 | std::array< const State_Transitions_Type*, 2 > transitions_ptr_v; /* per strand: points at custom_transitions_v[st] or at the default transitions */ 91 | std::vector< Event_Sequence_Type > corrected_event_seq_v; /* one drift-corrected copy per input sequence, same order */ 92 | std::vector< Forward_Backward_Type > fwbw_v; /* one filled forward-backward matrix per input sequence, same order */ 93 | Float_Type fit; /* sum of log_pr_data() over all sequences */ 94 | }; 95 | 96 | /** 97 | * Fill training data for one training round. 
98 | */ 99 | static void fill_train_data(Train_Data& data) 100 | { 101 | // compute scaled pore models 102 | data.scaled_model_v[0].clear(); 103 | data.scaled_model_v[1].clear(); 104 | std::array< bool, 2 > init_scaled_models = {{ false, false }}; 105 | for (const auto& p : data.event_seq_ptr_v) 106 | { 107 | ASSERT(p.second < 2); 108 | if (init_scaled_models[p.second]) continue; /* each strand's model is scaled once, the first time that strand is seen */ 109 | ASSERT(data.model_ptr_v[p.second]); 110 | ASSERT(data.pm_params_ptr); 111 | data.scaled_model_v[p.second] = *data.model_ptr_v[p.second]; 112 | data.scaled_model_v[p.second].scale(*data.pm_params_ptr); 113 | init_scaled_models[p.second] = true; 114 | } 115 | // compute custom state transitions 116 | data.custom_transitions_v[0].clear(); 117 | data.custom_transitions_v[1].clear(); 118 | std::array< bool, 2 > init_transitions = {{ false, false }}; 119 | for (const auto& p : data.event_seq_ptr_v) 120 | { 121 | if (init_transitions[p.second]) continue; /* likewise once per strand */ 122 | ASSERT(data.st_params_ptr_v[p.second]); 123 | if (not data.st_params_ptr_v[p.second]->is_default()) 124 | { 125 | data.custom_transitions_v[p.second].compute_transitions_fast(*data.st_params_ptr_v[p.second]); 126 | data.transitions_ptr_v[p.second] = &data.custom_transitions_v[p.second]; 127 | } 128 | else 129 | { 130 | data.transitions_ptr_v[p.second] = data.default_transitions_ptr; /* default params: reuse the shared default transitions instead of recomputing */ 131 | } 132 | init_transitions[p.second] = true; 133 | } 134 | // compute drift-corrected event sequences 135 | unsigned n_event_seqs = data.event_seq_ptr_v.size(); 136 | data.corrected_event_seq_v.clear(); 137 | data.corrected_event_seq_v.reserve(n_event_seqs); 138 | data.fwbw_v.clear(); 139 | data.fwbw_v.reserve(n_event_seqs); 140 | data.fit = 0.0; /* accumulates log_pr_data() of every sequence in the loop below */ 141 | for (unsigned k = 0; k < n_event_seqs; ++k) 142 | { 143 | unsigned st = data.event_seq_ptr_v[k].second; 144 | ASSERT(init_scaled_models[st]); 145 | ASSERT(init_transitions[st]); 146 | // first, copy events 147 | data.corrected_event_seq_v.emplace_back(*data.event_seq_ptr_v[k].first); 148 | // then, apply 
drift correction 149 | data.corrected_event_seq_v.back().apply_drift_correction(data.pm_params_ptr->drift); 150 | // finally, run fwbw 151 | data.fwbw_v.emplace_back(); 152 | data.fwbw_v.back().fill( 153 | data.scaled_model_v[st], *data.transitions_ptr_v[st], data.corrected_event_seq_v.back()); 154 | data.fit += data.fwbw_v.back().log_pr_data(); 155 | } 156 | #ifdef DUMP_TRAINING_DATA 157 | for (unsigned k = 0; k < n_event_seqs; ++k) 158 | { /* debug-only dump: for sequence k, write emissions.k.tab, transitions.k.tab, fw.k.tab and bw.k.tab */ 159 | unsigned st = data.event_seq_ptr_v[k].second; 160 | unsigned n_events = data.event_seq_ptr_v[k].first->size(); 161 | std::ostringstream k_sstr; 162 | k_sstr << k; 163 | std::ofstream ofs; 164 | ofs.open(std::string("emissions.") + k_sstr.str() + ".tab"); 165 | for (unsigned i = 0; i < n_events; ++i) 166 | { 167 | for (unsigned j = 0; j < n_states; ++j) 168 | { 169 | if (j > 0) ofs << '\t'; 170 | ofs << data.scaled_model_v[st].log_pr_corrected_emission(j, data.corrected_event_seq_v[k][i]); 171 | } 172 | ofs << std::endl; 173 | } 174 | ofs.close(); 175 | ofs.open(std::string("transitions.") + k_sstr.str() + ".tab"); 176 | for (unsigned j1 = 0; j1 < n_states; ++j1) 177 | { 178 | std::map< unsigned, Float_Type > neighbour_m; 179 | for (const auto& p : data.transitions_ptr_v[st]->neighbours(j1).to_v) 180 | { 181 | neighbour_m[p.first] = p.second; 182 | } 183 | for (unsigned j2 = 0; j2 < n_states; ++j2) 184 | { 185 | if (j2 > 0) ofs << '\t'; 186 | if (neighbour_m.count(j2)) 187 | { 188 | ofs << neighbour_m.at(j2); 189 | } 190 | else 191 | { 192 | ofs << -1000.0; /* placeholder written for state pairs that have no listed transition */ 193 | } 194 | } 195 | ofs << std::endl; 196 | } 197 | ofs.close(); 198 | ofs.open(std::string("fw.") + k_sstr.str() + ".tab"); 199 | for (unsigned i = 0; i < n_events; ++i) 200 | { 201 | for (unsigned j = 0; j < n_states; ++j) 202 | { 203 | if (j > 0) ofs << '\t'; 204 | ofs << data.fwbw_v[k].cell(i, j).alpha; 205 | } 206 | ofs << std::endl; 207 | } 208 | ofs.close(); 209 | ofs.open(std::string("bw.") + k_sstr.str() + ".tab"); 210 | for (unsigned i = 0; i < n_events; 
++i) 211 | { 212 | for (unsigned j = 0; j < n_states; ++j) 213 | { 214 | if (j > 0) ofs << '\t'; 215 | ofs << data.fwbw_v[k].cell(i, j).beta; 216 | } 217 | ofs << std::endl; 218 | } 219 | } 220 | abort(); /* a DUMP_TRAINING_DATA build terminates here, after the dump loop */ 221 | #endif 222 | } 223 | 224 | /** 225 | * Train pm_params on training data. 226 | * @data Training data, as filled by fill_train_data. 227 | * @new_pm_params Destination for new params. 228 | * @done Bool; if true, training failed, and no rounds are possible because of a singularity. In that case new_pm_params is set to a copy of *data.pm_params_ptr. 229 | */ 230 | static void train_pm_params(const Train_Data& data, Pore_Model_Parameters_Type& new_pm_params, bool& done) 231 | { 232 | done = false; 233 | unsigned n_event_seqs = data.event_seq_ptr_v.size(); 234 | unsigned total_n_events = 0; 235 | ASSERT(data.pm_params_ptr); 236 | // 237 | // compute the scaling matrices in normal space (not logspace!) 238 | // against unscaled pm & uncorrected events 239 | // 240 | auto& a_hat = new_pm_params.shift; /* the *_hat names alias fields of new_pm_params: assigning them writes the result */ 241 | auto& b_hat = new_pm_params.scale; 242 | auto& c_hat = new_pm_params.drift; 243 | auto& d_hat = new_pm_params.var; 244 | auto& v_hat = new_pm_params.scale_sd; 245 | auto& u_hat = new_pm_params.var_sd; 246 | std::array< std::array< double, 3 >, 3 > A = {{ {{ 0.0, 0.0, 0.0 }}, 247 | {{ 0.0, 0.0, 0.0 }}, 248 | {{ 0.0, 0.0, 0.0 }} }}; 249 | std::array< double, 3 > B = {{ 0.0, 0.0, 0.0 }}; 250 | double D = 0.0; // = \sum_i x^2_i s_{i,0} (used for var) 251 | double V_numer = 0.0; // = \sum_i y_i \sum_j p_{i,j} \lambda_j / \eta^2_j (for scale_sd) 252 | double V_denom = 0.0; // = \sum_i \sum_j p_{i,j} \lambda_j / \eta_j (for scale_sd) 253 | double U_pos = 0.0; // = \sum_i (1/y_i) \sum_j p_{i,j} \lambda_j (for var_sd) 254 | for (unsigned k = 0; k < n_event_seqs; ++k) 255 | { 256 | unsigned st = data.event_seq_ptr_v.at(k).second; 257 | ASSERT(st < 2); 258 | const Event_Sequence_Type& events = *data.event_seq_ptr_v[k].first; 259 | unsigned n_events = events.size(); 260 | total_n_events += n_events; 261 | const Pore_Model_Type& pm = 
*data.model_ptr_v[st]; 262 | const Forward_Backward_Type& fwbw = data.fwbw_v.at(k); 263 | for (unsigned i = 0; i < n_events; ++i) 264 | { 265 | Float_Type x_i = events[i].mean; 266 | Float_Type y_i = events[i].stdv; 267 | Float_Type t_i = events[i].start; 268 | LOG(debug1) 269 | << "outter_loop k=" << k << " i=" << i 270 | << " x_i=" << x_i 271 | << " t_i=" << t_i << std::endl; 272 | // s[m] = \sum_j p_{i,j} (\mu_j)^m / \sigma^2_j, for m = 0, 1, 2 273 | std::array< float, 3 > s = {{ 0.0, 0.0, 0.0 }}; /* NOTE(review): s and l accumulate in float while A, B and D are double -- confirm this precision is intended */ 274 | // l[m] = \sum_j p_{i,j} \lambda_j / (\eta_j)^m, for m = 0, 1, 2 275 | std::array< float, 3 > l = {{ 0.0, 0.0, 0.0 }}; 276 | for (unsigned j = 0; j < Pore_Model_Type::n_states; ++j) 277 | { 278 | Float_Type p_ij = std::exp(fwbw.log_posterior(i, j)); /* posterior of state j at event i, back in normal space */ 279 | Float_Type term_s0 = p_ij / (pm.state(j).level_stdv * pm.state(j).level_stdv); 280 | Float_Type term_s1 = term_s0 * pm.state(j).level_mean; 281 | Float_Type term_s2 = term_s1 * pm.state(j).level_mean; 282 | Float_Type term_l0 = p_ij * pm.state(j).sd_lambda; 283 | Float_Type term_l1 = term_l0 / pm.state(j).sd_mean; 284 | Float_Type term_l2 = term_l1 / pm.state(j).sd_mean; 285 | LOG(debug2) 286 | << "inner_loop k=" << k << " i=" << i << " j=" << j << " p_ij=" << p_ij 287 | << " term_s0=" << term_s0 << " term_s1=" << term_s1 << " term_s2=" << term_s2 288 | << " term_l0=" << term_l0 << " term_l1=" << term_l1 << " term_l2=" << term_l2 289 | << std::endl; 290 | s[0] += term_s0; 291 | s[1] += term_s1; 292 | s[2] += term_s2; 293 | l[0] += term_l0; 294 | l[1] += term_l1; 295 | l[2] += term_l2; 296 | } // for j 297 | A[0][0] += s[0]; /* only the upper triangle of A is accumulated here; the lower one is mirrored after the loops */ 298 | A[0][1] += s[1]; 299 | A[1][1] += s[2]; 300 | B[0] += s[0] * x_i; 301 | B[1] += s[1] * x_i; 302 | if (pm_train_drift()) 303 | { 304 | A[0][2] += s[0] * t_i; 305 | A[1][2] += s[1] * t_i; 306 | A[2][2] += s[0] * t_i * t_i; 307 | B[2] += s[0] * x_i * t_i; 308 | } 309 | D += s[0] * x_i * x_i; 310 | V_numer += l[2] * y_i; 311 | V_denom += l[1]; 312 | U_pos += l[0] / y_i; 313 | } // for i 314 | } // for k 315 | A[1][0] = A[0][1]; 316 | A[2][0] 
= A[0][2]; 317 | A[2][1] = A[1][2]; 318 | if (not pm_train_drift()) 319 | { 320 | A[2][2] = 1.0; /* with drift disabled every other entry of row/column 2 and B[2] are still 0; the 1 on the diagonal gives that row a non-zero pivot */ 321 | } 322 | auto A_copy = A; /* originals kept: the elimination below overwrites A and B */ 323 | auto B_copy = B; 324 | // compute scaling vector used for scaled partial pivoting 325 | std::array< double, 3 > C; 326 | for (unsigned i = 0; i < 3; ++i) 327 | { 328 | C[i] = alg::max_value_of(A[i]); // no need for abs(), as A>0 329 | } 330 | LOG(debug1) 331 | << "A={{" << A[0][0] << ", " << A[0][1] << ", " << A[0][2] 332 | << "}, {" << A[1][0] << ", " << A[1][1] << ", " << A[1][2] 333 | << "}, {" << A[2][0] << ", " << A[2][1] << ", " << A[2][2] 334 | << "}} B={" << B[0] << ", " << B[1] << ", " << B[2] 335 | << "} C={" << C[0] << ", " << C[1] << ", " << C[2] << "}" << std::endl; 336 | // 337 | // solve A * X = B using Gaussian elimination with partial pivoting 338 | // 339 | for (unsigned i = 0; i < 3; ++i) 340 | { 341 | unsigned p = i; 342 | double p_val = std::abs(A[i][i]) / C[p]; 343 | for (unsigned i2 = i + 1; i2 < 3; ++i2) 344 | { 345 | double i2_val = std::abs(A[i2][i]) / C[i2]; 346 | if (i2_val > p_val) 347 | { 348 | p = i2; 349 | p_val = i2_val; 350 | } 351 | } 352 | LOG(debug1) 353 | << "gaussian_elimination i=" << i << " p=" << p << " p_val=" << p_val << std::endl; 354 | // if the pivot is too small, consider matrix singular, and give up 355 | if (p_val < 1e-7) 356 | { 357 | done = true; 358 | new_pm_params = *data.pm_params_ptr; 359 | return; 360 | } 361 | // if necessary, interchange rows i & p 362 | if (p > i) 363 | { 364 | std::swap(A[i], A[p]); 365 | std::swap(B[i], B[p]); 366 | std::swap(C[i], C[p]); 367 | } 368 | // eliminate variable i from the equations below row i 369 | for (p = i + 1; p < 3; ++p) 370 | { 371 | double m = A[p][i] / A[i][i]; 372 | A[p][i] = 0; 373 | for (unsigned j = i + 1; j < 3; ++j) 374 | { 375 | A[p][j] -= m * A[i][j]; 376 | } 377 | B[p] -= m * B[i]; 378 | } 379 | LOG(debug1) 380 | << "gaussian_elimination i=" << i 381 | << " A={{" << A[0][0] << ", " << A[0][1] << ", " << A[0][2] 382 | << "}, {" 
<< A[1][0] << ", " << A[1][1] << ", " << A[1][2] 383 | << "}, {" << A[2][0] << ", " << A[2][1] << ", " << A[2][2] 384 | << "}} B={" << B[0] << ", " << B[1] << ", " << B[2] 385 | << "} C={" << C[0] << ", " << C[1] << ", " << C[2] << "}" << std::endl; 386 | } 387 | // solve the upper triangular system by hand, storing the solutions as the new parameters 388 | c_hat = B[2] / A[2][2]; 389 | b_hat = (B[1] - A[1][2] * c_hat) / A[1][1]; 390 | a_hat = (B[0] - A[0][1] * b_hat - A[0][2] * c_hat) / A[0][0]; 391 | LOG(debug1) 392 | << "update_step a=" << a_hat << " b=" << b_hat << " c=" << c_hat << std::endl; 393 | #ifndef NDEBUG 394 | // sanity check: each row of the original system must be reproduced by the solution, up to a relative tolerance 395 | for (unsigned i = 0; i < 3; ++i) 396 | { 397 | double x = (A_copy[i][0] * a_hat 398 | + A_copy[i][1] * b_hat 399 | + A_copy[i][2] * c_hat); 400 | ASSERT(std::abs(x - B_copy[i]) <= (pm_train_drift()? 1e-3 : 1e-2) * std::max(std::abs(x), std::abs(B_copy[i]))); /* the tolerance is parenthesized so the comparison is against it (unparenthesized, '<' binds tighter than '?:' and the assertion can never fail); written without a division so the all-zero drift row of the no-drift case passes instead of producing 0/0 */ 401 | } 402 | #endif 403 | // 404 | // update var 405 | // 406 | double d_numer = (D 407 | + a_hat * a_hat * A_copy[0][0] 408 | + b_hat * b_hat * A_copy[1][1] 409 | + c_hat * c_hat * A_copy[2][2] 410 | + 2.0 * a_hat * b_hat * A_copy[0][1] 411 | + 2.0 * a_hat * c_hat * A_copy[0][2] 412 | + 2.0 * b_hat * c_hat * A_copy[1][2] 413 | - 2.0 * (a_hat * B_copy[0] 414 | + b_hat * B_copy[1] 415 | + c_hat * B_copy[2]) 416 | ); 417 | d_hat = std::sqrt(d_numer / (double)total_n_events); 418 | LOG(debug1) << "update_step d=" << d_hat << std::endl; 419 | // 420 | // update scale_sd 421 | // 422 | v_hat = V_numer / V_denom; 423 | // 424 | // update var_sd 425 | // 426 | u_hat = (double)total_n_events / (U_pos - V_denom / v_hat); 427 | } 428 | 429 | /** 430 | * Train st_params on training data. 431 | * @data Training data, as filled by fill_train_data. 432 | * @new_st_params Destination for new st params. 
433 | */ 434 | static void train_st_params(const Train_Data& data, 435 | std::array< State_Transition_Parameters_Type, 2 >& new_st_params) 436 | { 437 | unsigned n_event_seqs = data.event_seq_ptr_v.size(); 438 | for (unsigned st = 0; st < 2; ++st) 439 | { 440 | ASSERT(data.st_params_ptr_v[st]); 441 | LogSumSet_Type s_p_stay_num(false); /* per-strand log-space accumulators: numerator for p_stay, numerator for p_skip, and their common denominator */ 442 | LogSumSet_Type s_p_skip_num(false); 443 | LogSumSet_Type s_denom(false); 444 | Float_Type log_p_stay = std::log(data.st_params_ptr_v[st]->p_stay); 445 | Float_Type log_p_step_4 = std::log(1.0 - data.st_params_ptr_v[st]->p_stay - data.st_params_ptr_v[st]->p_skip) - std::log(4.0); /* log of (1 - p_stay - p_skip) / 4 */ 446 | for (unsigned k = 0; k < n_event_seqs; ++k) 447 | { 448 | if (data.event_seq_ptr_v[k].second != st) continue; /* only sequences from strand st contribute */ 449 | const Pore_Model_Type& scaled_pm = data.scaled_model_v[st]; 450 | const Event_Sequence_Type& corrected_events = data.corrected_event_seq_v.at(k); 451 | unsigned n_events = corrected_events.size(); 452 | const Forward_Backward_Type& fwbw = data.fwbw_v.at(k); 453 | // 454 | // P[S_i = j1, S_{i+1} = j2] 455 | // 456 | auto log_joint_prob = [&] (unsigned i, unsigned j1, unsigned j2, Float_Type log_p_trans) { 457 | Float_Type p = fwbw.cell(i, j1).alpha 458 | + log_p_trans 459 | + scaled_pm.log_pr_corrected_emission(j2, corrected_events[i + 1]) 460 | + fwbw.cell(i + 1, j2).beta 461 | - fwbw.log_pr_data(); 462 | LOG(debug2) << "step_prob k=" << k 463 | << " i=" << i 464 | << " j1=" << Kmer_Type::to_string(j1) 465 | << " j2=" << Kmer_Type::to_string(j2) 466 | << " log_p_trans=" << log_p_trans 467 | << " res=" << p << std::endl; 468 | return p; 469 | }; 470 | /* NOTE(review): n_events is unsigned, so n_events - 1 wraps around for an empty sequence -- confirm callers never pass one */ 471 | for (unsigned i = 0; i < n_events - 1; ++i) 472 | { 473 | for (auto j1 : st_train_kmers()) 474 | { 475 | // Pr[ S_i = j1 ] 476 | Float_Type log_p_j1 = fwbw.log_posterior(i, j1); 477 | s_denom.add(log_p_j1); 478 | // Pr[ S_i = j1, S_{i+1} = j1 ] 479 | Float_Type log_p_j1_j1 = log_joint_prob(i, j1, j1, log_p_stay); 480 | if (log_p_j1_j1 > log_p_j1) 481 | { 482 | if (log_p_j1_j1 > 
log_p_j1 + std::max(std::abs(log_p_j1), 1.0f) * 1.0e-3) 483 | { 484 | LOG(warning) << "numerical error log_p_j1 [" << log_p_j1 485 | << "] log_p_j1_j1 [" << log_p_j1_j1 << "]" << std::endl; 486 | } 487 | log_p_j1_j1 = log_p_j1; /* a joint probability cannot exceed its marginal: cap it */ 488 | } 489 | s_p_stay_num.add(log_p_j1_j1); 490 | // Pr[ S_i = j1, dist(j1,S_{i+1}) <= 1 ]: the stay term plus the four 1-step neighbours 491 | Float_Type log_p_j1_d01; 492 | { 493 | LogSumSet_Type s2(false); 494 | s2.add(log_p_j1_j1); 495 | for (auto j2 : Kmer_Type::neighbour_list(j1, 1)) 496 | { 497 | // transition prob j1 to j2 is (p_step / 4) 498 | s2.add(log_joint_prob(i, j1, j2, log_p_step_4)); 499 | } 500 | log_p_j1_d01 = s2.val(); 501 | } 502 | if (log_p_j1_d01 > log_p_j1) 503 | { 504 | if (log_p_j1_d01 > log_p_j1 + std::max(std::abs(log_p_j1), 1.0f) * 1.0e-3) 505 | { 506 | LOG(warning) << "numerical error log_p_j1 [" << log_p_j1 507 | << "] log_p_j1_d01 [" << log_p_j1_d01 << "]" << std::endl; 508 | } 509 | log_p_j1_d01 = log_p_j1; /* same cap, so the difference below is never negative */ 510 | } 511 | Float_Type p_j1_d2 = std::exp(log_p_j1) - std::exp(log_p_j1_d01); /* remaining mass: Pr[ S_i = j1 ] minus the stay-or-one-step mass, used as the skip numerator */ 512 | s_p_skip_num.add(std::log(p_j1_d2)); 513 | } // for j1 514 | } // for i 515 | } // for k 516 | new_st_params[st].p_stay = std::exp(s_p_stay_num.val() - s_denom.val()); 517 | new_st_params[st].p_skip = std::exp(s_p_skip_num.val() - s_denom.val()); 518 | if (new_st_params[st].p_stay < .05 or new_st_params[st].p_stay > .4 519 | or new_st_params[st].p_skip < .05 or new_st_params[st].p_skip > .4) 520 | { 521 | State_Transition_Parameters_Type alt_st_params; /* both estimates clamped into [0.05, 0.4] */ 522 | alt_st_params.p_stay = std::max(new_st_params[st].p_stay, .05f); 523 | alt_st_params.p_stay = std::min(alt_st_params.p_stay, .4f); 524 | alt_st_params.p_skip = std::max(new_st_params[st].p_skip, .05f); 525 | alt_st_params.p_skip = std::min(alt_st_params.p_skip, .4f); 526 | LOG(warning) << "unusual state transition parameters " << new_st_params[st] 527 | << " for strand [" << st 528 | << "] resetting them to " << alt_st_params << std::endl; 529 | std::swap(alt_st_params, new_st_params[st]); 530 | } 531 | } // for 
st 532 | } // train_st_params() 533 | 534 | /** 535 | * Perform one training round. 536 | * @new_pm_params Destination for trained pm params (common to both strands); written only when train_scaling is set 537 | * @new_st_params Destination for trained st params (per strand); written when train_transitions is set, or copied from crt_st_params when scaling training hits a singularity 538 | * @fit Destination for pr_data using crt params 539 | * @done Bool; set to true if no more training rounds can be performed due to singularity, and to false otherwise (including when train_scaling is off). 540 | */ 541 | static void train_one_round( 542 | const std::vector< std::pair< const Event_Sequence_Type*, unsigned > >& event_seq_ptrs, 543 | const std::array< const Pore_Model_Type*, 2 >& model_ptrs, 544 | const State_Transitions_Type& default_transitions, 545 | const Pore_Model_Parameters_Type& crt_pm_params, 546 | const std::array< State_Transition_Parameters_Type, 2 >& crt_st_params, 547 | Pore_Model_Parameters_Type& new_pm_params, 548 | std::array< State_Transition_Parameters_Type, 2 >& new_st_params, 549 | Float_Type& fit, 550 | bool& done, 551 | bool train_scaling, 552 | bool train_transitions) 553 | { 554 | // initialize training data 555 | Train_Data data; 556 | data.event_seq_ptr_v = event_seq_ptrs; 557 | data.model_ptr_v = model_ptrs; 558 | data.default_transitions_ptr = &default_transitions; 559 | data.pm_params_ptr = &crt_pm_params; 560 | data.st_params_ptr_v = {{ &crt_st_params[0], &crt_st_params[1] }}; 561 | // fill the training data 562 | fill_train_data(data); 563 | fit = data.fit; done = false; /* always define the output flag: train_pm_params() is the only other writer and it runs only when train_scaling is set */ 564 | if (train_scaling) 565 | { 566 | // train pm params 567 | train_pm_params(data, new_pm_params, done); 568 | if (done) 569 | { 570 | new_st_params = crt_st_params; 571 | return; 572 | } 573 | } 574 | if (train_transitions) 575 | { 576 | // train st params 577 | train_st_params(data, new_st_params); 578 | } 579 | } // train_one_round 580 | 581 | }; // class Parameter_Trainer 582 | 583 | #endif 584 | -------------------------------------------------------------------------------- /src/nanocall/Pore_Model.hpp: 
/**
 * Natural log of the normal density with the given mean and standard
 * deviation, evaluated at x.
 *
 * @param x        Point at which the density is evaluated.
 * @param mean     Mean of the distribution.
 * @param stdv     Standard deviation of the distribution.
 * @param log_stdv log(stdv), supplied by the caller so it is not recomputed here.
 */
template < typename Float_Type >
inline Float_Type log_normal_pdf(Float_Type x, Float_Type mean, Float_Type stdv, Float_Type log_stdv)
{
    // formula from: http://stackoverflow.com/questions/10847007/using-the-gaussian-probability-density-function-in-c
    static const Float_Type log_2pi = std::log(2.0 * M_PI);
    const Float_Type two = static_cast< Float_Type >(2.0);
    const Float_Type z = (x - mean) / stdv;
    const Float_Type half_term = (log_2pi + z * z) / two;
    return - log_stdv - half_term;
}

/**
 * Natural log of the inverse Gaussian density with mean mu and shape lambda,
 * evaluated at x.
 *
 * @param x          Point at which the density is evaluated.
 * @param log_x      log(x), supplied by the caller.
 * @param mu         Mean of the distribution.
 * @param lambda     Shape parameter.
 * @param log_lambda log(lambda), supplied by the caller.
 */
template < typename Float_Type >
inline Float_Type log_invgauss_pdf(Float_Type x, Float_Type log_x,
                                   Float_Type mu, Float_Type lambda, Float_Type log_lambda)
{
    static const Float_Type log_2pi = std::log(2.0 * M_PI);
    const Float_Type rel_dev = (x - mu) / mu;
    Float_Type twice_log_pdf = log_lambda - log_2pi;
    twice_log_pdf = twice_log_pdf - static_cast< Float_Type >(3.0) * log_x;
    twice_log_pdf = twice_log_pdf - lambda * rel_dev * rel_dev / x;
    return twice_log_pdf / static_cast< Float_Type >(2.0);
}
56 | assert(f.have_basecall_model(strand)); 57 | auto m_p = f.get_basecall_model_params(strand); 58 | scale = m_p.scale; 59 | shift = m_p.shift; 60 | drift = m_p.drift; 61 | var = m_p.var; 62 | scale_sd = m_p.scale_sd; 63 | var_sd = m_p.var_sd; 64 | } 65 | 66 | friend std::ostream& operator << (std::ostream& os, const Pore_Model_Parameters& p) 67 | { 68 | os << "[scale=" << p.scale << " shift=" << p.shift << " drift=" << p.drift 69 | << " var=" << p.var << " scale_sd=" << p.scale_sd << " var_sd=" << p.var_sd << "]"; 70 | return os; 71 | } 72 | void write_tsv(std::ostream& os) const 73 | { 74 | os << std::fixed << std::setprecision(5) 75 | << scale << '\t' << shift << '\t' << drift << '\t' << var << '\t' << scale_sd << '\t' << var_sd; 76 | } 77 | }; // struct Pore_Model_Parameters 78 | 79 | template < typename Float_Type, unsigned Kmer_Size > 80 | struct Pore_Model_State 81 | { 82 | typedef Event< Float_Type, Kmer_Size > Event_Type; 83 | typedef Pore_Model_Parameters< Float_Type > Pore_Model_Parameters_Type; 84 | 85 | Float_Type level_mean; 86 | Float_Type level_stdv; 87 | Float_Type sd_mean; 88 | Float_Type sd_stdv; 89 | Float_Type sd_lambda; 90 | 91 | Float_Type log_level_mean; 92 | Float_Type log_level_stdv; 93 | Float_Type log_sd_mean; 94 | Float_Type log_sd_stdv; 95 | Float_Type log_sd_lambda; 96 | 97 | std::array< char, Kmer_Size > kmer; 98 | 99 | Pore_Model_State& operator = (const fast5::Model_Entry& e) 100 | { 101 | level_mean = e.level_mean; 102 | level_stdv = e.level_stdv; 103 | sd_mean = e.sd_mean; 104 | sd_stdv = e.sd_stdv; 105 | std::copy_n(e.kmer.begin(), Kmer_Size, kmer.begin()); 106 | update_sd_lambda(); 107 | update_logs(); 108 | return *this; 109 | } 110 | 111 | // update sd_lambda based on sd_mean & sd_stdv 112 | void update_sd_lambda() { sd_lambda = pow(sd_mean, 3.0) / pow(sd_stdv, 2.0); } 113 | 114 | // update sd_stdv based on sd_mean & sd_lambda 115 | void update_sd_stdv() { sd_stdv = pow(pow(sd_mean, 3.0) / sd_lambda, .5); } 116 | 117 | // 
update logs 118 | void update_logs() 119 | { 120 | log_level_mean = std::log(level_mean); 121 | log_level_stdv = std::log(level_stdv); 122 | log_sd_mean = std::log(sd_mean); 123 | log_sd_lambda = std::log(sd_lambda); 124 | } 125 | 126 | void scale(const Pore_Model_Parameters_Type& params, const Pore_Model_Parameters_Type& log_params) 127 | { 128 | // these functions are provided by ONT 129 | level_mean = level_mean * params.scale + params.shift; 130 | level_stdv = level_stdv * params.var; 131 | sd_mean = sd_mean * params.scale_sd; 132 | sd_lambda = sd_lambda * params.var_sd; 133 | update_sd_stdv(); 134 | log_level_mean = std::log(level_mean); 135 | log_level_stdv += log_params.var; 136 | log_sd_mean += log_params.scale_sd; 137 | log_sd_lambda += log_params.var_sd; 138 | } 139 | 140 | Float_Type log_pr_emission(const Event_Type& e) const 141 | { 142 | return (log_normal_pdf< Float_Type >(e.mean, level_mean, level_stdv, log_level_stdv) 143 | + log_invgauss_pdf< Float_Type >(e.stdv, e.log_stdv, sd_mean, sd_lambda, log_sd_lambda)); 144 | } 145 | Float_Type log_pr_corrected_emission(const Event_Type& e) const 146 | { 147 | return (log_normal_pdf< Float_Type >(e.corrected_mean, level_mean, level_stdv, log_level_stdv) 148 | + log_invgauss_pdf< Float_Type >(e.stdv, e.log_stdv, sd_mean, sd_lambda, log_sd_lambda)); 149 | } 150 | 151 | friend std::ostream& operator << (std::ostream& os, const Pore_Model_State& state) 152 | { 153 | os << std::string(state.kmer.begin(), state.kmer.end()) << '\t' 154 | << state.level_mean << '\t' 155 | << state.level_stdv << '\t' 156 | << state.sd_mean << '\t' 157 | << state.sd_stdv; 158 | return os; 159 | } 160 | 161 | friend bool operator < (const Pore_Model_State& lhs, const Pore_Model_State& rhs) 162 | { 163 | return lhs.kmer < rhs.kmer; 164 | } 165 | }; // struct Pore_Model_State 166 | 167 | template < typename Float_Type, unsigned Kmer_Size = 6 > 168 | class Pore_Model 169 | { 170 | public: 171 | typedef Kmer< Kmer_Size > Kmer_Type; 172 | 
typedef Event< Float_Type, Kmer_Size > Event_Type; 173 | typedef Pore_Model_State< Float_Type, Kmer_Size > Pore_Model_State_Type; 174 | typedef Pore_Model_Parameters< Float_Type > Pore_Model_Parameters_Type; 175 | static const unsigned n_states = 1u << (2 * Kmer_Size); 176 | 177 | Pore_Model() : _strand(2) {} 178 | void clear() { _state.clear(); } 179 | 180 | const Pore_Model_State_Type& state(unsigned i) const { return _state.at(i); } 181 | Pore_Model_State_Type& state(unsigned i) { return _state.at(i); } 182 | 183 | const std::vector< Pore_Model_State_Type >& get_state_vector() const { return _state; } 184 | 185 | const unsigned& strand() const { return _strand; } 186 | unsigned& strand() { return _strand; } 187 | Float_Type mean() const { return _mean; } 188 | Float_Type stdv() const { return _stdv; } 189 | 190 | void scale(const Pore_Model_Parameters_Type& params) 191 | { 192 | Pore_Model_Parameters_Type log_params; 193 | log_params.var = std::log(params.var); 194 | log_params.scale_sd = std::log(params.scale_sd); 195 | log_params.var_sd = std::log(params.var_sd); 196 | for(unsigned i = 0; i < n_states; ++i) 197 | { 198 | state(i).scale(params, log_params); 199 | } 200 | update_statistics(); 201 | } 202 | 203 | // load model from fast5 file 204 | void load_from_fast5(const fast5::File& f, bool strand) 205 | { 206 | assert(f.have_basecall_model(strand)); 207 | auto m = f.get_basecall_model(strand); 208 | assert(m.size() == n_states); 209 | _state.clear(); 210 | _state.reserve(n_states); 211 | for (unsigned i = 0; i < n_states; ++i) 212 | { 213 | _state.emplace_back(); 214 | state(i) = m.at(i); 215 | } 216 | update_statistics(); 217 | } 218 | 219 | // load from vector 220 | template < typename V_Float_Type > 221 | void load_from_vector(const std::vector< V_Float_Type >& v) 222 | { 223 | assert(v.size() == n_states * 4); 224 | _state.clear(); 225 | _state.reserve(n_states); 226 | for (unsigned i = 0; i < n_states; ++i) 227 | { 228 | _state.emplace_back(); 229 | 
state(i).level_mean = v[4 * i + 0]; 230 | state(i).level_stdv = v[4 * i + 1]; 231 | state(i).sd_mean = v[4 * i + 2]; 232 | state(i).sd_stdv = v[4 * i + 3]; 233 | auto s = Kmer_Type::to_string(i); 234 | std::copy_n(s.begin(), Kmer_Size, state(i).kmer.begin()); 235 | state(i).update_sd_lambda(); 236 | state(i).update_logs(); 237 | } 238 | update_statistics(); 239 | } 240 | 241 | // write model to out stream 242 | friend std::ostream& operator << (std::ostream& os, const Pore_Model& pm) 243 | { 244 | for (unsigned i = 0; i < pm.n_states; ++i) 245 | { 246 | os << pm.state(i) << std::endl; 247 | } 248 | return os; 249 | } 250 | // load model from input stream 251 | friend std::istream& operator >> (std::istream& is, Pore_Model& pm) 252 | { 253 | pm._state.clear(); 254 | pm._state.reserve(n_states); 255 | unsigned i = 0; 256 | std::string line; 257 | while (std::getline(is, line)) 258 | { 259 | std::istringstream iss(line); 260 | std::string s; 261 | iss >> s; 262 | if (s[0] == '#') continue; 263 | if (line.find("kmer") != std::string::npos) continue; 264 | pm._state.emplace_back(); 265 | iss >> pm.state(i).level_mean 266 | >> pm.state(i).level_stdv 267 | >> pm.state(i).sd_mean 268 | >> pm.state(i).sd_stdv; 269 | std::copy_n(s.begin(), Kmer_Size, pm.state(i).kmer.begin()); 270 | pm.state(i).update_sd_lambda(); 271 | pm.state(i).update_logs(); 272 | ++i; 273 | } 274 | if (i != pm.n_states) 275 | { 276 | LOG(error) 277 | << "unexpected number of states" << std::endl; 278 | std::exit(EXIT_FAILURE); 279 | } 280 | std::sort(pm._state.begin(), pm._state.end()); 281 | for (unsigned i = 0; i < pm.n_states; ++i) 282 | { 283 | assert(Kmer_Type::to_int(pm.state(i).kmer) == i); 284 | } 285 | pm.update_statistics(); 286 | return is; 287 | } 288 | 289 | // log of probability of an emission from a state 290 | Float_Type log_pr_emission(unsigned i, const Event_Type& e) const 291 | { 292 | Float_Type res = state(i).log_pr_emission(e); 293 | return res; 294 | } 295 | Float_Type 
log_pr_corrected_emission(unsigned i, const Event_Type& e) const 296 | { 297 | Float_Type res = state(i).log_pr_corrected_emission(e); 298 | return res; 299 | } 300 | 301 | private: 302 | std::vector< Pore_Model_State_Type > _state; 303 | Float_Type _mean; 304 | Float_Type _stdv; 305 | unsigned _strand; 306 | 307 | void update_statistics() 308 | { 309 | assert(_state.size() == n_states); 310 | std::tie(_mean, _stdv) = alg::mean_stdv_of< Float_Type >( 311 | _state, 312 | [] (const Pore_Model_State_Type& s) { return s.level_mean; }); 313 | } 314 | }; // class Pore_Model 315 | 316 | template < typename Float_Type, unsigned Kmer_Size > 317 | using Pore_Model_Dict = std::map< std::string, Pore_Model< Float_Type, Kmer_Size > >; 318 | 319 | #endif 320 | -------------------------------------------------------------------------------- /src/nanocall/State_Transitions.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __STATE_TRANSITIONS_BASE_HPP 2 | #define __STATE_TRANSITIONS_BASE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "Kmer.hpp" 11 | #include "logsumset.hpp" 12 | #include "logger.hpp" 13 | 14 | template < typename Float_Type > 15 | struct State_Transition_Parameters 16 | { 17 | Float_Type p_stay; 18 | Float_Type p_skip; 19 | 20 | static Float_Type& default_p_stay() 21 | { 22 | static Float_Type _default_p_stay = .09; 23 | return _default_p_stay; 24 | } 25 | static Float_Type& default_p_skip() 26 | { 27 | static Float_Type _default_p_skip = .28; 28 | return _default_p_skip; 29 | } 30 | 31 | State_Transition_Parameters() 32 | : p_stay(default_p_stay()), p_skip(default_p_skip()) {} 33 | 34 | bool is_default() const 35 | { 36 | return p_stay == default_p_stay() and p_skip == default_p_skip(); 37 | } 38 | 39 | friend std::ostream& operator << (std::ostream& os, const State_Transition_Parameters& stp) 40 | { 41 | os << "[p_stay=" << stp.p_stay 42 | << " p_skip=" << stp.p_skip << 
"]"; 43 | return os; 44 | } 45 | void write_tsv(std::ostream& os) const 46 | { 47 | os << std::fixed << std::setprecision(5) 48 | << p_stay << '\t' 49 | << p_skip; 50 | } 51 | }; // struct State_Transition_Parameters 52 | 53 | template < typename Float_Type > 54 | struct State_Neighbours 55 | { 56 | State_Neighbours() : p_rest_from(-INFINITY), p_rest_to(-INFINITY) {} 57 | std::vector< std::pair< unsigned, Float_Type > > from_v; 58 | std::vector< std::pair< unsigned, Float_Type > > to_v; 59 | Float_Type p_rest_from; 60 | Float_Type p_rest_to; 61 | }; // struct State_Neighbours 62 | 63 | template < typename Float_Type, unsigned Kmer_Size = 6 > 64 | class State_Transitions 65 | { 66 | public: 67 | typedef Kmer< Kmer_Size > Kmer_Type; 68 | typedef State_Neighbours< Float_Type > State_Neighbours_Type; 69 | typedef State_Transition_Parameters< Float_Type > State_Transition_Parameters_Type; 70 | static const unsigned n_states = 1u << (2 * Kmer_Size); 71 | 72 | State_Transitions() = default; 73 | void clear() { _neighbours.clear(); } 74 | 75 | const State_Neighbours_Type& neighbours(unsigned i) const { return _neighbours.at(i); } 76 | State_Neighbours_Type& neighbours(unsigned i) { return _neighbours.at(i); } 77 | 78 | // update fields from_v, p_rest_from, p_rest_to based on to_v 79 | void update_fields() 80 | { 81 | for (unsigned i = 0; i < n_states; ++i) 82 | { 83 | neighbours(i).from_v.clear(); 84 | } 85 | for (unsigned i = 0; i < n_states; ++i) 86 | { 87 | logsum::logsumset< Float_Type > s(false); 88 | for (const auto& p : neighbours(i).to_v) 89 | { 90 | neighbours(p.first).from_v.push_back(std::make_pair(i, p.second)); 91 | s.add(p.second); 92 | } 93 | neighbours(i).p_rest_to = std::log(1 - std::exp(s.val())); 94 | } 95 | for (unsigned i = 0; i < n_states; ++i) 96 | { 97 | logsum::logsumset< Float_Type > s(false); 98 | for (const auto& p : neighbours(i).from_v) 99 | { 100 | s.add(p.second); 101 | } 102 | neighbours(i).p_rest_from = std::log(1 - std::exp(s.val())); 103 
| } 104 | } 105 | 106 | // drop transitions with low probability 107 | void drop_transitions(Float_Type p_cutoff) 108 | { 109 | Float_Type log_p_cutoff = std::log(p_cutoff); 110 | for (unsigned i = 0; i < n_states; ++i) 111 | { 112 | decltype(neighbours(i).to_v) to_v; 113 | for (const auto& p : neighbours(i).to_v) 114 | { 115 | if (p.second > log_p_cutoff) 116 | { 117 | to_v.push_back(p); 118 | } 119 | } 120 | neighbours(i).to_v = std::move(to_v); 121 | } 122 | update_fields(); 123 | } 124 | 125 | static Float_Type get_trans_prob(unsigned i, unsigned j, 126 | Float_Type p_stay, Float_Type p_step, Float_Type p_skip_1) 127 | { 128 | Float_Type p = 0; 129 | if (i == j) 130 | { 131 | p += p_stay; 132 | } 133 | if (Kmer_Type::suffix(i, Kmer_Size - 1) == Kmer_Type::prefix(j, Kmer_Size - 1)) 134 | { 135 | p += p_step / 4; 136 | } 137 | for (unsigned l = 2; l < Kmer_Size; ++l) 138 | if (Kmer_Type::suffix(i, Kmer_Size - l) == Kmer_Type::prefix(j, Kmer_Size - l)) 139 | { 140 | p += pow(p_skip_1, l - 1) / (1u << (2 * l)); 141 | } 142 | p += (pow(p_skip_1, 5) / (Float_Type(1.0) - p_skip_1)) / n_states; 143 | return p; 144 | } 145 | 146 | // recompute transition table 147 | void compute_transitions(Float_Type p_skip_default, Float_Type p_stay, Float_Type p_cutoff, 148 | const std::map< unsigned, Float_Type >& p_skip_map = {}) 149 | { 150 | _neighbours.clear(); 151 | _neighbours.reserve(n_states); 152 | for (unsigned i = 0; i < n_states; ++i) 153 | { 154 | _neighbours.emplace_back(); 155 | Float_Type p_skip = p_skip_default; 156 | if (p_skip_map.count(i)) 157 | { 158 | p_skip = p_skip_map.at(i); 159 | } 160 | Float_Type p_step = 1.0 - p_stay - p_skip; 161 | // p_skip = sum_{i>=1} p_skip_1^i 162 | Float_Type p_skip_1 = p_skip / (p_skip + 1.0); 163 | LOG(debug2) << "i=" << Kmer_Type::to_string(i) 164 | << " p_stay=" << p_stay 165 | << " p_skip=" << p_skip 166 | << " p_step=" << p_step 167 | << " p_skip_1=" << p_skip_1 << std::endl; 168 | for (unsigned j = 0; j < n_states; ++j) 169 
| { 170 | Float_Type p = get_trans_prob(i, j, p_stay, p_step, p_skip_1); 171 | if (p > p_cutoff) 172 | { 173 | neighbours(i).to_v.push_back(std::make_pair(j, std::log(p))); 174 | } 175 | } 176 | } 177 | update_fields(); 178 | } 179 | 180 | // compute transition table allowing a maximum of 1 skip 181 | void compute_transitions_fast(Float_Type p_skip_default, Float_Type p_stay, 182 | const std::map< unsigned, Float_Type >& p_skip_map = {}) 183 | { 184 | struct Default_Float_Type 185 | { 186 | Default_Float_Type(Float_Type _val = 0.0) : val(_val) {} 187 | Float_Type val; 188 | }; // struct Default_Float 189 | 190 | _neighbours.clear(); 191 | _neighbours.reserve(n_states); 192 | for (unsigned i = 0; i < n_states; ++i) 193 | { 194 | _neighbours.emplace_back(); 195 | Float_Type p_skip = p_skip_default; 196 | if (p_skip_map.count(i)) 197 | { 198 | p_skip = p_skip_map.at(i); 199 | } 200 | Float_Type p_step = 1.0 - p_stay - p_skip; 201 | // p_skip = sum_{i>=1} p_skip_1^i 202 | Float_Type p_skip_1 = p_skip / (p_skip + 1.0); 203 | LOG(debug2) << "i=" << Kmer_Type::to_string(i) 204 | << " p_stay=" << p_stay 205 | << " p_skip=" << p_skip 206 | << " p_step=" << p_step 207 | << " p_skip_1=" << p_skip_1 << std::endl; 208 | std::set< unsigned > to_s{i}; 209 | const auto& nl1 = Kmer_Type::neighbour_list(i, 1); 210 | to_s.insert(nl1.begin(), nl1.end()); 211 | const auto& nl2 = Kmer_Type::neighbour_list(i, 2); 212 | to_s.insert(nl2.begin(), nl2.end()); 213 | for (const auto& j : to_s) 214 | { 215 | Float_Type p = get_trans_prob(i, j, p_stay, p_step, p_skip_1); 216 | neighbours(i).to_v.push_back(std::make_pair(j, std::log(p))); 217 | } 218 | } 219 | update_fields(); 220 | } 221 | void compute_transitions_fast(const State_Transition_Parameters_Type& stp) 222 | { 223 | compute_transitions_fast(stp.p_skip, stp.p_stay); 224 | } 225 | 226 | friend std::ostream& operator << (std::ostream& os, const State_Transitions& st) 227 | { 228 | for (unsigned i = 0; i < n_states; ++i) 229 | { 230 | for 
(const auto& p : st.neighbours(i).to_v) 231 | { 232 | os << Kmer_Type::to_string(i) << '\t' << Kmer_Type::to_string(p.first) << '\t' << p.second << std::endl; 233 | } 234 | } 235 | return os; 236 | } 237 | friend std::istream& operator >> (std::istream& is, State_Transitions& st) 238 | { 239 | st._neighbours.clear(); 240 | st._neighbours.resize(n_states); 241 | std::string k_i; 242 | std::string k_j; 243 | Float_Type p; 244 | while (is >> k_i >> k_j >> p) 245 | { 246 | unsigned i = Kmer_Type::to_int(k_i); 247 | unsigned j = Kmer_Type::to_int(k_j); 248 | st.neighbours(i).to_v.push_back(std::make_pair(j, p)); 249 | } 250 | st.update_fields(); 251 | return is; 252 | } 253 | 254 | private: 255 | std::vector< State_Neighbours_Type > _neighbours; 256 | }; // class State_Transitions 257 | 258 | #endif 259 | -------------------------------------------------------------------------------- /src/nanocall/Viterbi.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __VITERBI_HPP 2 | #define __VITERBI_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "Pore_Model.hpp" 10 | #include "State_Transitions.hpp" 11 | #include "logsumset.hpp" 12 | #include "logger.hpp" 13 | #include "fast5.hpp" 14 | 15 | template < typename Float_Type, unsigned Kmer_Size = 6 > 16 | class Viterbi 17 | { 18 | public: 19 | typedef Kmer< Kmer_Size > Kmer_Type; 20 | typedef Pore_Model< Float_Type, Kmer_Size > Pore_Model_Type; 21 | typedef State_Transitions< Float_Type, Kmer_Size > State_Transitions_Type; 22 | typedef Event< Float_Type, Kmer_Size > Event_Type; 23 | typedef Event_Sequence< Float_Type, Kmer_Size > Event_Sequence_Type; 24 | typedef logsum::logsumset< Float_Type > LogSumSet_Type; 25 | 26 | struct Matrix_Entry 27 | { 28 | Float_Type alpha; // := Pr[ MLSS producing e_1 ... 
e_i, with S_i == j ] 29 | unsigned beta; // := previous state in the MLSS 30 | }; // struct Matrix_Entry 31 | 32 | static const unsigned n_states = Pore_Model_Type::n_states; 33 | 34 | unsigned n_events() const { return _n_events; } 35 | Float_Type path_probability() const { return _path_probability; } 36 | 37 | // i: event index 38 | // j: state/kmer index 39 | const Matrix_Entry& cell(unsigned i, unsigned j) const { return _m[i * n_states + j]; } 40 | Matrix_Entry& cell(unsigned i, unsigned j) { return _m[i * n_states + j]; } 41 | 42 | static unsigned& n_threads() { static unsigned _n_threads = 1; return _n_threads; } 43 | 44 | void fill(const Pore_Model_Type& pm, 45 | const State_Transitions_Type& st, 46 | Event_Sequence_Type& ev) 47 | { 48 | _n_events = ev.size(); 49 | _m.clear(); 50 | _m.resize(n_states * n_events()); 51 | Float_Type log_n_states = std::log(static_cast< Float_Type >(n_states)); 52 | // 53 | // alpha, beta; i == 0 54 | // 55 | { 56 | LOG("Viterbi", debug1) << "forward: i=0" << std::endl; 57 | for (unsigned j = 0; j < n_states; ++j) 58 | { 59 | // alpha 60 | cell(0, j).alpha = pm.log_pr_corrected_emission(j, ev[0]) - log_n_states; 61 | // beta 62 | cell(0, j).beta = n_states; 63 | LOG("Viterbi", debug2) 64 | << "i=0 j=" << Kmer_Type::to_string(j) 65 | << " alpha=" << cell(0, j).alpha 66 | << " beta=" << cell(0, j).beta << std::endl; 67 | } 68 | } 69 | // 70 | // alpha, beta; i > 0 71 | // 72 | for (unsigned i = 1; i < n_events(); ++i) 73 | { 74 | LOG("Viterbi", debug1) << "forward: i=" << i << std::endl; 75 | for (unsigned j = 0; j < n_states; ++j) // TODO: parallelize 76 | { 77 | cell(i, j).alpha = -INFINITY; 78 | cell(i, j).beta = n_states; 79 | for (const auto& p : st.neighbours(j).from_v) 80 | { 81 | const unsigned& j_prev = p.first; 82 | const Float_Type& log_pr_transition = p.second; 83 | Float_Type v = log_pr_transition + cell(i - 1, j_prev).alpha; 84 | if (v > cell(i, j).alpha) 85 | { 86 | cell(i, j).alpha = v; 87 | cell(i, j).beta = 
j_prev; 88 | } 89 | } 90 | cell(i, j).alpha += pm.log_pr_corrected_emission(j, ev[i]); 91 | LOG("Viterbi", debug2) 92 | << "i=" << i << " j=" << Kmer_Type::to_string(j) 93 | << " alpha=" << cell(i, j).alpha 94 | << " beta=" << cell(i, j).beta << std::endl; 95 | } 96 | } 97 | fill_state_seq(ev); 98 | fill_move_seq(ev); 99 | } 100 | 101 | friend std::ostream& operator << (std::ostream& os, const Viterbi& vit) 102 | { 103 | for (unsigned i = 0; i < vit.n_events(); ++i) 104 | { 105 | for (unsigned j = 0; j < vit.n_states; ++j) 106 | { 107 | os << i << '\t' << j << '\t' 108 | << vit.cell(i, j).alpha << '\t' 109 | << vit.cell(i, j).beta << std::endl; 110 | } 111 | } 112 | return os; 113 | } 114 | 115 | private: 116 | std::vector< Matrix_Entry > _m; 117 | Float_Type _path_probability; 118 | unsigned _n_events; 119 | 120 | void fill_state_seq(Event_Sequence_Type& ev) 121 | { 122 | assert(Kmer_Size <= MAX_K_LEN); 123 | Float_Type max_v = -INFINITY; 124 | unsigned max_j = n_states; 125 | for (unsigned j = 0; j < n_states; ++j) 126 | { 127 | if (cell(n_events() - 1, j).alpha > max_v) 128 | { 129 | max_j = j; 130 | max_v = cell(n_events() - 1, j).alpha; 131 | } 132 | } 133 | _path_probability = max_v; 134 | for (unsigned i = n_events() - 1; i > 0; --i) 135 | { 136 | ev[i].model_state_idx = max_j; 137 | ev[i].set_model_state(Kmer_Type::to_string(ev[i].model_state_idx)); 138 | max_j = cell(i, max_j).beta; 139 | } 140 | ev[0].model_state_idx = max_j; 141 | ev[0].set_model_state(Kmer_Type::to_string(ev[0].model_state_idx)); 142 | } 143 | 144 | void fill_move_seq(Event_Sequence_Type& ev) 145 | { 146 | for (unsigned i = 0; i < n_events(); ++i) 147 | { 148 | ev[i].move = i > 0? 
Kmer_Type::min_skip(ev[i - 1].model_state_idx, ev[i].model_state_idx) : 0u; 149 | } 150 | } 151 | 152 | }; // class Viterbi 153 | 154 | #endif 155 | -------------------------------------------------------------------------------- /src/nanocall/compute-scaled-pore-model.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "zstr.hpp" 6 | #include "Pore_Model.hpp" 7 | #include "logger.hpp" 8 | 9 | using namespace std; 10 | 11 | #ifndef FLOAT_TYPE 12 | #define FLOAT_TYPE float 13 | #endif 14 | typedef Pore_Model< FLOAT_TYPE > Pore_Model_Type; 15 | typedef Pore_Model_Type::Pore_Model_Parameters_Type Pore_Model_Parameters_Type; 16 | 17 | namespace opts 18 | { 19 | using namespace TCLAP; 20 | string description = 21 | "Compute scaled pore model."; 22 | CmdLine cmd_parser(description); 23 | MultiArg< string > log_level("d", "log-level", "Log level.", false, "string", cmd_parser); 24 | ValueArg< string > file_name("f", "file-name", "Fast5 file.", true, "", "file", cmd_parser); 25 | ValueArg< unsigned > strand("s", "strand", "Strand.", false, 0, "file", cmd_parser); 26 | ValueArg< string > output_file_name("o", "output", "Output file name.", false, "", "file", cmd_parser); 27 | } // namespace opts 28 | 29 | void real_main() 30 | { 31 | Pore_Model_Type m; 32 | Pore_Model_Parameters_Type m_params; 33 | m.load_from_fast5(fast5::File(opts::file_name), opts::strand); 34 | m_params.load_from_fast5(fast5::File(opts::file_name), opts::strand); 35 | m.scale(m_params); 36 | if (not opts::output_file_name.get().empty()) 37 | { 38 | strict_fstream::ofstream(opts::output_file_name.get()) << m; 39 | } 40 | else 41 | { 42 | cout << m; 43 | } 44 | } 45 | 46 | int main(int argc, char * argv[]) 47 | { 48 | opts::cmd_parser.parse(argc, argv); 49 | logger::Logger::set_levels_from_options(opts::log_level); 50 | real_main(); 51 | } 52 | 
-------------------------------------------------------------------------------- /src/nanocall/compute-state-transitions.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "zstr.hpp" 6 | #include "State_Transitions.hpp" 7 | #include "logger.hpp" 8 | 9 | using namespace std; 10 | 11 | #ifndef FLOAT_TYPE 12 | #define FLOAT_TYPE float 13 | #endif 14 | typedef State_Transitions< FLOAT_TYPE > State_Transitions_Type; 15 | 16 | namespace opts 17 | { 18 | using namespace TCLAP; 19 | string description = 20 | "Compute state transition probabilities based on the overlap model, for a given pr_skip and pr_stay."; 21 | CmdLine cmd_parser(description); 22 | MultiArg< string > log_level("d", "log-level", "Log level.", false, "string", cmd_parser); 23 | ValueArg< string > output_file_name("o", "output", "Output file name.", false, "", "file", cmd_parser); 24 | ValueArg< float > p_cutoff("p", "pr-cutoff", "Minimim prob to keep.", false, 0.001, "float", cmd_parser); 25 | ValueArg< float > p_skip("k", "pr-skip", "Pr skip.", false, 0.28, "float", cmd_parser); 26 | ValueArg< float > p_stay("t", "pr-stay", "Pr stay.", false, 0.09, "float", cmd_parser); 27 | SwitchArg fast("", "fast", "Use fast computation.", cmd_parser); 28 | } // namespace opts 29 | 30 | void real_main() 31 | { 32 | State_Transitions_Type st; 33 | if (opts::fast) 34 | { 35 | st.compute_transitions_fast(opts::p_skip, opts::p_stay); 36 | } 37 | else 38 | { 39 | st.compute_transitions(opts::p_skip, opts::p_stay, opts::p_cutoff); 40 | } 41 | if (not opts::output_file_name.get().empty()) 42 | { 43 | strict_fstream::ofstream(opts::output_file_name) << st; 44 | } 45 | else 46 | { 47 | cout << st; 48 | } 49 | } 50 | 51 | int main(int argc, char * argv[]) 52 | { 53 | opts::cmd_parser.parse(argc, argv); 54 | logger::Logger::set_levels_from_options(opts::log_level); 55 | real_main(); 56 | } 57 | 
// Directory helpers built on the POSIX <dirent.h> interface.
// Ref (portability notes, including Windows):
//   http://stackoverflow.com/a/612176/717706
//
// Both functions are `inline` because they are defined in a header: without it,
// including the header from two translation units violates the one-definition
// rule and fails at link time.

/// Return true iff `file_name` can be opened as a directory.
inline bool is_directory(const std::string& file_name)
{
    DIR* dir = opendir(file_name.c_str());
    if (not dir) return false;
    closedir(dir);
    return true;
}

/**
 * List the entry names of a directory, in the order readdir() reports them
 * (whatever it returns, e.g. "." and "..", is included).
 * @return The entry names; empty if `file_name` cannot be opened as a directory.
 */
inline std::vector< std::string > list_directory(const std::string& file_name)
{
    std::vector< std::string > res;
    DIR* dir = opendir(file_name.c_str());
    if (not dir) return res;
    while (const struct dirent* ent = readdir(dir))
    {
        res.push_back(ent->d_name);
    }
    closedir(dir);
    return res;
}

//-----------------------------------------------
// Copyright 2013 Ontario Institute for Cancer Research
// Written by Matei David (mdavid@oicr.on.ca)
// Released under the GPL license
//-----------------------------------------------

/**
 * Support for the ASSERT/ASSERT_MSG macros below: holds the program name and a
 * per-thread context message, and prints a diagnostic before aborting.
 */
struct global_assert
{
    /// Program name printed at the start of every assertion message.
    static std::string& prog_name()
    {
        static std::string _prog_name;
        return _prog_name;
    }
    /// Per-thread context message; printed in brackets when non-empty.
    static std::string& global_msg()
    {
        static thread_local std::string _global_msg;
        return _global_msg;
    }

    /// Print "<prog>: <file>:<line>: <function> [<context>]: Assertion '<expr>' failed[: <msg>]"
    /// to stderr and abort. Never returns.
    [[noreturn]] static void assertion_failed(const std::string& expr, const std::string& msg,
                                              const std::string& function, const std::string& file, long line)
    {
        std::cerr << prog_name() << ": "
                  << file << ":" << line << ": "
                  << function;
        if (not global_msg().empty())
        {
            std::cerr << " [" << global_msg() << "]";
        }
        std::cerr << ": " << "Assertion '" << expr << "' failed";
        if (not msg.empty())
        {
            std::cerr << ": " << msg;
        }
        std::cerr << std::endl;
        abort();
    }
}; // struct global_assert

#undef ASSERT
#undef ASSERT_MSG

#if defined(NDEBUG)

// assertions compile to nothing; `expr` is not evaluated
#define ASSERT(expr) ((void)0)
#define ASSERT_MSG(expr, msg) ((void)0)

#else

#define ASSERT_MSG(expr, msg) ((expr)? ((void)0): global_assert::assertion_failed(#expr, msg, __func__, __FILE__, __LINE__))
#define ASSERT(expr) ASSERT_MSG(expr, "")

#endif
std::cerr << prog_name() << ": " 30 | << file << ":" << line << ": " 31 | << function; 32 | if (not global_msg().empty()) 33 | { 34 | std::cerr << " [" << global_msg() << "]"; 35 | } 36 | std::cerr << ": " << "Assertion '" << expr << "' failed"; 37 | if (not msg.empty()) 38 | { 39 | std::cerr << ": " << msg; 40 | } 41 | std::cerr << std::endl; 42 | abort(); 43 | } 44 | }; // struct global_assert 45 | 46 | #undef ASSERT 47 | #undef ASSERT_MSG 48 | 49 | #if defined(NDEBUG) 50 | 51 | #define ASSERT(expr) ((void)0) 52 | #define ASSERT_MSG(expr, msg) ((void)0) 53 | 54 | #else 55 | 56 | #define ASSERT_MSG(expr, msg) ((expr)? ((void)0): global_assert::assertion_failed(#expr, msg, __func__, __FILE__, __LINE__)) 57 | #define ASSERT(expr) ASSERT_MSG(expr, "") 58 | 59 | #endif 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /src/nanocall/list-directory.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "fs_support.hpp" 5 | 6 | using namespace std; 7 | 8 | int main(int argc, char* argv[]) 9 | { 10 | if (argc != 2) 11 | { 12 | cerr << "use: " << argv[0] << " " << endl; 13 | return EXIT_FAILURE; 14 | } 15 | string file_name = argv[1]; 16 | auto is_dir = is_directory(file_name); 17 | if (not is_dir) 18 | { 19 | cerr << "not a directory: " << file_name << endl; 20 | return EXIT_FAILURE; 21 | } 22 | auto l = list_directory(file_name); 23 | for (const auto& f : l) 24 | { 25 | cout << f << endl; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/nanocall/nanocall.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include "global_assert.hpp" 9 | #include "version.hpp" 10 | #include "Pore_Model.hpp" 11 | #include "Builtin_Model.hpp" 12 | #include "State_Transitions.hpp" 13 | #include 
/**
 * Processor time used by the program so far, in milliseconds, as reported by
 * clock().
 *
 * The scaling is done in `long long` so that `ticks * 1000` cannot overflow on
 * platforms where clock_t is a 32-bit type.
 */
long get_cpu_time_ms()
{
    const auto ticks = clock();
    return static_cast< long >((static_cast< long long >(ticks) * 1000) / CLOCKS_PER_SEC);
}
(default: info)", false, "string", cmd_parser); 59 | ValueArg< string > stats_fn("", "stats", "Stats.", false, "", "file", cmd_parser); 60 | ValueArg< string > train_drift("", "train-drift", "Train drift parameter. (default: yes for R73, no for R9)", false, "", "0|1", cmd_parser); 61 | ValueArg< unsigned > trim_ed_hp_end("", "trim-ed-hp-end", "Number of events to trim after hairpin end.", false, 50, "int", cmd_parser); 62 | ValueArg< unsigned > trim_ed_hp_start("", "trim-ed-hp-start", "Number of events to trim before hairpin start.", false, 50, "int", cmd_parser); 63 | ValueArg< unsigned > trim_ed_sq_end("", "trim-ed-sq-end", "Number of events to trim before sequence end.", false, 50, "int", cmd_parser); 64 | ValueArg< unsigned > trim_ed_sq_start("", "trim-ed-sq-start", "Number of events to trim after sequence start.", false, 50, "int", cmd_parser); 65 | ValueArg< unsigned > max_ed_events("", "max-ed-events", "Maximum EventDetection events.", false, 100000, "int", cmd_parser); 66 | ValueArg< unsigned > min_ed_events("", "min-ed-events", "Minimum EventDetection events.", false, 10, "int", cmd_parser); 67 | ValueArg< unsigned > fasta_line_width("", "fasta-line-width", "Maximum fasta line width.", false, 80, "int", cmd_parser); 68 | // 69 | ValueArg< float > scaling_select_threshold("", "scaling-select-threshold", "Select best model per strand during scaling if log score better by threshold.", false, 20.0, "float", cmd_parser); 70 | ValueArg< float > scaling_min_progress("", "scaling-min-progress", "Minimum scaling fit progress.", false, 1.0, "float", cmd_parser); 71 | ValueArg< unsigned > scaling_max_rounds("", "scaling-max-rounds", "Maximum scaling rounds.", false, 10, "int", cmd_parser); 72 | ValueArg< unsigned > scaling_num_events("", "scaling-num-events", "Number of events used for model scaling.", false, 200, "int", cmd_parser); 73 | // 74 | SwitchArg template_only("", "1d", "Interpret entire read as 1D template only.", cmd_parser); 75 | SwitchArg 
single_strand_scaling("", "single-strand-scaling", "Train scaling parameters per strand.", cmd_parser); 76 | SwitchArg double_strand_scaling("", "double-strand-scaling", "Train scaling parameters per read. (default)", cmd_parser); 77 | SwitchArg no_train_transitions("", "no-train-transitions", "Do not train state transitions.", cmd_parser); 78 | SwitchArg no_train_scaling("", "no-train-scaling", "Do not train pore model scaling.", cmd_parser); 79 | SwitchArg train("", "train", "Enable training. (default)", cmd_parser); 80 | SwitchArg no_train("", "no-train", "Disable all training.", cmd_parser); 81 | SwitchArg basecall("", "basecall", "Enable basecalling (default).", cmd_parser); 82 | SwitchArg no_basecall("", "no-basecall", "Disable basecalling.", cmd_parser); 83 | // 84 | ValueArg< float > pr_skip("", "pr-skip", "Transition probability of skipping at least 1 state.", false, .3, "float", cmd_parser); 85 | ValueArg< float > pr_stay("", "pr-stay", "Transition probability of staying in the same state.", false, .1, "float", cmd_parser); 86 | ValueArg< string > trans_fn("s", "trans", "Custom initial state transitions.", false, "", "file", cmd_parser); 87 | ValueArg< string > model_fofn("", "model-fofn", "File of pore models.", false, "", "file", cmd_parser); 88 | MultiArg< string > model_fn("m", "model", "Custom pore model for strand (0=template, 1=complement, 2=both).", false, "strand:file", cmd_parser); 89 | // 90 | ValueArg< string > pore("", "pore", "Pore name, used to select builtin pore model.", false, "r9", "r73|r9", cmd_parser); 91 | SwitchArg write_fast5("", "write-fast5", "Write basecalls to fast5 files.", cmd_parser); 92 | ValueArg< string > output_fn("o", "output", "Output.", false, "", "file", cmd_parser); 93 | ValueArg< unsigned > num_threads("t", "threads", "Number of parallel threads.", false, 1, "int", cmd_parser); 94 | UnlabeledMultiArg< string > input_fn("inputs", "Inputs: directories, fast5 files, or files of fast5 file names (use \"-\" to read fofn 
from stdin).", true, "path", cmd_parser); 95 | } // namespace opts 96 | 97 | void init_models(Pore_Model_Dict_Type& models) 98 | { 99 | auto parse_model_name = [] (const string& s) { 100 | if (s.size() < 3 101 | or (s[0] != '0' and s[0] != '1' and s[0] != '2') 102 | or s[1] != ':') 103 | { 104 | LOG(error) << "could not parse model name: \"" << s << "\"; format should be \"[0|1|2]:\"" << endl; 105 | exit(EXIT_FAILURE); 106 | } 107 | unsigned st = s[0] - '0'; 108 | return make_pair(st, s.substr(2)); 109 | }; 110 | 111 | map< unsigned, list< string > > model_list; 112 | if (not opts::model_fn.get().empty()) 113 | { 114 | for (const auto& s : opts::model_fn.get()) 115 | { 116 | auto p = parse_model_name(s); 117 | model_list[p.first].push_back(p.second); 118 | } 119 | } 120 | if (not opts::model_fofn.get().empty()) 121 | { 122 | zstr::ifstream ifs(opts::model_fofn); 123 | string s; 124 | while (getline(ifs, s)) 125 | { 126 | auto p = parse_model_name(s); 127 | model_list[p.first].push_back(p.second); 128 | } 129 | } 130 | if (model_list[2].empty() and (model_list[0].empty() != model_list[1].empty())) 131 | { 132 | LOG(error) << "models were specified only for strand " << (int)model_list[0].empty() 133 | << "! give models for both strands, or for neither." 
<< endl; 134 | exit(EXIT_FAILURE); 135 | } 136 | if (not (model_list[0].empty() and model_list[1].empty() and model_list[2].empty())) 137 | { 138 | for (unsigned st = 0; st < 3; ++st) 139 | { 140 | for (const auto& e : model_list[st]) 141 | { 142 | Pore_Model_Type pm; 143 | string pm_name = e; 144 | zstr::ifstream(e) >> pm; 145 | pm.strand() = st; 146 | models[pm_name] = move(pm); 147 | LOG(info) << "loaded module [" << pm_name 148 | << "] for strand [" << st 149 | << "] statistics [mean=" << pm.mean() 150 | << ", stdv=" << pm.stdv() << "]" << endl; 151 | } 152 | } 153 | } 154 | else 155 | { 156 | // use built-in models 157 | for (unsigned i = 0; i < Builtin_Model::num; ++i) 158 | { 159 | Pore_Model_Type pm; 160 | string pm_name = Builtin_Model::names[i]; 161 | if (pm_name.compare(0, opts::pore.get().size() + 1, opts::pore.get() + ".")) continue; 162 | pm.load_from_vector(Builtin_Model::init_lists[i]); 163 | pm.strand() = Builtin_Model::strands[i]; 164 | models[Builtin_Model::names[i]] = move(pm); 165 | LOG(info) 166 | << "loaded builtin module [" << Builtin_Model::names[i] 167 | << "] for strand [" << Builtin_Model::strands[i] 168 | << "] statistics [mean=" << pm.mean() 169 | << ", stdv=" << pm.stdv() << "]" << endl; 170 | } 171 | if (models.empty()) 172 | { 173 | LOG(error) 174 | << "no builtin models found for pore [" << opts::pore.get() << "]" << endl; 175 | exit(EXIT_FAILURE); 176 | } 177 | } 178 | } // init_models 179 | 180 | void init_transitions(State_Transitions_Type& transitions) 181 | { 182 | if (not opts::trans_fn.get().empty()) 183 | { 184 | zstr::ifstream(opts::trans_fn) >> transitions; 185 | LOG(info) << "loaded state transitions from [" << opts::trans_fn.get() << "]" << endl; 186 | } 187 | else 188 | { 189 | transitions.compute_transitions_fast(opts::pr_skip, opts::pr_stay); 190 | LOG(info) << "init_state_transitions pr_skip=[" << opts::pr_skip 191 | << "], pr_stay=[" << opts::pr_stay << "]" << endl; 192 | } 193 | } // init_transitions 194 | 195 | 
// Parse command line arguments. For each of them: 196 | // - if it is a directory, find all fast5 files in it, ignore non-fast5 files. 197 | // - if it is a file, check that it is indeed a fast5 file. 198 | void init_files(list< string >& files) 199 | { 200 | for (const auto& f : opts::input_fn) 201 | { 202 | if (is_directory(f)) 203 | { 204 | auto l = list_directory(f); 205 | for (const auto& g : l) 206 | { 207 | string f2 = f + (f[f.size() - 1] != '/'? "/" : "") + g; 208 | if (is_directory(f2)) 209 | { 210 | LOG(info) << "ignoring subdirectory [" << f2 << "]" << endl; 211 | } 212 | else if (fast5::File::is_valid_file(f2)) 213 | { 214 | files.push_back(f2); 215 | LOG(info) << "adding input file [" << f2 << "]" << endl; 216 | } 217 | else 218 | { 219 | LOG(info) << "ignoring file [" << f2 << "]" << endl; 220 | } 221 | } 222 | } 223 | else // not a directory 224 | { 225 | if (f != "-" and fast5::File::is_valid_file(f)) 226 | { 227 | files.push_back(f); 228 | LOG(info) << "adding input file [" << f << "]" << endl; 229 | } 230 | else // not fast5, interpret as fofn 231 | { 232 | LOG(info) << "interpreting [" << f << "] as fofn" << endl; 233 | istream* is_p = nullptr; 234 | strict_fstream::ifstream ifs; 235 | if (f == "-") 236 | { 237 | is_p = &cin; 238 | } 239 | else 240 | { 241 | ifs.open(f); 242 | is_p = &ifs; 243 | } 244 | string g; 245 | while (getline(*is_p, g)) 246 | { 247 | if (fast5::File::is_valid_file(g)) 248 | { 249 | files.push_back(g); 250 | LOG(info) << "adding input file [" << g << "]" << endl; 251 | } 252 | } 253 | } 254 | } 255 | } 256 | if (files.empty()) 257 | { 258 | LOG(error) << "no fast5 files to process" << endl; 259 | exit(EXIT_FAILURE); 260 | } 261 | } // init_files 262 | 263 | void init_reads(const Pore_Model_Dict_Type& models, 264 | const list< string >& files, 265 | deque< Fast5_Summary_Type >& reads) 266 | { 267 | for (const auto& f : files) 268 | { 269 | Fast5_Summary_Type s(f, models, opts::double_strand_scaling); 270 | LOG(info) << 
"summary: " << s << endl; 271 | reads.emplace_back(move(s)); 272 | } 273 | } // init_reads 274 | 275 | void train_reads(const Pore_Model_Dict_Type& models, 276 | const State_Transitions_Type& default_transitions, 277 | deque< Fast5_Summary_Type >& reads) 278 | { 279 | auto time_start_ms = get_cpu_time_ms(); 280 | Parameter_Trainer_Type::init(); 281 | unsigned crt_idx = 0; 282 | pfor::pfor< unsigned >( 283 | opts::num_threads, 284 | opts::chunk_size, 285 | // get_item 286 | [&] (unsigned& i) { 287 | if (crt_idx >= reads.size()) return false; 288 | i = crt_idx++; 289 | return true; 290 | }, 291 | // process item 292 | [&] (unsigned& i) { 293 | Fast5_Summary_Type& read_summary = reads[i]; 294 | if (read_summary.num_ed_events == 0) return; 295 | global_assert::global_msg() = read_summary.read_id; 296 | read_summary.load_events(); 297 | // 298 | // create per-strand list of models to try 299 | // 300 | array< list< string >, 2 > model_list; 301 | for (unsigned st = 0; st < 2; ++st) 302 | { 303 | // if not enough events, ignore strand 304 | if (read_summary.events(st).size() < opts::min_ed_events) continue; 305 | // create list of models to try 306 | if (not read_summary.preferred_model[st][st].empty()) 307 | { 308 | // if we have a preferred model, use that 309 | model_list[st].push_back(read_summary.preferred_model[st][st]); 310 | } 311 | else 312 | { 313 | // no preferred model, try all that apply to this strand 314 | for (const auto& p : models) 315 | { 316 | if (p.second.strand() == st or p.second.strand() == 2) 317 | { 318 | model_list[st].push_back(p.first); 319 | } 320 | } 321 | } 322 | ASSERT(not model_list.empty()); 323 | } 324 | // 325 | // create per-strand list of event sequences on which to train 326 | // 327 | array< vector< Event_Sequence_Type >, 2 > train_event_seqs; 328 | for (unsigned st = 0; st < 2; ++st) 329 | { 330 | // if not enough events, ignore strand 331 | if (read_summary.events(st).size() < opts::min_ed_events) continue; 332 | // create 2 
event sequences on which to train 333 | unsigned num_train_events = min((size_t)opts::scaling_num_events.get(), read_summary.events(st).size()); 334 | train_event_seqs[st].emplace_back( 335 | read_summary.events(st).begin(), read_summary.events(st).begin() + num_train_events / 2); 336 | train_event_seqs[st].emplace_back( 337 | read_summary.events(st).end() - num_train_events / 2, read_summary.events(st).end()); 338 | } 339 | // 340 | // branch on whether pore models should be scaled together 341 | // 342 | if (read_summary.scale_strands_together) 343 | { 344 | // prepare vector of event sequences 345 | vector< pair< const Event_Sequence_Type*, unsigned > > train_event_seq_ptrs; 346 | for (unsigned st = 0; st < 2; ++st) 347 | { 348 | for (const auto& events : train_event_seqs[st]) 349 | { 350 | train_event_seq_ptrs.push_back(make_pair(&events, st)); 351 | } 352 | } 353 | // track model fit 354 | // key = pore model name; value = fit 355 | map< array< string, 2 >, FLOAT_TYPE > model_fit; 356 | for (const auto& m_name_0 : model_list[0]) 357 | { 358 | for (const auto& m_name_1 : model_list[1]) 359 | { 360 | array< string, 2 > m_name_key = {{ m_name_0, m_name_1 }}; 361 | string m_name = m_name_0 + "+" + m_name_1; 362 | unsigned round = 0; 363 | auto& crt_pm_params = read_summary.pm_params_m.at(m_name_key); 364 | auto& crt_st_params = read_summary.st_params_m.at(m_name_key); 365 | auto& crt_fit = model_fit[m_name_key]; 366 | crt_fit = -INFINITY; 367 | while (true) 368 | { 369 | Pore_Model_Parameters_Type old_pm_params(crt_pm_params); 370 | std::array< State_Transition_Parameters_Type, 2 > old_st_params(crt_st_params); 371 | auto old_fit = crt_fit; 372 | bool done; 373 | 374 | Parameter_Trainer_Type::train_one_round( 375 | train_event_seq_ptrs, 376 | {{ &models.at(m_name_0), &models.at(m_name_1) }}, 377 | default_transitions, 378 | old_pm_params, old_st_params, 379 | crt_pm_params, crt_st_params, crt_fit, done, 380 | not opts::no_train_scaling, not 
opts::no_train_transitions); 381 | 382 | LOG(debug) 383 | << "scaling_round read [" << read_summary.read_id 384 | << "] strand [" << 2 385 | << "] model [" << m_name 386 | << "] old_pm_params [" << old_pm_params 387 | << "] old_st_params [" << old_st_params[0] << "," << old_st_params[1] 388 | << "] old_fit [" << old_fit 389 | << "] crt_pm_params [" << crt_pm_params 390 | << "] crt_st_params [" << crt_st_params[0] << "," << crt_st_params[1] 391 | << "] crt_fit [" << crt_fit 392 | << "] round [" << round << "]" << endl; 393 | 394 | if (done) 395 | { 396 | // singularity detected; stop 397 | break; 398 | } 399 | 400 | if (crt_fit < old_fit) 401 | { 402 | LOG(info) << "scaling_regression read [" << read_summary.read_id 403 | << "] strand [" << 2 404 | << "] model [" << m_name 405 | << "] old_params [" << old_pm_params 406 | << "] old_st_params [" << old_st_params[0] << "," << old_st_params[1] 407 | << "] old_fit [" << old_fit 408 | << "] crt_pm_params [" << crt_pm_params 409 | << "] crt_st_params [" << crt_st_params[0] << "," << crt_st_params[1] 410 | << "] crt_fit [" << crt_fit 411 | << "] round [" << round << "]" << endl; 412 | crt_pm_params = old_pm_params; 413 | crt_st_params = old_st_params; 414 | crt_fit = old_fit; 415 | break; 416 | } 417 | 418 | ++round; 419 | // stop condition 420 | if (round >= 2u * opts::scaling_max_rounds 421 | or (round > 1 and crt_fit < old_fit + opts::scaling_min_progress)) 422 | { 423 | break; 424 | } 425 | 426 | }; // while true 427 | LOG(info) 428 | << "scaling_result read [" << read_summary.read_id 429 | << "] strand [" << 2 430 | << "] model [" << m_name 431 | << "] pm_params [" << crt_pm_params 432 | << "] st_params [" << crt_st_params[0] << "," << crt_st_params[1] 433 | << "] fit [" << crt_fit 434 | << "] rounds [" << round << "]" << endl; 435 | } // for m_name[1] 436 | } // for m_name[0] 437 | if (opts::scaling_select_threshold.get() < INFINITY) 438 | { 439 | auto it_max = alg::max_of( 440 | model_fit, 441 | [] (const 
decltype(model_fit)::value_type& p) { return p.second; }); 442 | // check maximum is unique 443 | if (alg::all_of( 444 | model_fit, 445 | [&] (const decltype(model_fit)::value_type& p) { 446 | return &p == &*it_max 447 | or p.second + opts::scaling_select_threshold.get() < it_max->second; 448 | })) 449 | { 450 | const auto& m_name_0 = it_max->first[0]; 451 | const auto& m_name_1 = it_max->first[1]; 452 | auto m_name = m_name_0 + '+' + m_name_1; 453 | read_summary.preferred_model[2][0] = m_name_0; 454 | read_summary.preferred_model[2][1] = m_name_1; 455 | LOG(info) 456 | << "selected_model read [" << read_summary.read_id 457 | << "] strand [2] model [" << m_name << "]" << endl; 458 | } 459 | } 460 | } 461 | else // not scale_strands_together 462 | { 463 | for (unsigned st = 0; st < 2; ++st) 464 | { 465 | // if not enough events, ignore strand 466 | if (read_summary.events(st).size() < opts::min_ed_events) continue; 467 | // prepare vector of event sequences 468 | vector< pair< const Event_Sequence_Type*, unsigned > > train_event_seq_ptrs; 469 | for (const auto& events : train_event_seqs[st]) 470 | { 471 | train_event_seq_ptrs.push_back(make_pair(&events, st)); 472 | } 473 | map< string, FLOAT_TYPE > model_fit; 474 | for (const auto& m_name : model_list[st]) 475 | { 476 | array< string, 2 > m_name_key; 477 | m_name_key[st] = m_name; 478 | unsigned round = 0; 479 | auto& crt_pm_params = read_summary.pm_params_m.at(m_name_key); 480 | auto& crt_st_params = read_summary.st_params_m.at(m_name_key); 481 | auto& crt_fit = model_fit[m_name]; 482 | crt_fit = -INFINITY; 483 | while (true) 484 | { 485 | Pore_Model_Parameters_Type old_pm_params(crt_pm_params); 486 | array< State_Transition_Parameters_Type, 2 > old_st_params(crt_st_params); 487 | auto old_fit = crt_fit; 488 | bool done; 489 | 490 | Parameter_Trainer_Type::train_one_round( 491 | train_event_seq_ptrs, 492 | {{ &models.at(m_name), &models.at(m_name) }}, 493 | default_transitions, 494 | old_pm_params, old_st_params, 
495 | crt_pm_params, crt_st_params, crt_fit, done, 496 | not opts::no_train_scaling, not opts::no_train_transitions); 497 | 498 | LOG(debug) 499 | << "scaling_round read [" << read_summary.read_id 500 | << "] strand [" << st 501 | << "] model [" << m_name 502 | << "] old_pm_params [" << old_pm_params 503 | << "] old_st_params [" << old_st_params[st] 504 | << "] old_fit [" << old_fit 505 | << "] crt_pm_params [" << crt_pm_params 506 | << "] crt_st_params [" << crt_st_params[st] 507 | << "] crt_fit [" << crt_fit 508 | << "] round [" << round << "]" << endl; 509 | 510 | if (done) 511 | { 512 | // singularity detected; stop 513 | break; 514 | } 515 | 516 | if (crt_fit < old_fit) 517 | { 518 | LOG(info) << "scaling_regression read [" << read_summary.read_id 519 | << "] strand [" << st 520 | << "] model [" << m_name 521 | << "] old_pm_params [" << old_pm_params 522 | << "] old_st_params [" << old_st_params[st] 523 | << "] old_fit [" << old_fit 524 | << "] crt_pm_params [" << crt_pm_params 525 | << "] crt_st_params [" << crt_st_params[st] 526 | << "] crt_fit [" << crt_fit 527 | << "] round [" << round << "]" << endl; 528 | crt_pm_params = old_pm_params; 529 | crt_st_params = old_st_params; 530 | crt_fit = old_fit; 531 | break; 532 | } 533 | 534 | ++round; 535 | // stop condition 536 | if (round >= opts::scaling_max_rounds 537 | or (round > 1 and crt_fit < old_fit + opts::scaling_min_progress)) 538 | { 539 | break; 540 | } 541 | 542 | }; // while true 543 | LOG(info) 544 | << "scaling_result read [" << read_summary.read_id 545 | << "] strand [" << st 546 | << "] model [" << m_name 547 | << "] pm_params [" << crt_pm_params 548 | << "] st_params [" << crt_st_params[st] 549 | << "] fit [" << crt_fit 550 | << "] rounds [" << round << "]" << endl; 551 | } // for m_name 552 | if (opts::scaling_select_threshold.get() < INFINITY) 553 | { 554 | auto it_max = alg::max_of( 555 | model_fit, 556 | [] (const decltype(model_fit)::value_type& p) { return p.second; }); 557 | if 
(alg::all_of( 558 | model_fit, 559 | [&] (const decltype(model_fit)::value_type& p) { 560 | return &p == &*it_max 561 | or p.second + opts::scaling_select_threshold.get() < it_max->second; 562 | })) 563 | { 564 | read_summary.preferred_model[st][st] = it_max->first; 565 | LOG(info) 566 | << "selected_model read [" << read_summary.read_id 567 | << "] strand [" << st 568 | << "] model [" << it_max->first << "]" << endl; 569 | } 570 | } 571 | } // for st 572 | } // if not scale_strands_together 573 | read_summary.drop_events(); 574 | }, // process_item 575 | // progress_report 576 | [&] (unsigned items, unsigned seconds) { 577 | clog << "Processed " << setw(6) << right << items << " reads in " 578 | << setw(6) << right << seconds << " seconds\r"; 579 | }); // pfor 580 | auto time_end_ms = get_cpu_time_ms(); 581 | LOG(info) << "training user_cpu_secs=" << (time_end_ms - time_start_ms)/1000 << endl; 582 | } // train_reads 583 | 584 | void write_fasta(ostream& os, const string& name, const string& seq) 585 | { 586 | os << ">" << name << endl; 587 | for (unsigned pos = 0; pos < seq.size(); pos += opts::fasta_line_width) 588 | { 589 | os << seq.substr(pos, opts::fasta_line_width) << endl; 590 | } 591 | } // write_fasta 592 | 593 | void basecall_reads(const Pore_Model_Dict_Type& models, 594 | const State_Transitions_Type& default_transitions, 595 | deque< Fast5_Summary_Type >& reads) 596 | { 597 | auto time_start_ms = get_cpu_time_ms(); 598 | strict_fstream::ofstream ofs; 599 | ostream* os_p = nullptr; 600 | if (not opts::output_fn.get().empty()) 601 | { 602 | ofs.open(opts::output_fn); 603 | os_p = &ofs; 604 | } 605 | else 606 | { 607 | os_p = &cout; 608 | } 609 | 610 | unsigned crt_idx = 0; 611 | pfor::pfor< unsigned, ostringstream >( 612 | opts::num_threads, 613 | opts::chunk_size, 614 | // get_item 615 | [&] (unsigned& i) { 616 | if (crt_idx >= reads.size()) return false; 617 | i = crt_idx++; 618 | return true; 619 | }, 620 | // process_item 621 | [&] (unsigned& i, 
ostringstream& oss) { 622 | Fast5_Summary_Type& read_summary = reads[i]; 623 | if (read_summary.num_ed_events == 0) return; 624 | global_assert::global_msg() = read_summary.read_id; 625 | read_summary.load_events(); 626 | 627 | // compute read statistics used to check scaling 628 | array< pair< FLOAT_TYPE, FLOAT_TYPE >, 2 > r_stats; 629 | for (unsigned st = 0; st < 2; ++st) 630 | { 631 | // if not enough events, ignore strand 632 | if (read_summary.events(st).size() < opts::min_ed_events) continue; 633 | r_stats[st] = alg::mean_stdv_of< FLOAT_TYPE >( 634 | read_summary.events(st), 635 | [] (const Event_Type& ev) { return ev.mean; }); 636 | LOG(debug) 637 | << "mean_stdv read [" << read_summary.read_id 638 | << "] strand [" << st 639 | << "] ev_mean=[" << r_stats[st].first 640 | << "] ev_stdv=[" << r_stats[st].second << "]" << endl; 641 | } 642 | 643 | // basecalling functor 644 | // returns: (path_prob, base_seq) 645 | auto basecall_strand = [&] (unsigned st, string m_name, 646 | const Pore_Model_Parameters_Type& pm_params, 647 | const State_Transition_Parameters_Type& st_params) { 648 | // scale model 649 | Pore_Model_Type pm(models.at(m_name)); 650 | pm.scale(pm_params); 651 | State_Transitions_Type custom_transitions; 652 | const State_Transitions_Type* transitions_ptr; 653 | if (not st_params.is_default()) 654 | { 655 | custom_transitions.compute_transitions_fast(st_params); 656 | transitions_ptr = &custom_transitions; 657 | } 658 | else 659 | { 660 | transitions_ptr = &default_transitions; 661 | } 662 | LOG(info) 663 | << "basecalling read [" << read_summary.read_id 664 | << "] strand [" << st 665 | << "] model [" << m_name 666 | << "] pm_params [" << pm_params 667 | << "] st_params [" << st_params << "]" << endl; 668 | LOG(debug) 669 | << "mean_stdv read [" << read_summary.read_id 670 | << "] strand [" << st 671 | << "] model_mean [" << pm.mean() 672 | << "] model_stdv [" << pm.stdv() << "]" << endl; 673 | if (abs(r_stats[st].first - pm.mean()) > 5.0) 674 | { 
675 | LOG(warning) 676 | << "means_apart read [" << read_summary.read_id 677 | << "] strand [" << st 678 | << "] model [" << m_name 679 | << "] parameters [" << pm_params 680 | << "] model_mean=[" << pm.mean() 681 | << "] events_mean=[" << r_stats[st].first 682 | << "]" << endl; 683 | } 684 | // correct drift 685 | Event_Sequence_Type corrected_events = read_summary.events(st); 686 | corrected_events.apply_drift_correction(pm_params.drift); 687 | Viterbi_Type vit; 688 | vit.fill(pm, *transitions_ptr, corrected_events); 689 | return std::make_tuple(vit.path_probability(), std::move(corrected_events)); 690 | }; 691 | 692 | if (read_summary.scale_strands_together) 693 | { 694 | // create list of models to try 695 | list< array< string, 2 > > model_sublist; 696 | if (not read_summary.preferred_model[2][0].empty()) 697 | { 698 | // if we have a preferred model, use that 699 | model_sublist.push_back(read_summary.preferred_model[2]); 700 | } 701 | else 702 | { 703 | // no preferred model, try all for which we have scaling parameters 704 | for (const auto& p : read_summary.pm_params_m) 705 | { 706 | if (p.first[0].empty() or p.first[1].empty()) continue; 707 | model_sublist.push_back(p.first); 708 | } 709 | } 710 | // basecall using applicable models 711 | deque< tuple< FLOAT_TYPE, 712 | FLOAT_TYPE, FLOAT_TYPE, 713 | string, string, 714 | Event_Sequence_Type, Event_Sequence_Type > > results; 715 | for (const auto& m_name : model_sublist) 716 | { 717 | array< tuple< FLOAT_TYPE, Event_Sequence_Type >, 2 > part_results; 718 | for (unsigned st = 0; st < 2; ++st) 719 | { 720 | part_results[st] = basecall_strand( 721 | st, m_name[st], 722 | read_summary.pm_params_m.at(m_name), 723 | read_summary.st_params_m.at(m_name)[st]); 724 | } 725 | results.emplace_back(get<0>(part_results[0]) + get<0>(part_results[1]), 726 | get<0>(part_results[0]), 727 | get<0>(part_results[1]), 728 | string(m_name[0]), 729 | string(m_name[1]), 730 | std::move(get<1>(part_results[0])), 731 | 
std::move(get<1>(part_results[1]))); 732 | } 733 | // sort results by first component (log path probability) 734 | sort(results.begin(), 735 | results.end(), 736 | [] (const decltype(results)::value_type& lhs, const decltype(results)::value_type& rhs) { 737 | return get<0>(lhs) < get<0>(rhs); 738 | }); 739 | array< FLOAT_TYPE, 2 > best_log_path_prob{{ get<1>(results.back()), get<2>(results.back()) }}; 740 | array< string, 2 > best_m_name{{ get<3>(results.back()), get<4>(results.back()) }}; 741 | array< const Event_Sequence_Type*, 2 > event_seq_ptr = { 742 | &get<5>(results.back()), 743 | &get<6>(results.back()) 744 | }; 745 | array< string, 2 > base_seq = { 746 | get<5>(results.back()).get_base_seq(), 747 | get<6>(results.back()).get_base_seq() 748 | }; 749 | string best_m_name_str = best_m_name[0] + '+' + best_m_name[1]; 750 | auto& best_pm_params = read_summary.pm_params_m.at(best_m_name); 751 | auto& best_st_params = read_summary.st_params_m.at(best_m_name); 752 | for (unsigned st = 0; st < 2; ++st) 753 | { 754 | LOG(info) 755 | << "best_model read [" << read_summary.read_id 756 | << "] strand [" << st 757 | << "] model [" << best_m_name[st] 758 | << "] pm_params [" << best_pm_params 759 | << "] st_params [" << best_st_params[st] 760 | << "] log_path_prob [" << best_log_path_prob[st] << "]" << endl; 761 | read_summary.preferred_model[st][st] = best_m_name[st]; 762 | read_summary.pm_params_m[read_summary.preferred_model[st]] = best_pm_params; 763 | read_summary.st_params_m[read_summary.preferred_model[st]][st] = best_st_params[st]; 764 | string seq_name; 765 | { 766 | ostringstream tmp; 767 | tmp << read_summary.read_id << ":" << read_summary.base_file_name << ":" << st; 768 | seq_name = tmp.str(); 769 | } 770 | if (opts::write_fast5) 771 | { 772 | read_summary.add_basecall_seq(seq_name, st, base_seq[st]); 773 | read_summary.add_basecall_events(st, *event_seq_ptr[st]); 774 | read_summary.add_basecall_model(st, models.at(best_m_name[st])); 775 | 
read_summary.add_basecall_model_params(st, best_pm_params); 776 | } 777 | else 778 | { 779 | write_fasta(oss, seq_name, base_seq[st]); 780 | } 781 | } 782 | } 783 | else // not scale_strands_together 784 | { 785 | for (unsigned st = 0; st < 2; ++st) 786 | { 787 | // if not enough events, ignore strand 788 | if (read_summary.events(st).size() < opts::min_ed_events) continue; 789 | // create list of models to try 790 | list< array< string, 2 > > model_sublist; 791 | if (not read_summary.preferred_model[st][st].empty()) 792 | { 793 | // if we have a preferred model, use that 794 | model_sublist.push_back(read_summary.preferred_model[st]); 795 | } 796 | else 797 | { 798 | // no preferred model, try all for which we have scaling 799 | for (const auto& p : read_summary.pm_params_m) 800 | { 801 | if (not p.first[st].empty() and p.first[1 - st].empty()) 802 | { 803 | model_sublist.push_back(p.first); 804 | } 805 | } 806 | } 807 | // deque of results 808 | deque< tuple< FLOAT_TYPE, string, Event_Sequence_Type > > results; 809 | for (const auto& m_name : model_sublist) 810 | { 811 | auto r = basecall_strand( 812 | st, m_name[st], 813 | read_summary.pm_params_m.at(m_name), 814 | read_summary.st_params_m.at(m_name)[st]); 815 | results.emplace_back(get<0>(r), 816 | string(m_name[st]), 817 | std::move(get<1>(r))); 818 | } 819 | sort(results.begin(), 820 | results.end(), 821 | [] (const decltype(results)::value_type& lhs, const decltype(results)::value_type& rhs) { 822 | return get<0>(lhs) < get<0>(rhs); 823 | }); 824 | const string& best_m_name = get<1>(results.back()); 825 | const Event_Sequence_Type& event_seq = get<2>(results.back()); 826 | string base_seq = event_seq.get_base_seq(); 827 | array< string, 2 > best_m_key; 828 | best_m_key[st] = best_m_name; 829 | LOG(info) 830 | << "best_model read [" << read_summary.read_id 831 | << "] strand [" << st 832 | << "] model [" << best_m_name 833 | << "] pm_params [" << read_summary.pm_params_m.at(best_m_key) 834 | << "] st_params 
[" << read_summary.st_params_m.at(best_m_key)[st] 835 | << "] log_path_prob [" << get<0>(results.back()) << "]" << endl; 836 | read_summary.preferred_model[st][st] = best_m_name; 837 | string seq_name; 838 | { 839 | ostringstream tmp; 840 | tmp << read_summary.read_id << ":" << read_summary.base_file_name << ":" << st; 841 | seq_name = tmp.str(); 842 | } 843 | if (opts::write_fast5) 844 | { 845 | read_summary.add_basecall_seq(seq_name, st, base_seq); 846 | read_summary.add_basecall_events(st, event_seq); 847 | read_summary.add_basecall_model(st, models.at(best_m_name)); 848 | read_summary.add_basecall_model_params(st, read_summary.pm_params_m.at(best_m_key)); 849 | } 850 | else 851 | { 852 | write_fasta(oss, seq_name, base_seq); 853 | } 854 | } // for st 855 | } 856 | read_summary.drop_events(); 857 | }, 858 | // output_chunk 859 | [&] (ostringstream& oss) { 860 | *os_p << oss.str(); 861 | }, 862 | // progress_report 863 | [&] (unsigned items, unsigned seconds) { 864 | clog << "Processed " << setw(6) << right << items << " reads in " 865 | << setw(6) << right << seconds << " seconds\r"; 866 | }); // pfor 867 | auto time_end_ms = get_cpu_time_ms(); 868 | LOG(info) << "basecalling user_cpu_secs=" << (time_end_ms - time_start_ms)/1000 << endl; 869 | } // basecall_reads 870 | 871 | int real_main() 872 | { 873 | Pore_Model_Dict_Type models; 874 | State_Transitions_Type default_transitions; 875 | deque< Fast5_Summary_Type > reads; 876 | list< string > files; 877 | // initialize structs 878 | init_models(models); 879 | init_transitions(default_transitions); 880 | init_files(files); 881 | init_reads(models, files, reads); 882 | if (opts::train) 883 | { 884 | // do some training 885 | train_reads(models, default_transitions, reads); 886 | } 887 | if (opts::basecall) 888 | { 889 | // basecall reads 890 | basecall_reads(models, default_transitions, reads); 891 | } 892 | // print stats 893 | if (not opts::stats_fn.get().empty()) 894 | { 895 | strict_fstream::ofstream 
ofs(opts::stats_fn); 896 | Fast5_Summary_Type::write_tsv_header(ofs); 897 | ofs << endl; 898 | for (const auto& s : reads) 899 | { 900 | s.write_tsv(ofs); 901 | ofs << endl; 902 | } 903 | } 904 | assert(fast5::File::get_object_count() == 0); 905 | return EXIT_SUCCESS; 906 | } 907 | 908 | int main(int argc, char * argv[]) 909 | { 910 | opts::cmd_parser.parse(argc, argv); 911 | logger::Logger::set_default_level(logger::level::info); 912 | logger::Logger::set_levels_from_options(opts::log_level); 913 | LOG(info) << "program: " << opts::cmd_parser.getProgramName() << endl; 914 | LOG(info) << "version: " << opts::cmd_parser.getVersion() << endl; 915 | LOG(info) << "args: " << opts::cmd_parser.getOrigArgv() << endl; 916 | LOG(info) << "num_threads=" << opts::num_threads.get() << endl; 917 | #ifndef H5_HAVE_THREADSAFE 918 | if (opts::num_threads > 1) 919 | { 920 | LOG(warning) << "enabled multi-threading with non-threadsafe HDF5: using experimental locking" << endl; 921 | } 922 | #endif 923 | State_Transition_Parameters_Type::default_p_stay() = opts::pr_stay; 924 | State_Transition_Parameters_Type::default_p_skip() = opts::pr_skip; 925 | Fast5_Summary_Type::min_ed_events() = opts::min_ed_events; 926 | Fast5_Summary_Type::max_ed_events() = opts::max_ed_events; 927 | Fast5_Summary_Type::eventdetection_group() = opts::ed_group; 928 | Fast5_Summary_Type::template_only() = opts::template_only; 929 | Fast5_Summary_Type::trim_margins() = {{ opts::trim_ed_sq_start, opts::trim_ed_sq_end, opts::trim_ed_hp_start, opts::trim_ed_hp_end }}; 930 | LOG (info) << "eventdetection_group=" << (Fast5_Summary_Type::eventdetection_group().empty() 931 | ? 
string("smallest") 932 | : Fast5_Summary_Type::eventdetection_group()) << endl; 933 | // 934 | // set pore-related options 935 | // 936 | if (not opts::train_drift.get().empty() 937 | and opts::train_drift.get() != "0" 938 | and opts::train_drift.get() != "1") 939 | { 940 | LOG(error) << "train-drift not understdood: " << opts::train_drift.get() << endl; 941 | return EXIT_FAILURE; 942 | } 943 | if (opts::pore.get() == "r9") 944 | { 945 | Fast5_Summary_Type::abasic_level_top_percent() = 1.0; 946 | Fast5_Summary_Type::abasic_level_top_offset() = 0.0; 947 | Fast5_Summary_Type::hairpin_island_window_size() = 10; 948 | Fast5_Summary_Type::hairpin_island_window_load() = 5; 949 | if (opts::train_drift.get().empty()) 950 | { 951 | opts::train_drift.get() = "0"; 952 | } 953 | } 954 | else if (opts::pore.get() == "r73") 955 | { 956 | Fast5_Summary_Type::abasic_level_top_percent() = 1.0; 957 | Fast5_Summary_Type::abasic_level_top_offset() = 5.0; 958 | Fast5_Summary_Type::hairpin_island_window_size() = 5; 959 | Fast5_Summary_Type::hairpin_island_window_load() = 5; 960 | if (opts::train_drift.get().empty()) 961 | { 962 | opts::train_drift.get() = "1"; 963 | } 964 | } 965 | else 966 | { 967 | LOG(error) << "unknown pore type: " << opts::pore.get() << endl; 968 | return EXIT_FAILURE; 969 | } 970 | Parameter_Trainer_Type::pm_train_drift() = opts::train_drift.get() == "1"; 971 | LOG(info) 972 | << "ed_event_trimming: " 973 | << " sq_start=" << Fast5_Summary_Type::trim_margins()[0] 974 | << " sq_end=" << Fast5_Summary_Type::trim_margins()[1] 975 | << " hp_start=" << Fast5_Summary_Type::trim_margins()[2] 976 | << " hp_end=" << Fast5_Summary_Type::trim_margins()[3] << endl; 977 | if (not opts::template_only.get()) 978 | { 979 | LOG(info) 980 | << "hairpin_detection:" 981 | << " abasic_level_top_percent=" << Fast5_Summary_Type::abasic_level_top_percent() 982 | << " abasic_level_top_offset=" << Fast5_Summary_Type::abasic_level_top_offset() 983 | << " hairpin_island_window_size=" << 
Fast5_Summary_Type::hairpin_island_window_size() 984 | << " hairpin_island_window_load=" << Fast5_Summary_Type::hairpin_island_window_load() 985 | << endl; 986 | } 987 | else 988 | { 989 | LOG(info) 990 | << "hairpin_detection: disabled" << endl; 991 | } 992 | // 993 | // set training option 994 | // 995 | if (opts::train and opts::no_train) 996 | { 997 | LOG(error) 998 | << "either --train or --no-train may be used, but not both" << endl; 999 | return EXIT_FAILURE; 1000 | } 1001 | else if (not opts::train and not opts::no_train) 1002 | { 1003 | // by default, enable training 1004 | opts::train.set(true); 1005 | } 1006 | ASSERT(opts::train != opts::no_train); 1007 | // 1008 | // set basecalling option 1009 | // 1010 | if (opts::basecall and opts::no_basecall) 1011 | { 1012 | LOG(error) 1013 | << "either --basecall or --no-basecall may be used, but not both" << endl; 1014 | return EXIT_FAILURE; 1015 | } 1016 | else if (not opts::basecall and not opts::no_basecall) 1017 | { 1018 | // by default, enable basecalling 1019 | opts::basecall.set(true); 1020 | } 1021 | ASSERT(opts::basecall != opts::no_basecall); 1022 | // 1023 | // set single/double strand scaling option 1024 | // 1025 | if (opts::train and not opts::no_train_scaling) 1026 | { 1027 | if (opts::single_strand_scaling and opts::double_strand_scaling) 1028 | { 1029 | LOG(error) 1030 | << "either --single-strand-scaling or --double-strand-scaling may be used, but not both" << endl; 1031 | return EXIT_FAILURE; 1032 | } 1033 | else if (not opts::single_strand_scaling and not opts::double_strand_scaling) 1034 | { 1035 | // by default, do double strand scaling 1036 | opts::double_strand_scaling.set(true); 1037 | } 1038 | } 1039 | // 1040 | // check other options 1041 | // 1042 | if (opts::scaling_select_threshold.get() < 0.0) 1043 | { 1044 | LOG(error) 1045 | << "invalid scaling_select_threshold: " << opts::scaling_select_threshold.get() << endl; 1046 | return EXIT_FAILURE; 1047 | } 1048 | if 
(opts::scaling_min_progress < 0.0) 1049 | { 1050 | LOG(error) 1051 | << "invalid scaling_min_progress: " << opts::scaling_min_progress.get() << endl; 1052 | return EXIT_FAILURE; 1053 | } 1054 | if (not opts::output_fn.get().empty() and opts::write_fast5) 1055 | { 1056 | LOG(error) 1057 | << "output may be written to fast5 files or to a single output file, but not both" << endl; 1058 | return EXIT_FAILURE; 1059 | } 1060 | // 1061 | // print training options 1062 | // 1063 | LOG(info) << "train=" << opts::train.get() << endl; 1064 | if (opts::train) 1065 | { 1066 | LOG(info) << "train_scaling=" << not opts::no_train_scaling.get() << endl; 1067 | LOG(info) << "train_transitions=" << not opts::no_train_transitions.get() << endl; 1068 | if (not opts::no_train_scaling) 1069 | { 1070 | LOG(info) << "double_strands_scaling=" << opts::double_strand_scaling.get() << endl; 1071 | LOG(info) << "scaling_num_events=" << opts::scaling_num_events.get() << endl; 1072 | LOG(info) << "scaling_max_rounds=" << opts::scaling_max_rounds.get() << endl; 1073 | LOG(info) << "scaling_min_progress=" << opts::scaling_min_progress.get() << endl; 1074 | LOG(info) << "scaling_select_threshold=" << opts::scaling_select_threshold.get() << endl; 1075 | LOG(info) << "train_drift=" << opts::train_drift.get() << endl; 1076 | } 1077 | } 1078 | LOG(info) << "basecall=" << opts::basecall.get() << endl; 1079 | return real_main(); 1080 | } 1081 | -------------------------------------------------------------------------------- /src/nanocall/run-fwbw.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Pore_Model.hpp" 6 | #include "State_Transitions.hpp" 7 | #include "Event.hpp" 8 | #include "Forward_Backward.hpp" 9 | #include "Forward_Backward_Custom.hpp" 10 | #include "logger.hpp" 11 | #include "zstr.hpp" 12 | 13 | using namespace std; 14 | 15 | #ifndef FLOAT_TYPE 16 | #define FLOAT_TYPE float 17 | #endif 18 | #ifndef 
KMER_SIZE 19 | #define KMER_SIZE 6 20 | #endif 21 | typedef State_Transitions< FLOAT_TYPE, KMER_SIZE > State_Transitions_Type; 22 | typedef Pore_Model< FLOAT_TYPE, KMER_SIZE > Pore_Model_Type; 23 | typedef Event< FLOAT_TYPE, KMER_SIZE> Event_Type; 24 | typedef Event_Sequence< FLOAT_TYPE, KMER_SIZE > Event_Sequence_Type; 25 | typedef Forward_Backward< FLOAT_TYPE, KMER_SIZE > Forward_Backward_Type; 26 | typedef Forward_Backward_Custom< FLOAT_TYPE, KMER_SIZE > Forward_Backward_Custom_Type; 27 | 28 | namespace opts 29 | { 30 | using namespace TCLAP; 31 | string description = 32 | "Given a scaled pore model, a state trasition table, and a sequence of events, " 33 | "compute the state distribution conditioned on the prefix event sequence"; 34 | CmdLine cmd_parser(description); 35 | MultiArg< string > log_level("d", "log-level", "Log level.", false, "string", cmd_parser); 36 | ValueArg< string > pm_file_name("p", "pore-model", "Scaled pore model file name.", true, "", "file", cmd_parser); 37 | ValueArg< string > st_file_name("s", "state-transitions", "State transitions file name.", true, "", "file", cmd_parser); 38 | ValueArg< string > ev_file_name("e", "events", "Events file name.", true, "", "file", cmd_parser); 39 | ValueArg< string > output_file_name("o", "output", "Output file name.", false, "", "file", cmd_parser); 40 | SwitchArg custom_fwbw("", "custom-fwbw", "Use custom fwbw.", cmd_parser); 41 | } // namespace opts 42 | 43 | void real_main() 44 | { 45 | Pore_Model_Type pm; 46 | //Pore_Model_Parameters<> params; 47 | State_Transitions_Type st; 48 | Event_Sequence_Type ev; 49 | zstr::ifstream(opts::pm_file_name) >> pm; 50 | zstr::ifstream(opts::st_file_name) >> st; 51 | { 52 | zstr::ifstream ifs(opts::ev_file_name); 53 | Event_Type e; 54 | while (ifs >> e) 55 | { 56 | ev.push_back(e); 57 | } 58 | } 59 | 60 | Forward_Backward_Type fwbw; 61 | Forward_Backward_Custom_Type fwbw_custom; 62 | if (not opts::custom_fwbw) 63 | { 64 | fwbw.fill(pm, st, ev); 65 | } 66 | else 
67 | { 68 | fwbw_custom.fill(pm, st, ev); 69 | } 70 | 71 | // print all kmers with posterior >= .1 for the middle event 72 | multiset< pair< FLOAT_TYPE, unsigned > > s; 73 | for (unsigned j = 0; j < pm.n_states; ++j) 74 | { 75 | FLOAT_TYPE v = exp(not opts::custom_fwbw 76 | ? fwbw.log_posterior(ev.size() / 2, j) 77 | : fwbw_custom.log_posterior(ev.size() / 2, j)); 78 | if (v >= .1) 79 | { 80 | s.insert(make_pair(v, j)); 81 | } 82 | } 83 | while (not s.empty()) 84 | { 85 | auto it = prev(s.end()); 86 | cout << Forward_Backward_Type::Kmer_Type::to_string(it->second) << '\t' << it->first << endl; 87 | s.erase(it); 88 | } 89 | 90 | if (not opts::output_file_name.get().empty()) 91 | { 92 | strict_fstream::ofstream(opts::output_file_name) << fwbw; 93 | } 94 | } 95 | 96 | int main(int argc, char * argv[]) 97 | { 98 | opts::cmd_parser.parse(argc, argv); 99 | logger::Logger::set_levels_from_options(opts::log_level); 100 | real_main(); 101 | } 102 | -------------------------------------------------------------------------------- /src/nanocall/run-viterbi.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Pore_Model.hpp" 6 | #include "State_Transitions.hpp" 7 | #include "Event.hpp" 8 | #include "Viterbi.hpp" 9 | #include "logger.hpp" 10 | #include "zstr.hpp" 11 | 12 | using namespace std; 13 | 14 | #ifndef FLOAT_TYPE 15 | #define FLOAT_TYPE float 16 | #endif 17 | #ifndef KMER_SIZE 18 | #define KMER_SIZE 6 19 | #endif 20 | typedef State_Transitions< FLOAT_TYPE, KMER_SIZE > State_Transitions_Type; 21 | typedef Pore_Model< FLOAT_TYPE, KMER_SIZE > Pore_Model_Type; 22 | typedef Event< FLOAT_TYPE, KMER_SIZE > Event_Type; 23 | typedef Event_Sequence< FLOAT_TYPE, KMER_SIZE > Event_Sequence_Type; 24 | typedef Viterbi< FLOAT_TYPE, KMER_SIZE > Viterbi_Type; 25 | 26 | namespace opts 27 | { 28 | using namespace TCLAP; 29 | string description = 30 | "Run Viterbi on given input"; 31 | CmdLine 
cmd_parser(description); 32 | MultiArg< string > log_level("d", "log-level", "Log level.", false, "string", cmd_parser); 33 | ValueArg< string > pm_file_name("p", "pore-model", "Scaled pore model file name.", true, "", "file", cmd_parser); 34 | ValueArg< string > st_file_name("s", "state-transitions", "State transitions file name.", true, "", "file", cmd_parser); 35 | ValueArg< string > ev_file_name("e", "events", "Events file name.", true, "", "file", cmd_parser); 36 | } // namespace opts 37 | 38 | void real_main() 39 | { 40 | Pore_Model_Type pm; 41 | State_Transitions_Type st; 42 | Event_Sequence_Type ev; 43 | zstr::ifstream(opts::pm_file_name) >> pm; 44 | zstr::ifstream(opts::st_file_name) >> st; 45 | { 46 | zstr::ifstream ifs(opts::ev_file_name); 47 | Event_Type e; 48 | while (ifs >> e) 49 | { 50 | ev.push_back(e); 51 | } 52 | } 53 | 54 | Viterbi_Type vit; 55 | vit.fill(pm, st, ev); 56 | cout << ev.get_base_seq() << std::endl; 57 | } 58 | 59 | int main(int argc, char * argv[]) 60 | { 61 | opts::cmd_parser.parse(argc, argv); 62 | logger::Logger::set_levels_from_options(opts::log_level); 63 | real_main(); 64 | } 65 | -------------------------------------------------------------------------------- /src/version/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | message(STATUS "Processing: ${CMAKE_CURRENT_SOURCE_DIR}") 2 | 3 | add_library(version STATIC version.cpp) 4 | add_dependencies(version package_version) 5 | -------------------------------------------------------------------------------- /src/version/version.cpp: -------------------------------------------------------------------------------- 1 | #include "version.hpp" 2 | #include "package_version.h" 3 | 4 | char const * const package_version = PACKAGE_VERSION; 5 | -------------------------------------------------------------------------------- /src/version/version.hpp: -------------------------------------------------------------------------------- 1 | 
#ifndef __VERSION_HPP 2 | #define __VERSION_HPP 3 | 4 | extern char const * const package_version; 5 | 6 | #endif 7 | --------------------------------------------------------------------------------