├── COPYRIGHT ├── LICENSE.md ├── README.md ├── RUBRIC ├── RK_RUutils_lite4.py ├── __main__.py ├── nanonet │ ├── LICENSE.md │ ├── __init__.py │ ├── caller_2d │ │ ├── align_kmers.py │ │ ├── caller_2d.py │ │ ├── common │ │ │ ├── bp_tools.h │ │ │ ├── data_view.h │ │ │ ├── stub_py.cpp │ │ │ ├── test │ │ │ │ └── test_stub.py │ │ │ ├── utils.h │ │ │ └── view_numpy_arrays.h │ │ ├── pair_align │ │ │ ├── mm_align.cpp │ │ │ ├── mm_align.h │ │ │ ├── nw_align.cpp │ │ │ ├── nw_align.h │ │ │ ├── pair_align.h │ │ │ ├── pair_align_py.cpp │ │ │ └── pair_align_py.h │ │ ├── viterbi_2d │ │ │ ├── viterbi_2d.cpp │ │ │ ├── viterbi_2d.h │ │ │ ├── viterbi_2d_py.cpp │ │ │ └── viterbi_2d_py.h │ │ └── viterbi_2d_ocl │ │ │ ├── proxyCL.cpp │ │ │ ├── proxyCL.h │ │ │ ├── viterbi_2d.cl │ │ │ ├── viterbi_2d_ocl.cpp │ │ │ ├── viterbi_2d_ocl.h │ │ │ ├── viterbi_2d_ocl_py.cpp │ │ │ └── viterbi_2d_ocl_py.h │ ├── cmdargs.py │ ├── currennt_to_pickle.py │ ├── data │ │ ├── default_complement.npy │ │ ├── default_model.tmpl │ │ ├── default_template.npy │ │ ├── r9.4_template.npy │ │ ├── r9_complement.npy │ │ ├── r9_template.npy │ │ └── rtc_mismatch_scores.txt │ ├── decoding.cpp │ ├── decoding.py │ ├── eventdetection │ │ ├── filters.c │ │ ├── filters.h │ │ └── filters.py │ ├── fast5.py │ ├── features.py │ ├── include │ │ ├── CL │ │ │ └── cl.hpp │ │ ├── extras │ │ │ ├── CL │ │ │ │ └── cl.hpp │ │ │ └── stdint.h │ │ └── module.h │ ├── jobqueue.py │ ├── latency_test │ │ ├── latency_test.py │ │ └── run_gnuplot_on_csv_files.py │ ├── nanonetcall.py │ ├── nanonetcall_2d.py │ ├── nanonettrain.py │ ├── nn.py │ ├── resolve.py │ ├── sample_data │ │ ├── 904896_ch170_read104_strand.fast5 │ │ ├── 904896_ch170_read105_strand.fast5 │ │ ├── 904896_ch170_read108_strand.fast5 │ │ ├── 904896_ch170_read111_strand.fast5 │ │ ├── 904896_ch170_read114_strand.fast5 │ │ └── sample_out.fa │ ├── segment.py │ ├── test │ │ └── test_nn.py │ ├── util.py │ └── watcher.py └── setup_nanonet.py └── setup.py /COPYRIGHT: 
-------------------------------------------------------------------------------- 1 | Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights in this software. 2 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | BSD-3-Clause 3 | 4 | Copyright 2018 National Technology & Engineering Solutions of Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights in this software. 5 | 6 | 7 | 8 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 17 | 18 | The RUBRIC application may only be used with devices developed by Oxford Nanopore Technologies Ltd. or Metrichor Ltd. and may require access to real time data generated from such devices that may only be available via the Software [(link)](https://github.com/ragak/RUBRIC) and may be subject to your agreement to additional terms and conditions. The RUBRIC application was not developed by Oxford Nanopore Technologies Ltd. or Metrichor Ltd. The RUBRIC application is licensed solely for noncommercial research use only. Such license does not include a right to redistribute or create derivative works (except by persons with an active Developer License Agreement [(found here)](https://community.nanoporetech.com/info_sheets/developer-channel/v/dpi_s1005_v1_revh_06apr2016/how-to-join-the-developer) with Oxford Nanopore. The RUBRIC application is provided “AS IS” and “Where Available.” Developer, Oxford Nanopore Technologies Ltd. and Metrichor Ltd. disclaim all warranties expressed or implied. Oxford reserves all rights in its Intellectual Property Rights not expressly granted herein and no implied licenses may be created by acts or omissions. 
TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT (INCLUDING IN NEGLIGENCE) OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE RUBRIC APPLICATION OR THE USE OR OTHER DEALINGS IN THE RUBRIC APPLICATION. This notice shall be included in all copies or substantial portions of the RUBRIC application. 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | *************************************************************************** 2 | (c) 2018: National Technology & Engineering Solutions of Sandia, LLC (NTESS) 3 | *************************************************************************** 4 | 5 | RUBRIC 6 | ====== 7 | Read Until with Basecall and Reference-Informed Criteria 8 | 9 | These scripts allow for real-time filtering of nanopore sequencing reads based upon analysis of the incoming basepairs as in [DOI](https://www.biorxiv.org/content/early/2018/11/02/460014). RUBRIC was conceived and tested using Nanonet for basecalling, LAST for read-alignment, and a Windows 10 PC operating in safe mode (with networking). 10 | 11 | NOTE: 12 | --- 13 | A version of nanonet has been included in this distribution which is no longer offered or supported by Oxford Nanopore Technologies (ONT). It has been included to maintain the functionality of this package. 14 | 15 | Also note that it is advisable to use this package first on a used flow cell to gauge baseline functionality. 16 | 17 | Installation 18 | ------------ 19 | **Requirements** 20 | 21 | As the Nanonet basecaller is no longer supported or offered by Oxford Nanopore Technologies (ONT), a version modified to support RUBRIC has been included in this repository. 22 | RUBRIC also relies upon the [LAST](http://last.cbrc.jp/) aligner. 
Therefore LAST must also be installed and added to PATH. Nanonet will need to be compiled using the [Visual C++ Compiler for Python 2.7](https://www.microsoft.com/en-us/download/details.aspx?id=44266). RUBRIC relies upon the Read-Until API, which can be obtained directly from ONT. Most of the results obtained in [DOI](https://www.biorxiv.org/content/early/2018/11/02/460014) were obtained using the RU API that was released alongside MinKNOW version 1.6.11. *RUBRIC has not been tested on newer versions of the RU API, but may work with some small adjustments.* 23 | 24 | **Install** 25 | 26 | Once LAST and the C++ compiler have been installed the RUBRIC scripts can be installed. It is highly recommended to install the scripts in a virtual environment such as conda: 27 | ``` 28 | conda create -n RUBRIC_env python=2.7 29 | 30 | activate RUBRIC_env 31 | 32 | cd \path\to\cloned\repository 33 | 34 | python setup.py install 35 | ``` 36 | 37 | This setup file first installs the RUBRIC components, and then calls the 'setup_nanonet.py' file (taken and renamed from the original Nanonet repository). 38 | 39 | RUBRIC relies on an older version of the read_until API, which is included in this repository and is used via a relative import during runtime. It is recommended that users with the newer read_until API first uninstall the new version before installing RUBRIC. 
40 | 41 | 42 | Quick Start 43 | ----------- 44 | Once installed, the rubric commandline help can be called via RUBRIC can be called simply with: 45 | 46 | ``` 47 | python RUBRIC -h 48 | ``` 49 | 50 | Which should then show: 51 | ``` 52 | usage: RUBRIC [-h] -r REFERENCE_DATABASE -ho HOST [-a ALIGN] 53 | [-as ALIGNER_SETTINGS] [-t TIME] [-q QUEUE_SIZE] [-s] 54 | [-l LOWER_THRESHOLD] [-u UPPER_THRESHOLD] [-i IGNORE_EVENTS] 55 | [-c CONSIDER_EVENTS] 56 | 57 | Read Until with Basecall and Reference-Informed Criteria (RUBRIC) 58 | 59 | optional arguments: 60 | -h, --help show this help message and exit 61 | -r REFERENCE_DATABASE, --reference_database REFERENCE_DATABASE 62 | path to database if LAST or fasta file if graphmap 63 | -ho HOST, --host HOST 64 | The host address for the laptop running the MinION 65 | -a ALIGN, --aligner ALIGN 66 | Type of aligner - either "graphmap" or "last" (default 67 | last) 68 | -as ALIGNER_SETTINGS, --aligner_settings ALIGNER_SETTINGS 69 | A string containing the settings to pass to the 70 | aligner (default: '-fTAB -C2 -q 1 -r 1 -a 1 -b 1 -e 71 | 30' 72 | -t TIME, --time TIME This is an error catch for when we cannot keep up with 73 | the rate of sequencing on the device. It takes a 74 | finite amount of time to process through the all the 75 | channels from the sequencer. If we cannot process 76 | through the array quickly enough then we will 'fall 77 | behind' and lose the ability to filter sequences. 78 | Rather than do that we set a threshold after which we 79 | allow the sequencing to complete naturally. 80 | -q QUEUE_SIZE, --queue QUEUE_SIZE 81 | The length of the queue for storing reads until 82 | compute resources are available. 
(default 16) 83 | -s, --skip_even If set, only apply filtering to even pores 84 | -l LOWER_THRESHOLD, --lower_threshold LOWER_THRESHOLD 85 | The lower standard deviation threshold to filter reads 86 | before basecalling (default 5) 87 | -u UPPER_THRESHOLD, --upper_threshold UPPER_THRESHOLD 88 | The upper standard deviation threshold to filter reads 89 | before basecalling (default 14) 90 | -i IGNORE_EVENTS, --ignore_events IGNORE_EVENTS 91 | The number of events to ignore at the beginning of the 92 | read (default 100) 93 | -c CONSIDER_EVENTS, --consider_events CONSIDER_EVENTS 94 | The number of events to after the ignored events to be 95 | used for RUBRIC consideration (default 300) 96 | ``` 97 | 98 | **Required Arguments** 99 | 100 | Only the path to the reference database and the host address are required arguments. All other arguments default to empirically-determined optimal conditions observed in [DOI]. After ensuring that the event sampler has started with MinKNOW, one can simply use: 101 | 102 | ``` 103 | python RUBRIC --reference_database \path\to\LAST\database --host ws://localhost:9200/ 104 | 105 | ``` 106 | Note the above command assumes the event sampler is running locally on port 9200. It is highly desirable to have one computer running MinKNOW (and the event sampler) and one computer that connects remotely and runs RUBRIC. 
107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /RUBRIC/RK_RUutils_lite4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import csv 3 | import os 4 | import sys 5 | 6 | from Bio import SeqIO 7 | 8 | 9 | def file_len(file_name): 10 | """ 11 | calculates length of file 12 | """ 13 | with open(file_name) as f: 14 | # print 'file is' + str(fname) 15 | for i, l in enumerate(f): 16 | pass 17 | return i + 1 18 | 19 | 20 | def align_reads(aligner, db, in_seq, in_seq_id='noIDgiven', offline=False, number_of_bases=0, verbose=False): 21 | """ 22 | align the reads using graphmap or last 23 | if working offline, remember to add 'offline=True' otherwise the script will error out 24 | also if using last and there is already a database, toggle isdb to true 25 | """ 26 | current_path = os.getcwd() 27 | if offline: 28 | # print 'offline' 29 | if number_of_bases == 0: 30 | sys.exit('you are in offline mode but have not specified number of bases - please fix') 31 | else: 32 | counter = 0 33 | for record in SeqIO.parse(in_seq, "fasta"): 34 | counter += 1 35 | # print 'count is ' + str(counter) ##THIS IS FOR DIAGNOSTICS 36 | new_seq = record.seq 37 | new_seq2 = str(new_seq[0:int(number_of_bases) + 1]) 38 | if aligner == 'graphmap': 39 | graph_map(new_seq2, record.id, current_path, db, verbose) 40 | elif aligner == 'last': 41 | # print 'yes database' 42 | result = last(new_seq2, record.id, current_path, db, verbose) 43 | return result 44 | else: 45 | sys.exit('invalid aligner specified in test mode - use either graphmap or last') 46 | else: 47 | # print 'realtime' 48 | if in_seq_id == 'noIDgiven': 49 | sys.exit('no read ID given - if not running in offline mode, please pass read ID as third arguemnt') 50 | else: 51 | if aligner == 'graphmap': 52 | graph_map(in_seq, in_seq_id, current_path, db, verbose=False) 53 | elif aligner == 'last': 54 | result = last(in_seq, 
in_seq_id, current_path, db, verbose=False) 55 | # print 'made it here ' 56 | # print result 57 | return result 58 | # else: 59 | # if dbName=='noNamegiven': 60 | # #print 'no database name given' 61 | # sys.exit('please specify database name for last') 62 | # else: 63 | # #print 'databse name given' 64 | # Lastdb(db,dbName) 65 | # #print 'databse is ' + str(dbName) 66 | # result=Last(in_seq,in_seq_id,current_path ,dbName,verbose=False) 67 | # #print 'made it here ' 68 | # #print result 69 | # return result 70 | else: 71 | sys.exit('invalid aligner specified in realtime mode - use either graphmap or last') 72 | 73 | 74 | def graph_map(sequence, id, current_path, db, verbose): 75 | """ 76 | given a sequence and ID, map using graphmap 77 | """ 78 | new_fasta = ">" + str(id) + "\n" + str(sequence) 79 | tempfile = open(str(id) + ".fa", "w") 80 | tempfile.write(new_fasta) 81 | tempfile.close() 82 | in_fasta = str(current_path) + "/" + str(id) + ".fa" 83 | name = 'tmpOutGrM' + str(id) 84 | cmdstring = "graphmap align -r %s -d %s -o %s -v 0 -a anchor -z 0.5" % (db, in_fasta, name) 85 | os.system(cmdstring) 86 | graph_out = str(current_path) + "/" + str(name) 87 | f = open(graph_out, 'r') 88 | sam_file = f.read() 89 | samlist = sam_file.split('\n') 90 | for a in samlist: 91 | if a[0] != "@": 92 | flagline = a 93 | break 94 | flag = flagline.split('\t')[1] 95 | # print 'flag is '+str(flag) ##THIS IS FOR DIAGNOSTICS 96 | sam_check(name, flag, verbose) 97 | os.remove(name) 98 | os.remove(in_fasta) 99 | 100 | 101 | def last_db(input_fasta, db_name): 102 | """ 103 | input fasta, output database for last alignment 104 | in db_name, include path otherwise will be put into current folder. 
105 | for inFasta include path if not in current folder 106 | """ 107 | cmdstring = "lastdb -cR01 %s %s" % (db_name, input_fasta) 108 | # print cmdstring 109 | os.system(cmdstring) 110 | 111 | 112 | def last(sequence, id, current_path, db, verbose): 113 | """ 114 | given a fasta and database, output an alignment file 115 | """ 116 | newFasta = ">" + str(id) + "\n" + str(sequence) 117 | tempfile = open(str(id) + ".fa", "w") 118 | tempfile.write(newFasta) 119 | tempfile.close() 120 | inFasta = str(current_path) + "\\" + str(id) + ".fa" 121 | # dbName=str(currpath)+"/"+str(db) 122 | name = 'tmpOutGrM' + str(id) 123 | cmdstring = "lastal -fTAB -C2 %s %s > %s" % (db, inFasta, name) 124 | os.system(cmdstring) 125 | last_out = str(current_path) + "\\" + str(name) 126 | # print 'last_out is:' 127 | 128 | if file_len(last_out) == 20: 129 | os.remove(last_out) 130 | os.remove(inFasta) 131 | return 'Skip' 132 | else: 133 | os.remove(last_out) 134 | os.remove(inFasta) 135 | return 'Sequence' 136 | 137 | 138 | def last_batch(file, currpath, db, cmdstring): 139 | """ 140 | takes a batch input in fasta format and outputs a dictionary of calls. 
also provide current path and databse 141 | """ 142 | inFasta = str(currpath) + "\\" + str(file) 143 | updatedDict = {} 144 | with open(inFasta) as a: 145 | reader = csv.reader(a, delimiter="\n") 146 | c = list(reader) 147 | # print 'we are in lastbatch, len of list is ',len(c) 148 | for a in range(0, len(c) - 1): 149 | if a % 2 == 0: 150 | # print 'here is a ',c[a] 151 | # default all channels to skip 152 | updatedDict[c[a][0].split("_")[0].replace('>', '')] = [c[a][0].split("_")[1], "Skip"] 153 | # name=c[a][0] 154 | # channel=name.split("_")[0] 155 | # read=name.split("_")[1] 156 | name = 'tmpOutGrM' + str(file) 157 | # cmdstring="lastal -fTAB -C2 -q 1 -r 1 -a 1 -b 1 -D 100 -e 15 %s %s > %s" % (db,inFasta,name) 158 | cmdstring = 'lastal' + cmdstring + '%s %s > %s' % (db, inFasta, name) 159 | os.system(cmdstring) 160 | last_out = str(currpath) + "\\" + str(name) 161 | with open(last_out) as f: 162 | reader = csv.reader(f, delimiter="\t") 163 | d = list(reader) 164 | for i in range(19, len(d) - 1): 165 | name = d[i][6] 166 | read = name.split("_")[1] 167 | channel = name.split("_")[0].replace('>', '') 168 | # this if statement is a sanity check - can be removed to save time 169 | if channel in updatedDict.keys() and updatedDict[channel][0] == read: 170 | updatedDict[channel][1] = "Sequence" 171 | else: 172 | sys.exit('something is very wrong') 173 | return updatedDict 174 | 175 | 176 | def sam_check(name, flag, verbose): 177 | if flag == '4': 178 | print 'Skip' 179 | if verbose: 180 | print name 181 | elif flag == '0' or flag == '16': 182 | print 'Sequence' 183 | if verbose: 184 | print name 185 | else: 186 | if verbose: 187 | print name 188 | print "What is this??? don't be lazy Raga go check it NOW!!!" 
189 | 190 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import re 4 | import subprocess 5 | 6 | __version__ = '2.0.0' 7 | __version_info__ = tuple([int(num) for num in __version__.split('.')]) 8 | 9 | try: 10 | import pyopencl as cl 11 | except ImportError: 12 | cl = None 13 | 14 | try: 15 | __currennt_exe__ = os.path.abspath(os.environ['CURRENNT']) 16 | except KeyError: 17 | __currennt_exe__ = 'currennt' 18 | 19 | def check_currennt(): 20 | # Check we can run currennt 21 | try: 22 | with open(os.devnull, 'w') as devnull: 23 | subprocess.call([__currennt_exe__, '-h'], stdout=devnull, stderr=devnull) 24 | except OSError: 25 | raise OSError("Cannot execute currennt, it must be in your path as 'currennt' or set via the environment variable 'CURRENNT'.") 26 | 27 | 28 | def run_currennt(currennt_cfg, device=0): 29 | sys.stdout.flush() 30 | os.environ["CURRENNT_CUDA_DEVICE"]="{}".format(device) 31 | cmd = [__currennt_exe__, currennt_cfg] 32 | with open(os.devnull, 'wb') as devnull: 33 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=devnull) 34 | stdout, _ = p.communicate() 35 | p.wait() 36 | if p.returncode != 0: 37 | # On windows currennt fails to remove the cache file. Check for 38 | # this and move on, else raise an error. 
39 | e = subprocess.CalledProcessError(2, ' '.join(cmd)) 40 | if os.name != 'nt': 41 | sys.stderr.write(stdout) 42 | raise e 43 | else: 44 | cache_file = re.match( 45 | '(FAILED: boost::filesystem::remove.*: )"(.*)"', 46 | stdout.splitlines()[-1]) 47 | if cache_file is not None: 48 | cache_file = cache_file.group(2) 49 | sys.stderr.write('currennt failed to clear its cache, cleaning up {}\n'.format(cache_file)) 50 | os.unlink(cache_file) 51 | else: 52 | sys.stderr.write(stdout) 53 | raise e 54 | 55 | def run_currennt_noisy(currennt_cfg, device=0): 56 | sys.stdout.flush() 57 | os.environ["CURRENNT_CUDA_DEVICE"]="{}".format(device) 58 | cmd = [__currennt_exe__, currennt_cfg] 59 | subprocess.check_call(cmd) 60 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/align_kmers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from math import log 3 | import pkg_resources 4 | 5 | from RUBRIC.nanonet import all_kmers, kmers_to_annotated_sequence 6 | from RUBRIC.nanonet import Aligner 7 | 8 | 9 | def _load_substitution_matrix(fname): 10 | """ Loads an unwrapped representation of a substitution matrix. 11 | 12 | :param fname: Filename of substitution matrix file. 13 | 14 | :returns: Representation of matrix in log-space times 100. Values are 15 | 32 bit integers, which should all be negative. State ordering is as 16 | given by the compute_kmer_mapping() function. Probabilities smaller 17 | than 6e-6 are given the value of -1200. 
18 | """ 19 | subdata = np.genfromtxt(fname, names=True, dtype=None) 20 | pos_to_kmer, kmer_to_pos = all_kmers(length=3, rev_map=True) 21 | matrix = np.empty((64, 64), dtype=np.int32) 22 | for row in subdata: 23 | i = kmer_to_pos[row['kmer1']] 24 | j = kmer_to_pos[row['kmer2']] 25 | if row['prob'] > 6e-6: 26 | val = int(log(row['prob']) * 100) 27 | else: 28 | val = -1200 29 | matrix[i, j] = val 30 | return matrix 31 | 32 | matrix_file = pkg_resources.resource_filename('nanonet', 'data/rtc_mismatch_scores.txt') 33 | sub_matrix = _load_substitution_matrix(matrix_file) 34 | 35 | open_gap = 500 36 | extend_gap = 500 37 | outside_gap = min(open_gap - 200, extend_gap) 38 | gap_pens = { 39 | 'open0': open_gap, 40 | 'open1': open_gap, 41 | 'start0': outside_gap, 42 | 'start1': outside_gap, 43 | 'end0': outside_gap, 44 | 'end1': outside_gap, 45 | 'extend0': extend_gap, 46 | 'extend1': extend_gap 47 | } 48 | 49 | 50 | def align_3mer_sequences(sequence0, sequence1, substitution_matrix=sub_matrix, gap_penalties=gap_pens, reverse=True, lowmem=True): 51 | """Align two sequences in base-space using 3mers. 52 | 53 | :param sequence0: String representing a sequence of bases. 54 | :param sequence1: String representing a sequence of bases. 55 | :param substitution_matrix: 64x64 matrix of substitution scores to use for alignment. Should be 56 | a 2D numpy array of type int32. 57 | :param gap_penalties: Dictionary of gap penalties. See below. 58 | :param reverse: Bool indicating whether the second sequence should be reversed. 59 | :param lowmem: Bool indicating whether to use the (slower) low memory implementation. 60 | 61 | :returns: A tuple of: 62 | 63 | * Numpy record array with fields 'pos0' and 'pos1', representing the alignment. 64 | * Tuple of a scalar value indicating the alignment score and the average 65 | continuous alignment length. 
66 | :rtype: tuple 67 | 68 | The gap penalty dictionary should be laid out as follows: 69 | {start0: penalty for aligning sequence 0 before the start of sequence 1, 70 | end0: penatly for aligning sequence 0 after the end of sequence 1, 71 | open0: penalty for aligning sequence 0 to a new gap in sequence 1, 72 | extend0: penalty for extending a gap within sequence 1, 73 | start1: penalty for aligning sequence 1 before the start of sequence 2, 74 | end1: penatly for aligning sequence 1 after the end of sequence 2, 75 | open1: penalty for aligning sequence 1 to a new gap in sequence 2, 76 | extend1: penalty for extending a gap within sequence 2, 77 | } 78 | The only required field is open0. Gap extension values will default to being the same 79 | as opening a gap. Start and end gap penalties default to being the same as the extension 80 | penalty. And the second set of penalties will default to the values for the first. 81 | .. note:: 82 | Resulting alignment is in terms of 3mers. So '0' represents bases 0-2, and the 83 | largest value in the alignment will be len(sequence) - 3. Since the alignment is 84 | done in terms of 3mers, if the sequence was generated from 5mers then the first and 85 | last base should be discarded before calling this function. 
86 | 87 | """ 88 | submat = [[int(val) for val in line] for line in substitution_matrix] 89 | pos_to_kmer, kmer_to_pos = all_kmers(length=3, rev_map=True) 90 | seq0 = [kmer_to_pos[sequence0[i:i+3]] for i in xrange(len(sequence0) - 2)] 91 | seq1 = [kmer_to_pos[sequence1[i:i+3]] for i in xrange(len(sequence1) - 2)] 92 | if reverse: 93 | seq1[:] = seq1[::-1] 94 | gaps = _gap_penalties_dict_to_list(gap_penalties) 95 | aligner = Aligner(submat, gaps, lowmem) 96 | alignment, score = aligner.align(seq0, seq1) 97 | if reverse: 98 | for pos in xrange(len(alignment)): 99 | if alignment[pos][1] != -1: 100 | alignment[pos] = (alignment[pos][0], len(seq1) - alignment[pos][1] - 1) 101 | 102 | # We'll return the average continuously-aligned length as well. 103 | alignment_lengths = [] 104 | current_alignment_length = 0 105 | for pos in alignment: 106 | if pos[0] == -1 or pos[1] == -1: # I.e. a stay or skip 107 | if current_alignment_length > 0: 108 | alignment_lengths.append(current_alignment_length) 109 | current_alignment_length = 0 110 | else: 111 | current_alignment_length += 1 112 | if len(alignment_lengths) > 0: 113 | average_continuous_length = np.average(alignment_lengths) 114 | else: 115 | average_continuous_length = current_alignment_length 116 | npalignment = np.empty(len(alignment), dtype=[('pos0', int), ('pos1', int)]) 117 | npalignment[:] = alignment 118 | return npalignment, (score, average_continuous_length) 119 | 120 | 121 | def _gap_penalties_dict_to_list(gap_penalties): 122 | """ Convert dictionary of gap penalties into an array 123 | 124 | :param gap_penalties: Dictionary of gap penalties 125 | 126 | :returns: List of gap penalties in order which align_1mer_sequences and align_3mer_sequences can use 127 | """ 128 | gaps = [0] * 8 129 | gaps[2] = gap_penalties['open0'] 130 | gaps[3] = gap_penalties.get('extend0', gaps[2]) 131 | gaps[0] = gap_penalties.get('start0', gaps[3]) 132 | gaps[1] = gap_penalties.get('end0', gaps[3]) 133 | gaps[6] = 
gap_penalties.get('open1', gaps[2]) 134 | gaps[7] = gap_penalties.get('extend1', gaps[6]) 135 | gaps[4] = gap_penalties.get('start1', gaps[7]) 136 | gaps[5] = gap_penalties.get('end1', gaps[7]) 137 | return gaps 138 | 139 | 140 | def align_basecalls(kmers0, kmers1, substitution_matrix=sub_matrix, gap_penalties=gap_pens, lowmem=True): 141 | """ Align template to complement basecalls, using the align_3mer_sequences function. 142 | 143 | :param kmers0: Template basecalled kmers. 144 | :param kmers1: Complement basecalled kmers. 145 | :param substitution_matrix: 64x64 matrix of substitution scores to use for alignment. Should be 146 | a 2D numpy array of type int32. 147 | :param gap_penalties: Dictionary of gap penalties. See below. 148 | :param lowmem: Bool indicating whether to use the (slower) low memory implementation. 149 | 150 | :returns: A tuple of: 151 | 152 | * Numpy array with fields 'pos0' and 'pos1' 153 | * Scalar value indicating the alignment score and the average continuous 154 | alignment length. 155 | :rtype: tuple 156 | 157 | Returns a "filled-in" alignment. So there will be no -1 values (gaps). Instead, 158 | values can be repeated in either of the sequences. 159 | 160 | The returned alignment is trimmed, meaning that it will start and end with events 161 | that are aligned to each other. Therefore events at the beginning and end of 162 | either sequence may have been removed. 163 | 164 | .. warning: 165 | It is possible for the alignment to fail, if too few points end up directly aligned 166 | to each other. In this case the function will return the tuple (None, None). 
167 | """ 168 | sequence0, index0 = kmers_to_annotated_sequence(kmers0) 169 | sequence1, index1 = kmers_to_annotated_sequence(kmers1) 170 | kmer_len = len(kmers0[0]) 171 | trim = kmer_len - 3 172 | trim_left = int((trim + 1) / 2) 173 | trim_right = int(trim / 2) 174 | sequence0 = sequence0[trim_left:(len(sequence0) - trim_right)] 175 | sequence1 = sequence1[trim_left:(len(sequence1) - trim_right)] 176 | alignment, score = align_3mer_sequences(sequence0, sequence1, substitution_matrix, gap_penalties, reverse=True) 177 | # Find positions in the alignment that don't have gaps. 178 | hits = [] 179 | for i, j in alignment: 180 | if i != -1 and j != -1: 181 | p0 = index0[i] 182 | p1 = index1[j] 183 | if p0 != -1 and p1 != -1: 184 | hits.append((p0, p1)) 185 | if len(hits) < 2: 186 | # Not enough aligned positions to do anything sensible. 187 | return None, None 188 | # Build up a filled-in alignment by interpolating between aligned positions. 189 | new_alignment = [hits[0]] 190 | for i in xrange(1, len(hits)): 191 | delta0 = hits[i][0] - hits[i-1][0] 192 | delta1 = hits[i-1][1] - hits[i][1] 193 | if delta0 > 1 and delta1 > 1: 194 | # Both sequences jump by more than 1 between aligned points. 195 | # One sequence should increment by one each position. The other will vary. 196 | n = max(delta0, delta1) - 1 197 | p0 = hits[i-1][0] 198 | p1 = hits[i-1][1] 199 | step0 = float(delta0 - 1) / float(n) 200 | step1 = float(delta1 - 1) / float(n) 201 | for k in range(n): 202 | p0 += step0 203 | p1 -= step1 204 | new_alignment.append((int(round(p0)), int(round(p1)))) 205 | elif delta0 > 1: 206 | # Need to insert repetions into sequence 2. 207 | for j in range(hits[i-1][0] + 1, hits[i][0]): 208 | new_alignment.append((j, hits[i-1][1])) 209 | elif delta1 > 1: 210 | # Need to insert repetions into sequence 1. 
211 | for j in range(hits[i-1][1] - 1, hits[i][1], -1): 212 | new_alignment.append((hits[i-1][0], j)) 213 | else: 214 | new_alignment.append(hits[i]) 215 | alignment = np.empty(len(new_alignment), dtype=[('pos0', int), ('pos1', int)]) 216 | for n, p in enumerate(new_alignment): 217 | alignment[n] = p 218 | return alignment, score 219 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/common/bp_tools.h: -------------------------------------------------------------------------------- 1 | #ifndef BP_TOOLS_H 2 | #define BP_TOOLS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace ublas = boost::numeric::ublas; 10 | namespace bp = boost::python; 11 | 12 | 13 | /// Construct a std::vector from a python list. Elements must match template type. 14 | template 15 | std::vector list_to_vector(const bp::list& in) { 16 | int count = bp::len(in); 17 | std::vector out(count); 18 | for (int i = 0; i < count; ++i) { 19 | out[i] = bp::extract(in[i]); 20 | } 21 | return out; 22 | } 23 | 24 | 25 | /// Construct a std::vector of std::pair objects from a python list of tuples. 26 | template 27 | std::vector > list_to_pair_vector(const bp::list& in) { 28 | int count = bp::len(in); 29 | std::vector > out(count); 30 | for (int i = 0; i < count; ++i) { 31 | bp::tuple p = bp::extract(in[i]); 32 | T first = bp::extract(p[0]); 33 | T second = bp::extract(p[1]); 34 | out[i] = std::make_pair(first, second); 35 | } 36 | return out; 37 | } 38 | 39 | 40 | /// Construct an ublas::matrix from a python list of lists. Elements must match template type. 
41 | template 42 | ublas::matrix list_to_matrix(const bp::list& in) { 43 | int nrows = bp::len(in); 44 | int ncols = bp::len(bp::extract(in[0])); 45 | ublas::matrix out(nrows, ncols); 46 | for (int i = 0; i < nrows; ++i) { 47 | bp::list row = bp::extract(in[i]); 48 | if (bp::len(row) != ncols) { 49 | throw std::runtime_error("Error: Not all columns are the same length."); 50 | } 51 | for (int j = 0; j < ncols; ++j) { 52 | out(i, j) = bp::extract(row[j]); 53 | } 54 | } 55 | return out; 56 | } 57 | 58 | 59 | /// Construct a std::map from a python dictionary. 60 | template 61 | std::map dict_to_map(const bp::dict& in) { 62 | bp::list items = in.items(); 63 | int count = bp::len(items); 64 | std::map out; 65 | for (int i = 0; i < count; ++i) { 66 | bp::tuple pair = bp::extract(items[i]); 67 | KEY key = bp::extract(pair[0]); 68 | VAL val = bp::extract(pair[1]); 69 | out[key] = val; 70 | } 71 | return out; 72 | } 73 | 74 | 75 | #endif /* BP_TOOLS_H */ 76 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/common/data_view.h: -------------------------------------------------------------------------------- 1 | #ifndef DATA_VIEW_H 2 | #define DATA_VIEW_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace ublas = boost::numeric::ublas; 10 | 11 | 12 | /** Represents a block of data as an STL-style container. 13 | * This object does not own the data it views, and as such 14 | * it can be invalidated if the data it views goes out of 15 | * scope or is deleted. Note that the const-ness of this 16 | * class protects the object itself, but not the data it 17 | * views. 
18 | **/ 19 | template 20 | class VecView { 21 | public: 22 | typedef T value_type; 23 | typedef T* pointer; 24 | typedef const T* const_pointer; 25 | typedef size_t size_type; 26 | typedef ptrdiff_t difference_type; 27 | typedef T& reference; 28 | typedef const T& const_reference; 29 | 30 | protected: 31 | pointer ptr_; 32 | size_type size_; 33 | difference_type stride_; 34 | 35 | public: 36 | class iterator : public std::iterator { 37 | public: 38 | typedef T value_type; 39 | typedef T* pointer; 40 | typedef const T* const_pointer; 41 | typedef ptrdiff_t difference_type; 42 | typedef T& reference; 43 | typedef const T& const_reference; 44 | 45 | protected: 46 | pointer ptr_; 47 | difference_type stride_; 48 | 49 | public: 50 | iterator() : ptr_(0), stride_(1) {} 51 | 52 | iterator(pointer p, difference_type s) : ptr_(p), stride_(s) {} 53 | 54 | operator void *() const { 55 | return ptr_; 56 | } 57 | 58 | reference operator[](int n) const { 59 | return *(ptr_ + n * stride_); 60 | } 61 | 62 | reference operator*() const { 63 | return *ptr_; 64 | } 65 | 66 | pointer operator->() const { 67 | return ptr_; 68 | } 69 | 70 | iterator& operator++() { 71 | ptr_ += stride_; 72 | return *this; 73 | } 74 | 75 | iterator& operator--() { 76 | ptr_ -= stride_; 77 | return *this; 78 | } 79 | 80 | iterator operator++(int) { 81 | iterator temp(*this); 82 | ptr_ += stride_; 83 | return temp; 84 | } 85 | 86 | iterator operator--(int) { 87 | iterator temp(*this); 88 | ptr_ -= stride_; 89 | return temp; 90 | } 91 | 92 | iterator& operator+=(int n) { 93 | ptr_ += n * stride_; 94 | return *this; 95 | } 96 | 97 | iterator& operator-=(int n) { 98 | ptr_ -= n * stride_; 99 | return *this; 100 | } 101 | 102 | iterator operator+(int n) const { 103 | return iterator(ptr_ + n * stride_, stride_); 104 | } 105 | 106 | iterator operator-(int n) const { 107 | return iterator(ptr_ - n * stride_, stride_); 108 | } 109 | 110 | difference_type operator-(const iterator& it) const { 111 | return (ptr_ - 
it.ptr_) / stride_; 112 | } 113 | 114 | bool operator==(const iterator& it) const { 115 | return (ptr_ == it.ptr_); 116 | } 117 | 118 | bool operator<(const iterator& it) const { 119 | return (stride_ > 0) ? (ptr_ < it.ptr_) : (it.ptr_ < ptr_); 120 | } 121 | 122 | bool operator>(const iterator& it) const { 123 | return (stride_ > 0) ? (ptr_ > it.ptr_) : (it.ptr_ > ptr_); 124 | } 125 | 126 | bool operator<=(const iterator& it) const { 127 | return (stride_ > 0) ? (ptr_ <= it.ptr_) : (it.ptr_ <= ptr_); 128 | } 129 | 130 | bool operator>=(const iterator& it) const { 131 | return (stride_ > 0) ? (ptr_ >= it.ptr_) : (it.ptr_ >= ptr_); 132 | } 133 | 134 | bool operator!=(const iterator& it) const { 135 | return (ptr_ != it.ptr_); 136 | } 137 | }; 138 | 139 | typedef iterator const_iterator; 140 | 141 | /// Default constructor. 142 | VecView() : ptr_(0), size_(0), stride_(1) {} 143 | 144 | /** Basic view constructor. 145 | * @param p Pointer to data to be viewed. 146 | * @param len Number of elements to be viewed. 147 | * @param stride Optional stride of elements. 148 | */ 149 | VecView(const_pointer p, int len, int stride = 1) { 150 | view(p, len, stride); 151 | } 152 | 153 | /// View a std::vector. 154 | VecView(const std::vector& vec) { 155 | view(vec); 156 | } 157 | 158 | /// Clears the current view. 159 | void clear() { 160 | ptr_ = 0; 161 | size_ = 0; 162 | stride_ = 1; 163 | } 164 | 165 | /** Basic view constructor. 166 | * @param p Pointer to data to be viewed. 167 | * @param len Number of elements to be viewed. 168 | * @param stride Optional stride of elements. 169 | * 170 | * The current view (if any) is abandoned. 171 | */ 172 | void view(const_pointer p, int len, int stride = 1) { 173 | if (len < 0 || stride == 0) { 174 | throw std::runtime_error("Size must be >= 0, and stride cannot be zero."); 175 | } 176 | ptr_ = const_cast(p); 177 | size_ = size_type(len); 178 | stride_ = difference_type(stride); 179 | } 180 | 181 | /** View a std::vector. 
182 | * The current view (if any) is abandoned. 183 | */ 184 | void view(const std::vector& vec) { 185 | if (vec.empty()) clear(); 186 | else view(&vec[0], int(vec.size()), 1); 187 | } 188 | 189 | /** Return a new object that views a slice of the current one. 190 | * @param start The starting position of the slice in the current view. 191 | * @param len The number of elements to be viewed. 192 | * @param stride Optional stride, which is relative to the current view. 193 | */ 194 | VecView slice(int start, int len, int stride = 1) const { 195 | if (start < 0) throw std::runtime_error("Slice cannot have negative start value."); 196 | if (stride > 0) { 197 | if (start + len * stride > int(size_)) throw std::runtime_error("Slice out of bounds."); 198 | } 199 | else if (stride < 0) { 200 | if (start >= int(size_) || start + (len - 1) * stride <= 0) throw std::runtime_error("Slice out of bounds."); 201 | } 202 | else { 203 | throw std::runtime_error("Slice cannot have zero stride."); 204 | } 205 | return VecView(ptr_ + start * stride_, len, stride_ * stride); 206 | } 207 | 208 | /// The size of the data view. 209 | size_type size() const { 210 | return size_; 211 | } 212 | 213 | /// The stride of the view. 214 | difference_type stride() const { 215 | return stride_; 216 | } 217 | 218 | /// A pointer to the first element of the raw data of the view. 219 | pointer data() const { 220 | return ptr_; 221 | } 222 | 223 | /// Indexing operator. 224 | reference operator[](int n) const { 225 | return *(ptr_ + n * stride_); 226 | } 227 | 228 | /// Reverse the current view. 229 | void reverse() { 230 | if (ptr_ == 0 || size_ == 0) return; 231 | ptr_ += ptrdiff_t(size_ - 1) * stride_; 232 | stride_ = -stride_; 233 | } 234 | 235 | /// Iterator to start of view. 236 | iterator begin() const { 237 | return iterator(ptr_, stride_); 238 | } 239 | 240 | /// Iterator to past-the-end of the view. 
241 | iterator end() const { 242 | return iterator(ptr_ + size_ * stride_, stride_); 243 | } 244 | }; 245 | 246 | 247 | template 248 | inline typename VecView::iterator operator+(int n, const typename VecView::iterator& it) { 249 | return it + n; 250 | } 251 | 252 | 253 | template 254 | class MatView { 255 | public: 256 | typedef T value_type; 257 | typedef T* pointer; 258 | typedef const T* const_pointer; 259 | typedef size_t size_type; 260 | typedef ptrdiff_t difference_type; 261 | typedef T& reference; 262 | typedef const T& const_reference; 263 | 264 | typedef typename VecView::iterator iterator, const_iterator; 265 | 266 | protected: 267 | pointer ptr_; 268 | size_type size1_, size2_; 269 | difference_type stride1_, stride2_; 270 | 271 | public: 272 | 273 | MatView() : ptr_(0), size1_(0), size2_(0), stride1_(1), stride2_(1) {} 274 | 275 | MatView(const_pointer p, int len1, int len2, int stride1, int stride2) { 276 | view(p, len1, len2, stride1, stride2); 277 | } 278 | 279 | MatView(const_pointer p, int len1, int len2) { 280 | view(p, len1, len2); 281 | } 282 | 283 | MatView(const ublas::matrix& mat) { 284 | view(mat); 285 | } 286 | 287 | void view(const_pointer p, int len1, int len2) { 288 | view(p, len1, len2, len2, 1); 289 | } 290 | 291 | void view(const_pointer p, int len1, int len2, int stride1, int stride2) { 292 | if (len1 < 0 || len2 < 0 || stride1 == 0 || stride2 == 0) { 293 | throw std::runtime_error("Lengths must be >= 0, and strides cannot be zero."); 294 | } 295 | ptr_ = const_cast(p); 296 | size1_ = size_type(len1); 297 | size2_ = size_type(len2); 298 | stride1_ = difference_type(stride1); 299 | stride2_ = difference_type(stride2); 300 | } 301 | 302 | void view(const ublas::matrix& mat) { 303 | int s1 = &mat(1, 0) - &mat(0, 0); 304 | int s2 = &mat(0, 1) - &mat(0, 0); 305 | view(&mat(0, 0), int(mat.size1()), int(mat.size2()), s1, s2); 306 | } 307 | 308 | reference operator()(int n, int m) const { 309 | return *(ptr_ + n * stride1_ + m * stride2_); 
310 | } 311 | 312 | pointer data() const { 313 | return ptr_; 314 | } 315 | 316 | size_type size1() const { 317 | return size1_; 318 | } 319 | 320 | size_type size2() const { 321 | return size2_; 322 | } 323 | 324 | difference_type stride1() const { 325 | return stride1_; 326 | } 327 | 328 | difference_type stride2() const { 329 | return stride2_; 330 | } 331 | 332 | MatView submatrix(int start1, int start2, int len1, int len2, int stride1 = 1, int stride2 = 1) const { 333 | return MatView(ptr_ + start1 * stride1_ + start2 * stride2_, len1, len2, stride1 * stride1_, stride2 * stride2_); 334 | } 335 | 336 | VecView row(int n) const { 337 | return VecView(ptr_ + n * stride1_, size2_, stride2_); 338 | } 339 | 340 | VecView column(int m) const { 341 | return VecView(ptr_ + m * stride2_, size1_, stride1_); 342 | } 343 | 344 | void transpose() { 345 | if (ptr_ == 0 || size1_ == 0 || size2_ == 0) return; 346 | std::swap(stride1_, stride2_); 347 | std::swap(size1_, size2_); 348 | } 349 | 350 | iterator row_begin(int n) const { 351 | return iterator(ptr_ + n * stride1_, stride2_); 352 | } 353 | 354 | iterator row_end(int n) const { 355 | return iterator(ptr_ + n * stride1_ + size2_ * stride2_, stride2_); 356 | } 357 | 358 | iterator column_begin(int m) const { 359 | return iterator(ptr_ + m * stride2_, stride1_); 360 | } 361 | 362 | iterator column_end(int m) const { 363 | return iterator(ptr_ + m * stride2_ + size1_ * stride1_, stride1_); 364 | } 365 | }; 366 | 367 | 368 | #endif /* DATA_VIEW_H */ 369 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/common/stub_py.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace bp = boost::python; 9 | 10 | bp::list check_vector(bp::list& v); 11 | bp::list check_matrix(bp::list& m); 12 | bp::dict check_map(bp::dict& d); 13 | bp::numeric::array 
check_np_vector(bp::numeric::array& v, const std::string& t); 14 | bp::numeric::array check_np_matrix(bp::numeric::array& m); 15 | void check_exp(bp::numeric::array& data, bool fast); 16 | 17 | 18 | /// Python class wrapper. 19 | BOOST_PYTHON_MODULE(stub) { 20 | import_array(); 21 | bp::numeric::array::set_module_and_type("numpy", "ndarray"); 22 | def("check_vector", &check_vector); 23 | def("check_matrix", &check_matrix); 24 | def("check_map", &check_map); 25 | def("check_np_vector", &check_np_vector); 26 | def("check_np_matrix", &check_np_matrix); 27 | def("check_exp", &check_exp); 28 | } 29 | 30 | 31 | bp::list check_vector(bp::list& v) { 32 | std::vector vec = list_to_vector(v); 33 | bp::list out; 34 | for (size_t i = 0; i < vec.size(); ++i) { 35 | out.append(vec[i]); 36 | } 37 | return out; 38 | } 39 | 40 | 41 | bp::list check_matrix(bp::list& m) { 42 | ublas::matrix mat = list_to_matrix(m); 43 | bp::list out; 44 | for (size_t i = 0; i < mat.size1(); ++i) { 45 | bp::list row; 46 | for (size_t j = 0; j < mat.size2(); ++j) { 47 | row.append(mat(i, j)); 48 | } 49 | out.append(row); 50 | } 51 | return out; 52 | } 53 | 54 | 55 | bp::dict check_map(bp::dict& d) { 56 | std::map dm = dict_to_map(d); 57 | bp::dict out; 58 | for (std::map::iterator p = dm.begin(); p != dm.end(); ++p) { 59 | out[p->first] = p->second; 60 | } 61 | return out; 62 | } 63 | 64 | 65 | bp::numeric::array check_np_vector(bp::numeric::array& v, const std::string& t) { 66 | if (t == "int32") { 67 | VecView vec = view_1d_array(v); 68 | //std::cerr << "Stride for " << t << " is " << vec.stride() << std::endl; 69 | return vector_to_numpy(vec); 70 | } 71 | else if (t == "int64") { 72 | VecView vec = view_1d_array(v); 73 | //std::cerr << "Stride for " << t << " is " << vec.stride() << std::endl; 74 | return vector_to_numpy(vec); 75 | } 76 | else if (t == "float64") { 77 | VecView vec = view_1d_array(v); 78 | //std::cerr << "Stride for " << t << " is " << vec.stride() << std::endl; 79 | return 
vector_to_numpy(vec); 80 | } 81 | return new_numpy_1d(0); 82 | } 83 | 84 | 85 | bp::numeric::array check_np_matrix(bp::numeric::array& m) { 86 | MatView mat = view_2d_array(m); 87 | return matrix_to_numpy(mat); 88 | } 89 | 90 | 91 | void check_exp(bp::numeric::array& data, bool fast) { 92 | VecView x = view_1d_array(data); 93 | size_t n = x.size(); 94 | if (fast) { 95 | for (size_t i = 0; i < n; ++i) { 96 | x[i] = fastpow2(POW2FACTOR * x[i]); 97 | } 98 | } 99 | else { 100 | for (size_t i = 0; i < n; ++i) { 101 | x[i] = exp(x[i]); 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/common/test/test_stub.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import unittest 3 | import warnings 4 | import cProfile 5 | import pstats 6 | import time 7 | import StringIO 8 | import numpy as np 9 | from dragonet.basecall.common import stub 10 | 11 | 12 | class TestStub(unittest.TestCase): 13 | 14 | def setUp(self): 15 | return 16 | 17 | def tearDown(self): 18 | return 19 | 20 | def test_001_list_to_vector(self): 21 | data = [1.0, 2.0, -3.5, 7.6] 22 | newdata = stub.check_vector(data) 23 | self.assertEqual(data, newdata) 24 | return 25 | 26 | def test_002_list_to_matrix(self): 27 | data = [[1.0, 2.0, -3.5], 28 | [7.6, 1.3, 12.8]] 29 | newdata = stub.check_matrix(data) 30 | self.assertEqual(data, newdata) 31 | return 32 | 33 | def test_003_dict_to_map(self): 34 | data = {'foo': 1, 'bar': 2, 'kill': 3, 'me': 4} 35 | newdata = stub.check_map(data) 36 | self.assertEqual(data, newdata) 37 | return 38 | 39 | def test_004_contiguous(self): 40 | data1d = np.array([1, 5, 7, 3, 8, 4, 10, 4, 12, 2], dtype=np.int32) 41 | data1d2 = np.zeros(10, dtype=np.float64) 42 | data1d2[:] = data1d * 0.5 43 | data2d = np.ndarray((2, 5), buffer=(data1d * 0.5), dtype=np.float64) 44 | new1d = stub.check_np_vector(data1d, 'int32') 45 | 
np.testing.assert_equal(data1d, new1d) 46 | new1d2 = stub.check_np_vector(data1d2, 'float64') 47 | np.testing.assert_equal(data1d2, new1d2) 48 | new2d = stub.check_np_matrix(data2d) 49 | np.testing.assert_equal(data2d, new2d) 50 | return 51 | 52 | def test_005_noncontiguous(self): 53 | data1d = np.array([1, 5, 7, 3, 8, 4, 10, 4, 12, 2], dtype=np.int32) 54 | data2d = np.ndarray((2, 5), buffer=(data1d * 0.5), dtype=np.float64) 55 | data1d = data1d[::2] 56 | data2d = data2d[::-1, ::2] 57 | new1d = stub.check_np_vector(data1d, 'int32') 58 | np.testing.assert_equal(data1d, new1d) 59 | new2d = stub.check_np_matrix(data2d) 60 | np.testing.assert_equal(data2d, new2d) 61 | return 62 | 63 | def test_005_record_array(self): 64 | desc = np.dtype({'names': ['a', 'b', 'c'], 'formats': [np.int32, np.float64, np.int64]}, align=True) 65 | data = np.zeros(10, dtype=desc) 66 | data1 = data['a'] 67 | data2 = data['b'] 68 | data3 = data['c'] 69 | data1[:] = [1, 5, 7, 3, 8, 4, 10, 4, 12, 2] 70 | data2[:] = data1 * 0.5 71 | data3[:] = data1 * 2 72 | new1 = stub.check_np_vector(data1, 'int32') 73 | np.testing.assert_equal(data1, new1) 74 | new2 = stub.check_np_vector(data2, 'float64') 75 | np.testing.assert_equal(data2, new2) 76 | new3 = stub.check_np_vector(data3, 'int64') 77 | np.testing.assert_equal(data3, new3) 78 | return 79 | 80 | @unittest.skip('Not needed') 81 | def test_006_accumulate(self): 82 | data = np.arange(0.0, 10.0, dtype=np.float32) 83 | result = stub.check_accumulator(data) 84 | self.assertEqual(result, 45.0) 85 | result = stub.check_accumulator(data[:-1]) 86 | self.assertEqual(result, 36.0) 87 | data = np.empty(1024, dtype=np.float32) 88 | data[:] = np.random.random(1024) 89 | 90 | #print 'Fast version' 91 | pr = cProfile.Profile() 92 | pr.enable() 93 | result1 = self._loop_checker(data, 1000000, True) 94 | pr.disable() 95 | s = StringIO.StringIO() 96 | ps = pstats.Stats(pr, stream=s).sort_stats('cumulative') 97 | #ps.print_stats() 98 | #print s.getvalue() 99 | 100 | 
#print 'Normal version' 101 | pr = cProfile.Profile() 102 | pr.enable() 103 | result2 = self._loop_checker(data, 1000000, False) 104 | pr.disable() 105 | s = StringIO.StringIO() 106 | ps = pstats.Stats(pr, stream=s).sort_stats('cumulative') 107 | #ps.print_stats() 108 | #print s.getvalue() 109 | 110 | return 111 | 112 | def test_007_check_exp(self): 113 | warnings.filterwarnings("error") 114 | raw = np.arange(-80.0, 80.0, 1e-4, dtype=np.float32) 115 | data1 = np.empty(raw.size, dtype=np.float32) 116 | data2 = np.empty(raw.size, dtype=np.float32) 117 | data1[:] = raw 118 | data2[:] = raw 119 | #print 'Total count is:', raw.size 120 | t1 = time.clock() 121 | stub.check_exp(data1, False) 122 | t2 = time.clock() 123 | stub.check_exp(data2, True) 124 | t3 = time.clock() 125 | #print 'Stats for positive range.' 126 | #print 'Time for normal exponential:', t2 - t1 127 | #print 'Time for fast exponential:', t3 - t2 128 | error = np.abs((data2 - data1) / data1) 129 | max_idx = np.argmax(error) 130 | #print 'Max error:', error[max_idx], 'raw = ', raw[max_idx], 'data1 =', data1[max_idx], 'data2 =', data2[max_idx], 'diff =', data2[max_idx] - data1[max_idx] 131 | data3 = -1.0 * raw 132 | data4 = -1.0 * raw 133 | t1 = time.clock() 134 | stub.check_exp(data3, False) 135 | t2 = time.clock() 136 | stub.check_exp(data4, True) 137 | t3 = time.clock() 138 | #print 'Stats for negative range.' 
139 | #print 'Time for normal exponential:', t2 - t1 140 | #print 'Time for fast exponential:', t3 - t2 141 | error = np.abs((data4 - data3) / data3) 142 | max_idx = np.argmax(error) 143 | #print 'Max error:', error[max_idx], 'raw =', raw[max_idx], 'data1 =', data3[max_idx], 'data2 =', data4[max_idx], 'diff =', data4[max_idx] - data3[max_idx] 144 | return 145 | 146 | 147 | def _loop_checker(self, data, n, fast): 148 | return stub.check_accumulator_loop(data, n, fast) 149 | 150 | 151 | if __name__ == '__main__': 152 | unittest.main() 153 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/common/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | /// 1.0 / ln(2) - Needed as scaling factor for computing exp(x) from 2^x. 13 | static const float POW2FACTOR = 1.442695040f; 14 | 15 | inline float square(float x) {return x * x;} 16 | inline float cube(float x) {return x * x * x;} 17 | 18 | 19 | /// Fast approximation for computing 2^p in single precision. 20 | inline float fastpow2(float p) { 21 | float clipp = (p > -125.0f) ? p : -125.0f; 22 | union {uint32_t i; float f;} v = {uint32_t((1 << 23) * (clipp + 126.94269504f))}; 23 | return v.f; 24 | } 25 | 26 | /// Fast vectorized approximation for computing 2^p in single precision for 4 numbers. 27 | inline __m128 vfasterpow2(const __m128 p) { 28 | const __m128 c_126_94269504 = _mm_set_ps1(126.94269504f); 29 | const __m128 lt125 = _mm_cmplt_ps(p, _mm_set_ps1(-125.0f)); 30 | const __m128 clipp = _mm_or_ps(_mm_andnot_ps(lt125, p), _mm_and_ps(lt125, _mm_set_ps1(-125.0f))); 31 | union { __m128i i; __m128 f; } v = { _mm_cvttps_epi32(_mm_mul_ps(_mm_set_ps1(1 << 23), _mm_add_ps(clipp, c_126_94269504))) }; 32 | return v.f; 33 | } 34 | 35 | /** Generic normalization function. 
36 | * @param NUM_STATES The number of states to normalize over. 37 | * @param data An array of floats. 38 | * @returns The normalization factor used. 39 | */ 40 | template 41 | float normalize(float *data) { 42 | float sum = 0.0f; 43 | for (int state = 0; state < NUM_STATES; ++state) { 44 | sum += data[state]; 45 | } 46 | if (sum < 1e-38f || !std::isfinite(sum)) { 47 | throw std::runtime_error("Normalization error."); 48 | } 49 | float norm = 1.0f / (sum); 50 | for (int state = 0; state < NUM_STATES; ++state) { 51 | data[state] *= norm; 52 | } 53 | return sum; 54 | } 55 | 56 | 57 | #endif /* UTILS_H */ 58 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/common/view_numpy_arrays.h: -------------------------------------------------------------------------------- 1 | #ifndef VIEW_NUMPY_ARRAYS_H 2 | #define VIEW_NUMPY_ARRAYS_H 3 | 4 | #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | namespace bp = boost::python; 14 | 15 | 16 | inline int numpy_type(bool) {return NPY_BOOL;} 17 | inline int numpy_type(int16_t) {return NPY_INT16;} 18 | inline int numpy_type(int32_t) {return NPY_INT32;} 19 | #ifdef _MSC_VER 20 | inline int numpy_type(int) {return NPY_INT32;} 21 | #endif 22 | inline int numpy_type(int64_t) {return NPY_INT64;} 23 | inline int numpy_type(uint16_t) {return NPY_UINT16;} 24 | inline int numpy_type(uint32_t) {return NPY_UINT32;} 25 | inline int numpy_type(uint64_t) {return NPY_UINT64;} 26 | inline int numpy_type(float) {return NPY_FLOAT32;} 27 | inline int numpy_type(double) {return NPY_FLOAT64;} 28 | 29 | template 30 | inline int numpy_type(T&) { 31 | throw std::invalid_argument("Unknown type for numpy array."); 32 | return 0; 33 | } 34 | 35 | 36 | template 37 | VecView view_1d_array(const bp::numeric::array& arr) { 38 | PyArrayObject *obj = reinterpret_cast(arr.ptr()); 39 | if (obj == 0) { 40 | throw 
std::invalid_argument("Could not covert bp::numeric::array to 1d numpy array."); 41 | } 42 | if (PyArray_DESCR(obj)->elsize != sizeof(T)) { 43 | throw std::invalid_argument("Numpy 1d array type does not match template type."); 44 | } 45 | if (PyArray_NDIM(obj) != 1) { 46 | throw std::length_error("Numpy array must be 1D."); 47 | } 48 | int length = PyArray_DIM(obj, 0); 49 | int stride = PyArray_STRIDE(obj, 0) / sizeof(T); 50 | npy_intp ind[1] = {0}; 51 | T *data = reinterpret_cast(PyArray_GetPtr(obj, ind)); 52 | return VecView(data, length, stride); 53 | } 54 | 55 | 56 | template 57 | MatView view_2d_array(const bp::numeric::array& arr) { 58 | PyArrayObject *obj = reinterpret_cast(arr.ptr()); 59 | if (obj == 0) { 60 | throw std::invalid_argument("Could not covert bp::numeric::array to 2d numpy array."); 61 | } 62 | if (PyArray_DESCR(obj)->elsize != sizeof(T)) { 63 | throw std::invalid_argument("Numpy 2d array type does not match template type."); 64 | } 65 | if (PyArray_NDIM(obj) != 2) { 66 | throw std::length_error("Numpy array must be 2D."); 67 | } 68 | int length1 = PyArray_DIM(obj, 0); 69 | int length2 = PyArray_DIM(obj, 1); 70 | int stride1 = PyArray_STRIDE(obj, 0) / sizeof(T); 71 | int stride2 = PyArray_STRIDE(obj, 1) / sizeof(T); 72 | npy_intp ind[2] = {0, 0}; 73 | T *data = reinterpret_cast(PyArray_GetPtr(obj, ind)); 74 | return MatView(data, length1, length2, stride1, stride2); 75 | } 76 | 77 | 78 | template 79 | bp::numeric::array new_numpy_1d(int n) { 80 | npy_intp dims[1] = {n}; 81 | PyArrayObject *obj = reinterpret_cast(PyArray_SimpleNew(1, dims, numpy_type(T()))); 82 | if (obj == 0) throw std::runtime_error("Call to PyArray_SimpleNew() failed."); 83 | bp::handle<> handle(reinterpret_cast(obj)); 84 | return bp::numeric::array(handle); 85 | } 86 | 87 | 88 | template 89 | bp::numeric::array new_numpy_2d(int n, int m) { 90 | npy_intp dims[2] = {n, m}; 91 | PyArrayObject *obj = reinterpret_cast(PyArray_SimpleNew(2, dims, numpy_type(T()))); 92 | if (obj == 
0) throw std::runtime_error("Call to PyArray_SimpleNew() failed."); 93 | bp::handle<> handle(reinterpret_cast(obj)); 94 | return bp::numeric::array(handle); 95 | } 96 | 97 | 98 | template 99 | bp::numeric::array vector_to_numpy(const VecView& vec) { 100 | bp::numeric::array arr = new_numpy_1d(vec.size()); 101 | VecView lhs = view_1d_array(arr); 102 | std::copy(vec.begin(), vec.end(), lhs.begin()); 103 | return arr; 104 | } 105 | 106 | 107 | template 108 | bp::numeric::array matrix_to_numpy(const MatView& mat) { 109 | bp::numeric::array arr = new_numpy_2d(mat.size1(), mat.size2()); 110 | MatView lhs = view_2d_array(arr); 111 | for (size_t i = 0; i < mat.size1(); ++i) { 112 | std::copy(mat.row_begin(i), mat.row_end(i), lhs.row_begin(i)); 113 | } 114 | return arr; 115 | } 116 | 117 | 118 | #endif /* VIEW_NUMPY_ARRAYS_H */ 119 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/pair_align/mm_align.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | using namespace std; 4 | 5 | namespace PairAlign { 6 | 7 | int32_t MMAlign::processBlock(int xpos1, int xpos2, int ypos1, int ypos2, 8 | int32_t m1, int32_t iy1, int32_t m2, int32_t iy2) { 9 | int len = xpos2 - xpos1 + 2; 10 | int mid = (ypos2 + ypos1 + 1) / 2; 11 | processUp(xpos1, xpos2, ypos1, mid, m1, iy1); 12 | lastM.swap(buffM); 13 | lastIy.swap(buffIy); 14 | processDown(xpos1, xpos2, mid, ypos2, m2, iy2); 15 | 16 | // Find alignment point. 
17 | int pos = 0; 18 | int32_t maxScore = ZERO_PROB_SCORE; 19 | bool isMatch = false; 20 | for (int i = 0; i < len; ++i) { 21 | int dpos = len - i; 22 | int32_t mScore = ZERO_PROB_SCORE; 23 | if (i + xpos1 > 0) mScore = buffM[i] + lastM[dpos] - subMatrix(seq1[mid], seq2[i + xpos1 - 1]); 24 | int32_t deltay = openGapy; 25 | if (i + xpos1 == 0) deltay = startGapy; 26 | if (i + xpos1 == int(seq2.Size())) deltay = endGapy; 27 | int32_t yScore = buffIy[i] + lastIy[dpos - 1] - deltay; 28 | int32_t score = max(mScore, yScore); 29 | if (score > maxScore) { 30 | maxScore = score; 31 | pos = i; 32 | isMatch = (mScore >= yScore); 33 | } 34 | } 35 | 36 | // Push alignment position (if they aligned at this midline). 37 | if (isMatch) { 38 | matches.push_back(AlignPos(mid, xpos1 + pos - 1)); 39 | } 40 | 41 | // Set up next blocks. 42 | int32_t newm1 = isMatch ? buffM[pos] : ZERO_PROB_SCORE; 43 | int32_t newiy1 = isMatch ? ZERO_PROB_SCORE : buffIy[pos]; 44 | int dpos = len - pos; 45 | int32_t newm2 = isMatch ? lastM[dpos] : ZERO_PROB_SCORE; 46 | int32_t newiy2 = isMatch ? ZERO_PROB_SCORE : lastIy[dpos - 1]; 47 | 48 | // Do new lower block. 49 | int newxpos2 = pos + xpos1 - 1; 50 | if (isMatch) --newxpos2; 51 | if (mid > ypos1 && newxpos2 >= xpos1) { 52 | processBlock(xpos1, newxpos2, ypos1, mid - 1, m1, iy1, newm2, newiy2); 53 | } 54 | 55 | // Do new upper block. 
56 | int newxpos1 = pos + xpos1; 57 | if (mid < ypos2 && newxpos1 <= xpos2) { 58 | processBlock(pos + xpos1, xpos2, mid + 1, ypos2, newm1, newiy1, m2, iy2); 59 | } 60 | return maxScore; 61 | } 62 | 63 | 64 | void MMAlign::processUp(int xpos1, int xpos2, int ypos1, int ypos2, 65 | int32_t m, int32_t iy) { 66 | View view1 = View(seq1, ypos1, ypos2 - ypos1 + 1, 1); 67 | View view2 = View(seq2, xpos1, xpos2 - xpos1 + 1, 1); 68 | int32_t gx1 = openGapx, gx2 = openGapx; 69 | int32_t hx1 = extendGapx, hx2 = extendGapx; 70 | if (ypos1 == 0) { 71 | gx1 = startGapx; 72 | hx1 = startGapx; 73 | } 74 | if (ypos2 == int(seq1.Size()) - 1) { 75 | gx2 = endGapx; 76 | hx2 = endGapx; 77 | } 78 | int32_t gy1 = openGapy, gy2 = openGapy; 79 | int32_t hy1 = extendGapy, hy2 = extendGapy; 80 | if (xpos1 == 0) { 81 | gy1 = startGapy; 82 | hy1 = startGapy; 83 | } 84 | if (xpos2 == int(seq2.Size()) - 1) { 85 | gy2 = endGapy; 86 | hy2 = endGapy; 87 | } 88 | int lenx = xpos2 - xpos1 + 2; 89 | int leny = ypos2 - ypos1 + 2; 90 | hmm(view1, view2, lenx, leny, m, iy, gx1, hx1, gx2, hx2, gy1, hy1, gy2, hy2); 91 | } 92 | 93 | 94 | void MMAlign::processDown(int xpos1, int xpos2, int ypos1, int ypos2, 95 | int32_t m, int32_t iy) { 96 | View view1 = View(seq1, ypos2, ypos2 - ypos1 + 1, -1); 97 | View view2 = View(seq2, xpos2, xpos2 - xpos1 + 1, -1); 98 | int32_t gx1 = openGapx, gx2 = openGapx; 99 | int32_t hx1 = extendGapx, hx2 = extendGapx; 100 | if (ypos1 == 0) { 101 | gx2 = startGapx; 102 | hx2 = startGapx; 103 | } 104 | if (ypos2 == int(seq1.Size()) - 1) { 105 | gx1 = endGapx; 106 | hx1 = endGapx; 107 | } 108 | int32_t gy1 = openGapy, gy2 = openGapy; 109 | int32_t hy1 = extendGapy, hy2 = extendGapy; 110 | if (xpos1 == 0) { 111 | gy2 = startGapy; 112 | hy2 = startGapy; 113 | } 114 | if (xpos2 == int(seq2.Size()) - 1) { 115 | gy1 = endGapy; 116 | hy1 = endGapy; 117 | } 118 | int lenx = xpos2 - xpos1 + 2; 119 | int leny = ypos2 - ypos1 + 2; 120 | hmm(view1, view2, lenx, leny, m, iy, gx1, hx1, gx2, hx2, 
gy1, hy1, gy2, hy2); 121 | } 122 | 123 | 124 | void MMAlign::hmm(const View& view1, const View& view2, int lenx, int leny, 125 | int32_t m, int32_t iy, int32_t gx1, int32_t hx1, int32_t gx2, 126 | int32_t hx2, int32_t gy1, int32_t hy1, int32_t gy2, int32_t hy2) { 127 | lastM[0] = m; 128 | lastIy[0] = iy; 129 | lastIx[0] = ZERO_PROB_SCORE; 130 | for (int j = 1; j < lenx; ++j) { 131 | lastM[j] = ZERO_PROB_SCORE; 132 | lastIy[j] = ZERO_PROB_SCORE; 133 | if (j == 1) lastIx[j] = max(lastM[0], lastIy[0]) + gx1; 134 | else lastIx[j] = lastIx[j - 1] + hx1; 135 | } 136 | for (int i = 1; i < leny; ++i) { 137 | M[0] = ZERO_PROB_SCORE; 138 | Ix[0] = ZERO_PROB_SCORE; 139 | Iy[0] = max(lastIy[0] + hy1, lastM[0] + gy1); 140 | int32_t gx = (i == leny - 1) ? gx2 : openGapx; 141 | int32_t hx = (i == leny - 1) ? hx2 : extendGapx; 142 | for (int j = 1; j < lenx; ++j) { 143 | M[j] = TripleMax(lastM[j - 1], lastIx[j - 1], lastIy[j - 1]); 144 | M[j] += subMatrix(view1[i - 1], view2[j - 1]); 145 | int32_t gy = (j == lenx - 1) ? gy2 : openGapy; 146 | int32_t hy = (j == lenx - 1) ? 
hy2 : extendGapy; 147 | Iy[j] = TripleMax(lastM[j] + gy, lastIx[j] + gy, lastIy[j] + hy); 148 | Ix[j] = TripleMax(M[j - 1] + gx, Ix[j - 1] + hx, Iy[j - 1] + gx); 149 | } 150 | M.swap(lastM); 151 | Ix.swap(lastIx); 152 | Iy.swap(lastIy); 153 | } 154 | } 155 | 156 | 157 | void MMAlign::makeAlignment(vector& alignment) { 158 | alignment.clear(); 159 | sort(matches.begin(), matches.end()); 160 | int lastx = -1, lasty = -1; 161 | for (size_t i = 0; i < matches.size(); ++i) { 162 | int x = matches[i].Pos2; 163 | int y = matches[i].Pos1; 164 | if (y > lasty + 1) { 165 | for (int p = lasty + 1; p < y; ++p) { 166 | alignment.push_back(AlignPos(p, -1)); 167 | } 168 | lasty = y - 1; 169 | } 170 | if (x > lastx + 1) { 171 | for (int p = lastx + 1; p < x; ++p) { 172 | alignment.push_back(AlignPos(-1, p)); 173 | } 174 | lastx = x - 1; 175 | } 176 | alignment.push_back(AlignPos(y, x)); 177 | lastx = x; 178 | lasty = y; 179 | } 180 | if (lasty < int(seq1.Size()) - 1) { 181 | for (int p = lasty + 1; p < int(seq1.Size()); ++p) { 182 | alignment.push_back(AlignPos(p, -1)); 183 | } 184 | } 185 | if (lastx < int(seq2.Size()) - 1) { 186 | for (int p = lastx + 1; p < int(seq2.Size()); ++p) { 187 | alignment.push_back(AlignPos(-1, p)); 188 | } 189 | } 190 | } 191 | 192 | 193 | } /* namespace PairAlign */ 194 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/pair_align/mm_align.h: -------------------------------------------------------------------------------- 1 | #ifndef MM_ALIGN_H 2 | #define MM_ALIGN_H 3 | 4 | #include 5 | 6 | namespace PairAlign { 7 | 8 | /// Helper class representing a view of a range of a vector. 
9 | template 10 | class View { 11 | private: 12 | const std::vector *data; 13 | size_t start, size; 14 | int dir; 15 | 16 | void check(size_t a, size_t b) { 17 | if (start < a || start > b) { 18 | throw std::runtime_error("Error: View start is out of range."); 19 | } 20 | size_t end = start + size_t((int(size) - 1) * dir); 21 | if (end < a || end > b) { 22 | throw std::runtime_error("Error: View end is out of range."); 23 | } 24 | if (dir != 1 && dir != -1) { 25 | throw std::runtime_error("Error: Stride value makes no sense."); 26 | } 27 | } 28 | 29 | public: 30 | /// Default constructor. 31 | View() {} 32 | 33 | /** Construct a subview. 34 | * @param[in] rhs View object to view contents of. 35 | * @param[in] begin Start position for new view. 36 | * @param[in] len Length of new view. 37 | * @param[in] dir Either 1 or -1. Indicates direction of view. 38 | */ 39 | View(const View& rhs, int begin, int len, int d) : 40 | data(rhs.data), 41 | start(rhs.start + begin * rhs.dir), 42 | size(len), 43 | dir(d * rhs.dir) { 44 | check(rhs.start, rhs.start + (rhs.size - 1) * rhs.dir); 45 | } 46 | 47 | /** Construct a view of a std::vector. 48 | * @param[in] x Vector to be viewed. 49 | * @param[in] begin Start position of view. 50 | * @param[in] len Length of view. 51 | * @param[in] dir Either 1 or -1. Indicates direction of view. 52 | */ 53 | View(const std::vector& x, int begin = 0, int len = 0, int d = 1) : 54 | data(&x), 55 | start(begin), 56 | size(len == 0 ? x.size() : len), 57 | dir(d) { 58 | check(0, x.size() - 1); 59 | } 60 | 61 | /// Indexing operator. 62 | const T& operator[](int n) const {return (dir == 1) ? (*data)[start + n] : (*data)[start - n];} 63 | 64 | /// Returns the lenght of the view. 65 | size_t Size() const {return size;} 66 | }; 67 | 68 | 69 | /** Myers-Miller implementation supporting gap-extension. 
70 | * Note that this is approximately 2x slower than the 71 | * Needleman-Wunsch implementation, but only requires 72 | * linear memory instead of quadratic. 73 | */ 74 | class MMAlign : public Aligner { 75 | private: 76 | const ublas::matrix& subMatrix; 77 | std::vector M, lastM, buffM; 78 | std::vector Iy, lastIy, buffIy; 79 | std::vector Ix, lastIx; 80 | std::vector matches; 81 | View seq1; 82 | View seq2; 83 | int32_t startGapx, endGapx, openGapx, extendGapx; 84 | int32_t startGapy, endGapy, openGapy, extendGapy; 85 | 86 | int32_t processBlock(int xpos1, int xpos2, int ypos1, int ypos2, 87 | int32_t m1, int32_t iy1, int32_t m2, int32_t iy2); 88 | void processUp(int xpos1, int xpos2, int ypos1, int ypos2, 89 | int32_t m, int32_t iy); 90 | void processDown(int xpos1, int xpos2, int ypos1, int ypos2, 91 | int32_t m, int32_t iy); 92 | void hmm(const View& view1, const View& view2, int lenx, int leny, 93 | int32_t m, int32_t iy, int32_t gx1, int32_t hx1, int32_t gx2, 94 | int32_t hx2, int32_t gy1, int32_t hy1, int32_t gy2, int32_t hy2); 95 | void makeAlignment(std::vector& alignment); 96 | 97 | public: 98 | /** Constructor. 99 | * @param[in] subMat Substitution matrix. Note this stores a reference. Beware of lifetime. 100 | * @param[in] gaps Vector of gap penalties (length 8). 101 | * 102 | * The 8 values in the gap penalty vector should be as follows: 103 | * start_gap1 Penalty for aligning sequence 1 to a gap before sequence 2. 104 | * end_gap1 Penalty for aligning sequence 1 to a gap after sequence 2. 105 | * open_gap1 Penalty for aligning sequence 1 to a new gap within sequence 1. 106 | * extend_gap1 Penalty for extending a gap within sequence 2. 107 | * start_gap2 Penalty for aligning sequence 2 to a gap before sequence 1. 108 | * end_gap2 Penalty for aligning sequence 2 to a gap after sequence 1. 109 | * open_gap2 Penalty for aligning sequence 2 to a new gap within sequence 1. 110 | * extend_gap2 Penalty for extending a gap within sequence 1. 
111 | */ 112 | MMAlign(const ublas::matrix& subMat, const std::vector& gaps) : subMatrix(subMat) { 113 | startGapx = -gaps[0]; 114 | endGapx = -gaps[1]; 115 | openGapx = -gaps[2]; 116 | extendGapx = -gaps[3]; 117 | startGapy = -gaps[4]; 118 | endGapy = -gaps[5]; 119 | openGapy = -gaps[6]; 120 | extendGapy = -gaps[7]; 121 | } 122 | 123 | /** Align two sequences. 124 | * @param[in] sequence1 First sequence of states. 125 | * @param[in] sequence2 Second sequence of states. 126 | * @param[out] alignment Vector to hold the resulting alignment. 127 | * @return The alignment score. This is not normalized. 128 | * 129 | * The alignment object will contain one entry per alignment position. Any contents it 130 | * had before the call will be lost. Each entry contains the indexes of the two sequence 131 | * elements that align to that position. If one sequence has aligned to a gap at that 132 | * position, the value for the other sequence will be -1. 133 | * 134 | * Note that the two sequences must contain only values from 0 to n-1, where n is the 135 | * size of the nxn substitution matrix. 
136 | */ 137 | int32_t Align(const std::vector& sequence1, const std::vector& sequence2, 138 | std::vector& alignment) { 139 | seq1 = View(sequence1); 140 | seq2 = View(sequence2); 141 | int len1 = int(sequence1.size()); 142 | int len2 = int(sequence2.size()); 143 | M.clear(); 144 | lastM.clear(); 145 | buffM.clear(); 146 | Ix.clear(); 147 | lastIx.clear(); 148 | Iy.clear(); 149 | lastIy.clear(); 150 | buffIy.clear(); 151 | matches.clear(); 152 | M.resize(len2 + 1); 153 | lastM.resize(len2 + 1); 154 | buffM.resize(len2 + 1); 155 | Ix.resize(len2 + 1); 156 | lastIx.resize(len2 + 1); 157 | Iy.resize(len2 + 1); 158 | lastIy.resize(len2 + 1); 159 | buffIy.resize(len2 + 1); 160 | int32_t score = processBlock(0, len2 - 1, 0, len1 - 1, 0, 0, 0, 0); 161 | makeAlignment(alignment); 162 | return score; 163 | } 164 | }; 165 | 166 | } /* namespace PairAlign */ 167 | 168 | 169 | #endif /* MM_ALIGN_H */ 170 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/pair_align/nw_align.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace std; 5 | 6 | 7 | namespace PairAlign { 8 | 9 | int32_t NWAlign::Align(const vector& sequence1, const vector& sequence2, 10 | vector& alignment) { 11 | int len1 = int(sequence1.size()); 12 | int len2 = int(sequence2.size()); 13 | diagScores.resize(len1 + 1, len2 + 1, false); 14 | upScores.resize(len1 + 1, len2 + 1, false); 15 | rightScores.resize(len1 + 1, len2 + 1, false); 16 | diagScores(0, 0) = 0; 17 | upScores(0, 0) = 0; 18 | rightScores(0, 0) = 0; 19 | // Fill in the left column. This is events from sequence 1 aligning before 20 | // the beginning of sequence 2. 21 | for (int i = 1; i <= len1; ++i) { 22 | upScores(i, 0) = upScores(i - 1, 0) + startGapy; 23 | diagScores(i, 0) = ZERO_PROB_SCORE; 24 | rightScores(i, 0) = ZERO_PROB_SCORE; 25 | } 26 | // Fill in the bottom row. 
This is events from sequence 2 aligning before 27 | // the beginning of sequence 1. 28 | for (int j = 1; j <= len2; ++j) { 29 | rightScores(0, j) = rightScores(0, j - 1) + startGapx; 30 | diagScores(0, j) = ZERO_PROB_SCORE; 31 | upScores(0, j) = ZERO_PROB_SCORE; 32 | } 33 | // Fill in the main body, but not the right column or top row. 34 | for (int i = 1; i < len1; ++i) { 35 | for (int j = 1; j < len2; ++j) { 36 | int32_t mismatch = subMatrix(sequence1[i - 1], sequence2[j - 1]); 37 | processNode(i, j, openGapx, extendGapx, openGapy, extendGapy, mismatch); 38 | } 39 | } 40 | // Fill in the top row. This is events from sequence 2 aligning to or after 41 | // the end of sequence 1. 42 | for (int j = 1; j < len2; ++j) { 43 | int32_t mismatch = subMatrix(sequence1[len1 - 1], sequence2[j - 1]); 44 | processNode(len1, j, endGapx, endGapx, openGapy, extendGapy, mismatch); 45 | } 46 | // Fill in the right column. This is events from sequence 1 aligning after 47 | // the end of sequence 2. 48 | for (int i = 1; i < len1; ++i) { 49 | int32_t mismatch = subMatrix(sequence1[i - 1], sequence2[len2 - 1]); 50 | processNode(i, len2, openGapx, extendGapx, endGapy, endGapy, mismatch); 51 | } 52 | // Fill in the top-right node. 
53 | int32_t mismatch = subMatrix(sequence1[len1 - 1], sequence2[len2 - 1]); 54 | processNode(len1, len2, endGapx, endGapx, endGapy, endGapy, mismatch); 55 | backtrace(alignment); 56 | return TripleMax(diagScores(len1, len2), upScores(len1, len2), rightScores(len1, len2)); 57 | } 58 | 59 | 60 | void NWAlign::backtrace(vector& alignment) { 61 | alignment.clear(); 62 | size_t i = diagScores.size1() - 1, j = diagScores.size2() - 1; 63 | while (i > 0 || j > 0) { 64 | int dir = TripleMaxIndex(diagScores(i, j), upScores(i, j), rightScores(i, j)); 65 | switch(dir) { 66 | case 0: 67 | alignment.push_back(AlignPos(--i, --j)); 68 | break; 69 | case 1: 70 | alignment.push_back(AlignPos(--i, -1)); 71 | break; 72 | case 2: 73 | alignment.push_back(AlignPos(-1, --j)); 74 | break; 75 | default: 76 | throw runtime_error("Error: Invalid result in backtrace."); 77 | } 78 | } 79 | reverse(alignment.begin(), alignment.end()); 80 | } 81 | 82 | 83 | } /* namespace PairAlign */ 84 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/pair_align/nw_align.h: -------------------------------------------------------------------------------- 1 | #ifndef NW_ALIGN_H 2 | #define NW_ALIGN_H 3 | 4 | #include 5 | 6 | namespace PairAlign { 7 | 8 | 9 | /** Needleman-Wunsch implementation supporting gap-extension. 10 | * Note that this implementation is optimized for speed, but 11 | * is quadratic in memory. For aligning long sequences use 12 | * the Myers-Miller implementation instead. 13 | */ 14 | class NWAlign : public Aligner { 15 | private: 16 | const ublas::matrix& subMatrix; 17 | ublas::matrix diagScores; 18 | ublas::matrix upScores; 19 | ublas::matrix rightScores; 20 | int32_t startGapy, endGapy, openGapy, extendGapy; 21 | int32_t startGapx, endGapx, openGapx, extendGapx; 22 | 23 | void processNode(int i, int j, int32_t ogx, int32_t egx, int32_t ogy, int32_t egy, int32_t m) { 24 | // Find the best diagonal movement score. 
25 | int32_t score = TripleMax(diagScores(i - 1, j - 1), rightScores(i - 1, j - 1), upScores(i - 1, j - 1)); 26 | // Find the best upward movement score. 27 | int32_t upScore1 = diagScores(i - 1, j) + ogy; 28 | int32_t upScore2 = rightScores(i - 1, j) + ogy; 29 | int32_t upScore3 = upScores(i - 1, j) + egy; 30 | // Find the best right movement score. 31 | int32_t rightScore1 = diagScores(i, j - 1) + ogx; 32 | int32_t rightScore2 = upScores(i, j - 1) + ogx; 33 | int32_t rightScore3 = rightScores(i, j - 1) + egx; 34 | diagScores(i, j) = score + m; 35 | upScores(i, j) = TripleMax(upScore1, upScore2, upScore3); 36 | rightScores(i, j) = TripleMax(rightScore1, rightScore2, rightScore3); 37 | } 38 | 39 | void backtrace(std::vector& alignment); 40 | 41 | public: 42 | /** Constructor. 43 | * @param[in] subMat Substitution matrix. Note this stores a reference. Beware of lifetime. 44 | * @param[in] gaps Vector of gap penalties (length 8). 45 | * 46 | * The 8 values in the gap penalty vector should be as follows: 47 | * start_gap1 Penalty for aligning sequence 1 to a gap before sequence 2. 48 | * end_gap1 Penalty for aligning sequence 1 to a gap after sequence 2. 49 | * open_gap1 Penalty for aligning sequence 1 to a new gap within sequence 1. 50 | * extend_gap1 Penalty for extending a gap within sequence 2. 51 | * start_gap2 Penalty for aligning sequence 2 to a gap before sequence 1. 52 | * end_gap2 Penalty for aligning sequence 2 to a gap after sequence 1. 53 | * open_gap2 Penalty for aligning sequence 2 to a new gap within sequence 1. 54 | * extend_gap2 Penalty for extending a gap within sequence 1. 55 | */ 56 | NWAlign(const ublas::matrix& subMat, const std::vector& gaps) : subMatrix(subMat) { 57 | startGapx = -gaps[0]; 58 | endGapx = -gaps[1]; 59 | openGapx = -gaps[2]; 60 | extendGapx = -gaps[3]; 61 | startGapy = -gaps[4]; 62 | endGapy = -gaps[5]; 63 | openGapy = -gaps[6]; 64 | extendGapy = -gaps[7]; 65 | } 66 | 67 | /** Align two sequences. 
68 | * @param[in] sequence1 First sequence of states. 69 | * @param[in] sequence2 Second sequence of states. 70 | * @param[out] alignment Vector to hold the resulting alignment. 71 | * @return The alignment score. This is not normalized. 72 | * 73 | * The alignment object will contain one entry per alignment position. Any contents it 74 | * had before the call will be lost. Each entry contains the indexes of the two sequence 75 | * elements that align to that position. If one sequence has aligned to a gap at that 76 | * position, the value for the other sequence will be -1. 77 | * 78 | * Note that the two sequences must contain only values from 0 to n-1, where n is the 79 | * size of the nxn substitution matrix. 80 | */ 81 | int32_t Align(const std::vector& sequence1, const std::vector& sequence2, 82 | std::vector& alignment); 83 | }; 84 | 85 | } /* namespace PairAlign */ 86 | 87 | 88 | #endif /* NW_ALIGN_H */ 89 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/pair_align/pair_align.h: -------------------------------------------------------------------------------- 1 | #ifndef PAIR_ALIGN_H 2 | #define PAIR_ALIGN_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace ublas = boost::numeric::ublas; 10 | 11 | /// Namespace for pairwise alignment code. 12 | namespace PairAlign { 13 | 14 | /// Magic number representing log of zero (-INF). 15 | static const int32_t ZERO_PROB_SCORE = -1000000000; 16 | 17 | /// Helper function for the max of three values. 18 | inline int32_t TripleMax(int32_t a, int32_t b, int32_t c) { 19 | return (a >= b) ? ((a >= c) ? a : c) : ((b >= c) ? b : c); 20 | } 21 | 22 | /// Helper function for the index of the max of three values. 23 | inline int TripleMaxIndex(int32_t a, int32_t b, int32_t c) { 24 | return (a >= b) ? ((a >= c) ? 0 : 2) : ((b >= c) ? 1 : 2); 25 | } 26 | 27 | /// Helper struct for representing a position in an alignment. 
/// Helper struct for representing a position in an alignment.
/// Pos1/Pos2 index the two sequences; -1 denotes a gap.
struct AlignPos {
    int Pos1;
    int Pos2;

    /// Constructor.
    AlignPos(int p1 = 0, int p2 = 0) : Pos1(p1), Pos2(p2) {}

    /// Comparison operator. Sorts by first position, then by second; a gap
    /// (-1) on either side falls back to comparing the second positions.
    /// NOTE(review): when -1 entries are mixed with differing Pos1 values this
    /// is not a strict weak ordering -- confirm sorted inputs avoid that mix.
    bool operator<(const AlignPos& rhs) const {
        const bool comparePos2 = (Pos1 == -1) || (rhs.Pos1 == -1) || (Pos1 == rhs.Pos1);
        return comparePos2 ? (Pos2 < rhs.Pos2) : (Pos1 < rhs.Pos1);
    }
};


/// Abstract base class for pairwise alignment implementations.
class Aligner {
public:
    /// Destructor.
    virtual ~Aligner() {}

    /** Align two sequences.
     * @param[in] sequence1 First sequence of states.
     * @param[in] sequence2 Second sequence of states.
     * @param[out] alignment One entry per alignment position; any previous
     *             contents are lost. Each entry holds the indexes of the two
     *             aligned elements, with -1 where a sequence has a gap.
     * @return The alignment score. This is not normalized.
     *
     * Sequence values must lie in [0, n) where the substitution matrix is n x n.
     */
    virtual int32_t Align(const std::vector<int>& sequence1, const std::vector<int>& sequence2,
                          std::vector<AlignPos>& alignment) = 0;
};
62 | */ 63 | virtual int32_t Align(const std::vector& sequence1, const std::vector& sequence2, 64 | std::vector& alignment) = 0; 65 | }; 66 | 67 | 68 | } /* namespace PairAlign */ 69 | 70 | 71 | #endif /* PAIR_ALIGN_H */ 72 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/pair_align/pair_align_py.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | using ublas::matrix; 7 | 8 | 9 | template 10 | void list_to_vector(bp::list& in, vector& out) { 11 | out.clear(); 12 | int count = bp::len(in); 13 | out.resize(count); 14 | for (int i = 0; i < count; ++i) { 15 | out[i] = bp::extract(in[i]); 16 | } 17 | } 18 | 19 | template 20 | void list_to_matrix(bp::list& in, matrix& out) { 21 | out.clear(); 22 | int nrows = bp::len(in); 23 | int ncols = bp::len(bp::extract(in[0])); 24 | out.resize(nrows, ncols); 25 | for (int i = 0; i < nrows; ++i) { 26 | bp::list row = bp::extract(in[i]); 27 | if (bp::len(row) != ncols) { 28 | throw runtime_error("Error: Not all columns are the same length."); 29 | } 30 | for (int j = 0; j < ncols; ++j) { 31 | out(i, j) = bp::extract(row[j]); 32 | } 33 | } 34 | } 35 | 36 | 37 | PairAlign_Py::PairAlign_Py(bp::list& subMat, bp::list& gapPen, bool lowmem) { 38 | list_to_matrix(subMat, subMatrix); 39 | vector gapPenalties; 40 | list_to_vector(gapPen, gapPenalties); 41 | if (lowmem) { 42 | aligner = boost::shared_ptr(new PairAlign::MMAlign(subMatrix, gapPenalties)); 43 | } 44 | else { 45 | aligner = boost::shared_ptr(new PairAlign::NWAlign(subMatrix, gapPenalties)); 46 | } 47 | } 48 | 49 | 50 | bp::tuple PairAlign_Py::Align(bp::list& sequence1, bp::list& sequence2) { 51 | vector alignVec; 52 | vector seq1, seq2; 53 | list_to_vector(sequence1, seq1); 54 | list_to_vector(sequence2, seq2); 55 | int32_t score = aligner->Align(seq1, seq2, alignVec); 56 | bp::list alignment; 57 | for (size_t i = 0; 
i < alignVec.size(); ++i) { 58 | alignment.append(bp::make_tuple(alignVec[i].Pos1, alignVec[i].Pos2)); 59 | } 60 | return bp::make_tuple(alignment, score); 61 | } 62 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/pair_align/pair_align_py.h: -------------------------------------------------------------------------------- 1 | #ifndef PAIR_ALIGN_PY_H 2 | #define PAIR_ALIGN_PY_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | namespace bp = boost::python; 10 | namespace ublas = boost::numeric::ublas; 11 | 12 | 13 | /// Boost-Python wrapper class for pairwise alignment classes. 14 | class PairAlign_Py { 15 | private: 16 | ublas::matrix subMatrix; 17 | boost::shared_ptr aligner; 18 | 19 | public: 20 | /** Constructor 21 | * @param[in] subMat Substitution matrix. List of lists. 22 | * @param[in] gapPenalties List of gap penalties (length 8). 23 | * @param[in] lowmem Flag indicating whether to use the faster Neddleman-Wunsch 24 | * implementation or the slower linear-memory Myers-Miller 25 | * implementation. 26 | * 27 | * The 8 values in the gap penalty list should be as follows: 28 | * start_gap1 Penalty for aligning to a gap before sequence 1. 29 | * end_gap1 Penalty for aligning to a gap after sequence 1. 30 | * open_gap1 Penalty for opening a gap within sequence 1. 31 | * extend_gap1 Penalty for extending a gap within sequence 1. 32 | * start_gap2 Penalty for aligning to a gap before sequence 2. 33 | * end_gap2 Penalty for aligning to a gap after sequence 2. 34 | * open_gap2 Penalty for opening a gap within sequence 2. 35 | * extend_gap2 Penalty for extending a gap within sequence 2. 36 | */ 37 | PairAlign_Py(bp::list& subMat, bp::list& gapPenalties, bool lowmem); 38 | 39 | /** Align two sequences. 40 | * @param[in] sequence1 First sequence of states. 41 | * @param[in] sequence2 Second sequence of states. 
42 | * @return Tuple containing a list holding the resulting alignment 43 | * and an alignment score. This is not normalized. 44 | * 45 | * The alignment object will contain one entry per alignment position. Each entry 46 | * is a tuple containing the indexes of the two sequence elements that align to that 47 | * position. If one sequence has aligned to a gap at that position, the value for 48 | * the other sequence will be -1. 49 | * 50 | * Note that the two sequences must contain only values from 0 to n-1, where n is the 51 | * size of the nxn substitution matrix. 52 | */ 53 | bp::tuple Align(bp::list& sequence1, bp::list& sequence2); 54 | }; 55 | 56 | 57 | 58 | /// Python class wrapper. 59 | BOOST_PYTHON_MODULE(pair_align) { 60 | bp::class_("Aligner", bp::init(bp::args("sub_matrix", "gap_penalties", "lowmem"))) 61 | .def("align", &PairAlign_Py::Align); 62 | } 63 | 64 | 65 | #endif /* PAIR_ALIGN_PY_H */ 66 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/viterbi_2d/viterbi_2d.h: -------------------------------------------------------------------------------- 1 | #ifndef VITERBI_2D_H 2 | #define VITERBI_2D_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace ublas = boost::numeric::ublas; 11 | 12 | static const int8_t MOVE_DIAG = 0; 13 | static const int8_t MOVE_RIGHT = 1; 14 | static const int8_t MOVE_UP = 2; 15 | static const int8_t MOVE_UNDEF = 3; 16 | static const int32_t ZERO_PROB_SCORE = -1000000000; 17 | static const double MIN_EMISSION_SCORE = -20.0; 18 | 19 | 20 | inline double square(double x) { 21 | return x * x; 22 | } 23 | 24 | inline int32_t prob2score(double x) { 25 | if (x < 0.0000000001) return -2400; 26 | return int32_t(100.0 * log(x)); 27 | } 28 | 29 | 30 | /// Helper class representing a node in the HMM. 31 | struct Node { 32 | int32_t index1; ///< Index of event from first sequence. 33 | int32_t index2; ///< Index of event from second sequence. 
34 | int32_t leftIndex; ///< Index of node to the left of this one. 35 | int32_t downIndex; ///< Index of node below this one. 36 | int32_t diagIndex; ///< Index of node diagonal to this one. 37 | std::vector statePointers; ///< Viterbi backtrace pointers. 38 | std::vector dirPointers; ///< NW alignment backtrace pointers. 39 | 40 | /** Initialize node. 41 | * @param[in] i Index of event from first sequence. 42 | * @param[in] j index of event from second sequence. 43 | * @param[in] left Index of node to the left of this one. 44 | * @param[in] down Index of node below this one. 45 | * @param[in] diag Index of node diagonal to this one. 46 | * @param[in] states Number of states in the HMM. 47 | */ 48 | void Init(int i, int j, int left, int down, int diag, int states) { 49 | index1 = i; 50 | index2 = j; 51 | leftIndex = left; 52 | downIndex = down; 53 | diagIndex = diag; 54 | statePointers.resize(states); 55 | dirPointers.resize(states); 56 | } 57 | }; 58 | 59 | 60 | /** Helper class for emission scores. 61 | * 62 | * This class provides normal level emissions and gamma distributed noise emissions. 63 | * Note that other emission objects can be substituted by changing the Emission typedef 64 | * immediately following this class definition. 65 | */ 66 | class DefaultEmission { 67 | private: 68 | std::vector levels; 69 | std::vector noises; 70 | std::vector logNoises; 71 | std::vector stayWeights; 72 | std::vector emWeights; 73 | std::vector modelLevels; 74 | std::vector modelNoises; 75 | std::vector offsets; 76 | std::vector levelScales; 77 | std::vector noiseScales; 78 | std::vector noiseShapes; 79 | int numEvents; 80 | int numStates; 81 | bool useNoise; 82 | 83 | public: 84 | /** Constructor. 85 | * @param[in] mdlLevels Model current levels. 86 | * @param[in] mdlLevelSpreads Spreads of model current levels. 87 | * @params[in] mdlNoises Model noise levels. 88 | * @param[in] mdlNoiseSpreads Spreads of model noise levels. 
89 | * @param[in] useSd Flag to specify whether to use noise levels in the basecall. 90 | */ 91 | DefaultEmission(const std::vector& mdlLevels, const std::vector& mdlLevelSpreads, 92 | const std::vector& mdlNoises, const std::vector& mdlNoiseSpreads, 93 | bool useSd); 94 | 95 | /** Assign events to the object with vectors. 96 | * @param[in] means Event current levels. 97 | * @param[in] stdvs Event noise levels. 98 | * @param[in] stayWts Event weights for modifying stay probabilities. 99 | * @param[in] emWts Event weights for modifying emission probabilities. 100 | */ 101 | void SetEvents(const std::vector& means, const std::vector& stdvs, 102 | const std::vector& stayWts, const std::vector& emWts); 103 | 104 | /// Set the number of events (for when SetEvents() will not be called. 105 | void SetNEvents(int n) {numEvents = n;} 106 | 107 | /// Returns the number of events. 108 | int NumEvents() const {return numEvents;} 109 | 110 | /// Returns the number of model states. 111 | int NumStates() const {return numStates;} 112 | 113 | /// Returns the model levels. 114 | const std::vector GetModelLevels() const {return modelLevels;} 115 | 116 | /// Returns the stay weights. 117 | const std::vector GetStayWeights() const {return stayWeights;} 118 | 119 | /// Returns the score for event i and state j. 120 | int32_t Score(int i, int j) const { 121 | double score = offsets[j] + levelScales[j] * square(levels[i] - modelLevels[j]); 122 | if (useNoise) score += (noiseShapes[j] - 1.0) * logNoises[i] - noiseScales[j] * noises[i]; 123 | return int32_t(emWeights[i] * std::max(MIN_EMISSION_SCORE, score)); 124 | } 125 | }; 126 | 127 | 128 | typedef DefaultEmission Emission; 129 | typedef std::vector > Alignment; 130 | 131 | 132 | /// Worker class for performing 2D Viterbi basecall. 133 | class Viterbi2D { 134 | private: 135 | std::vector nodes; // All HMM nodes, in the order they should be processed. 136 | int32_t baseStay[3]; // Stay scores for each direction. 
137 | int32_t baseStep[3]; // Step scores for each direction. 138 | int32_t baseSkip[3]; // Skip scores for each direction. 139 | ublas::matrix emScore1; // Pre-computed emissions for sequence 1. 140 | ublas::matrix emScore2; // Pre-computed emissions for sequence 2. 141 | ublas::matrix viterbiScore; // Viterbi scores. Length of sequence 1 by number of states. 142 | ublas::matrix lastScore; // Viterbi scores for previous event from sequence 2. 143 | int numStates; // Number of states in the HMM. 144 | int numNodes; // Total number of nodes to be processed. 145 | int numEvents1; // Number of events in sequence 1. 146 | int numEvents2; // Number of events in sequence 2. 147 | 148 | void initNodes(const std::vector& bandStarts, const std::vector& bandEnds); 149 | void processNodes(const std::vector& wts1, const std::vector& wts2); 150 | void backTrace(Alignment& alignment, std::vector& states); 151 | 152 | public: 153 | /** Constructor. 154 | * @param[in] maxNodes The maximum number of nodes to support. 155 | * @param[in] maxLen The maximum number of events to support for either sequence. 156 | * @param[in] states The number of states in the HMM. 157 | * @param[in] trans The six transition probabilities (stay1, step1, skip1, stay2, step2, skip2). 158 | */ 159 | Viterbi2D(int maxNodes, int maxLen, int states, const std::vector& trans); 160 | 161 | /** Perform the basecall with emission objects. 162 | * @param[in] data1 Emission object for sequence 1. 163 | * @param[in] data2 Emission object for sequence 2. 164 | * @param[in] bandStarts For each event in sequence 2, the first candidate position in sequence 1. 165 | * @param[in] bandEnds For each event in sequence 2, the last candidate position in sequence 1. 166 | * @param[in] priors The prior scores for the "before alignment" node. All zeros means no prior. 167 | * @param[out] alignment The final alignment of events. 168 | * @param[out] states The final basecalled states. 
169 | */ 170 | void Call(const Emission& data1, const Emission& data2, const std::vector& bandStarts, 171 | const std::vector& bandEnds, const std::vector& priors, 172 | Alignment& alignment, std::vector& states); 173 | 174 | /** Perform the basecall with precomputed emissions. 175 | * @param[in] data1 Precomputed emissions for sequence 1. 176 | * @param[in] data2 Precomputed emissions for sequence 2. 177 | * @param[in] stayWt1 Stay weights for sequence 1. 178 | * @param[in] stayWt2 Stay weights for sequence 2. 179 | * @param[in] bandStarts For each event in sequence 2, the first candidate position in sequence 1. 180 | * @param[in] bandEnds For each event in sequence 2, the last candidate position in sequence 1. 181 | * @param[in] priors The prior scores for the "before alignment" node. All zeros means no prior. 182 | * @param[out] alignment The final alignment of events. 183 | * @param[out] states The final basecalled states. 184 | */ 185 | void Call(const MatView& data1, const MatView& data2, 186 | const VecView& stayWt1, const VecView& stayWt2, 187 | const std::vector& bandStarts, const std::vector& bandEnds, 188 | const std::vector& priors, Alignment& alignment, std::vector& states); 189 | }; 190 | 191 | 192 | #endif /* VITERBI_2D */ 193 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/viterbi_2d/viterbi_2d_py.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | using namespace std; 6 | using ublas::matrix; 7 | 8 | 9 | Viterbi2D_Py::Viterbi2D_Py(bp::dict& stateInfo, bp::dict& params) { 10 | bandSize = bp::extract(params["band_size"]); 11 | int kmerLen = bp::extract(params["kmer_len"]); 12 | setupKmers(kmerLen); 13 | bool rc = bp::extract(params["seq2_is_rc"]); 14 | useNoise = bp::extract(params["use_sd"]); 15 | if (stateInfo.has_key(string("kmers"))) { 16 | bp::list kmers = bp::extract(stateInfo["kmers"]); 17 | emission1 = 
dummyEmission(kmers); 18 | emission2 = dummyEmission(kmers); 19 | } 20 | else { 21 | bp::dict model1 = bp::extract(stateInfo["model1"]); 22 | bp::dict model2 = bp::extract(stateInfo["model2"]); 23 | emission1 = makeEmission(model1, false); 24 | emission2 = makeEmission(model2, rc); 25 | } 26 | int numStates = emission1->NumStates(); 27 | int maxNodes = bp::extract(params["max_nodes"]); 28 | int maxLen = bp::extract(params["max_len"]); 29 | vector trans(6); 30 | trans[0] = bp::extract(params["stay1"]); 31 | trans[1] = bp::extract(params["step1"]); 32 | trans[2] = bp::extract(params["skip1"]); 33 | trans[3] = bp::extract(params["stay2"]); 34 | trans[4] = bp::extract(params["step2"]); 35 | trans[5] = bp::extract(params["skip2"]); 36 | viterbi = boost::shared_ptr(new Viterbi2D(maxNodes, maxLen, numStates, trans)); 37 | } 38 | 39 | 40 | bp::list Viterbi2D_Py::GetKmerList() const { 41 | bp::list kmerList; 42 | for (size_t i = 0; i < kmers.size(); ++i) { 43 | kmerList.append(kmers[i]); 44 | } 45 | return kmerList; 46 | } 47 | 48 | 49 | bp::numeric::array Viterbi2D_Py::GetModelLevels1() const { 50 | const vector levelVec = emission1->GetModelLevels(); 51 | bp::numeric::array result = new_numpy_1d(levelVec.size()); 52 | VecView data = view_1d_array(result); 53 | for (size_t i = 0; i < levelVec.size(); ++i) { 54 | data[i] = levelVec[i]; 55 | } 56 | return result; 57 | } 58 | 59 | 60 | bp::numeric::array Viterbi2D_Py::GetModelLevels2() const { 61 | const vector levelVec = emission2->GetModelLevels(); 62 | bp::numeric::array result = new_numpy_1d(levelVec.size()); 63 | VecView data = view_1d_array(result); 64 | for (size_t i = 0; i < levelVec.size(); ++i) { 65 | data[i] = levelVec[i]; 66 | } 67 | return result; 68 | } 69 | 70 | 71 | bp::dict Viterbi2D_Py::Call(bp::dict& events1, bp::dict& events2, bp::list& alignment, bp::object& prior) { 72 | vector means1, stdvs1, stwts1, emwts1, means2, stdvs2, stwts2, emwts2; 73 | getEvents(events1, means1, stdvs1, stwts1, emwts1); 74 | 
getEvents(events2, means2, stdvs2, stwts2, emwts2); 75 | emission1->SetEvents(means1, stdvs1, stwts1, emwts1); 76 | emission2->SetEvents(means2, stdvs2, stwts2, emwts2); 77 | Alignment alignIn; 78 | alignIn = list_to_pair_vector(alignment); 79 | vector bandStarts, bandEnds; 80 | makeBands(alignIn, bandStarts, bandEnds); 81 | vector priorScores(emission1->NumStates()); 82 | if (prior) { 83 | fill(priorScores.begin(), priorScores.end(), ZERO_PROB_SCORE); 84 | int state = states[bp::extract(prior)]; 85 | priorScores[state] = 0; 86 | } 87 | Alignment alignOut; 88 | vector statesOut; 89 | viterbi->Call(*emission1, *emission2, bandStarts, bandEnds, priorScores, alignOut, statesOut); 90 | return makeResult(alignOut, statesOut); 91 | } 92 | 93 | 94 | bp::dict Viterbi2D_Py::CallPost(bp::numeric::array& post1, bp::numeric::array& post2, 95 | bp::numeric::array& stayWt1, bp::numeric::array& stayWt2, 96 | bp::list& alignment, bp::object& prior) { 97 | MatView probs1 = view_2d_array(post1); 98 | MatView probs2 = view_2d_array(post2); 99 | VecView stayWeight1 = view_1d_array(stayWt1); 100 | VecView stayWeight2 = view_1d_array(stayWt2); 101 | int numStates = int(probs1.size2()); 102 | emission1->SetNEvents(int(probs1.size1())); 103 | emission2->SetNEvents(int(probs2.size1())); 104 | Alignment alignIn; 105 | alignIn = list_to_pair_vector(alignment); 106 | vector bandStarts, bandEnds; 107 | makeBands(alignIn, bandStarts, bandEnds); 108 | vector priorScores(numStates); 109 | if (prior) { 110 | fill(priorScores.begin(), priorScores.end(), ZERO_PROB_SCORE); 111 | int state = states[bp::extract(prior)]; 112 | priorScores[state] = 0; 113 | } 114 | Alignment alignOut; 115 | vector statesOut; 116 | viterbi->Call(probs1, probs2, stayWeight1, stayWeight2, bandStarts, bandEnds, priorScores, alignOut, statesOut); 117 | return makeResult(alignOut, statesOut); 118 | } 119 | 120 | 121 | void Viterbi2D_Py::setupKmers(int kmerLen) { 122 | int numKmers = 1 << (kmerLen << 1); 123 | const char 
letters[] = "ACGT"; 124 | kmers.resize(numKmers); 125 | states.clear(); 126 | vector pos(kmerLen); 127 | for (int i = 0; i < numKmers; ++i) { 128 | string kmer; 129 | for (int j = 0; j < kmerLen; ++j) { 130 | kmer += letters[pos[kmerLen - j - 1]]; 131 | } 132 | kmers[i] = kmer; 133 | states[kmer] = i; 134 | bool flag = true; 135 | int digit = 0; 136 | while (flag) { 137 | ++pos[digit]; 138 | if (pos[digit] == 4) { 139 | pos[digit] = 0; 140 | ++digit; 141 | if (digit == kmerLen) { 142 | flag = false; 143 | } 144 | } 145 | else { 146 | flag = false; 147 | } 148 | } 149 | } 150 | } 151 | 152 | 153 | boost::shared_ptr Viterbi2D_Py::makeEmission(bp::dict& model, bool rc) { 154 | bp::numeric::array levelMean = bp::extract(model.get("level_mean")); 155 | bp::numeric::array levelStdv = bp::extract(model.get("level_stdv")); 156 | bp::numeric::array sdMean = bp::extract(model.get("sd_mean")); 157 | bp::numeric::array sdStdv = bp::extract(model.get("sd_stdv")); 158 | bp::list kmer = bp::extract(model.get("kmer")); 159 | VecView mean = view_1d_array(levelMean); 160 | VecView sigma = view_1d_array(levelStdv); 161 | VecView noise = view_1d_array(sdMean); 162 | VecView noiseSd = view_1d_array(sdStdv); 163 | int numStates = int(mean.size()); 164 | vector levels(numStates), levelSpreads(numStates), noises(numStates), noiseSpreads(numStates); 165 | copy(mean.begin(), mean.end(), levels.begin()); 166 | copy(sigma.begin(), sigma.end(), levelSpreads.begin()); 167 | copy(noise.begin(), noise.end(), noises.begin()); 168 | copy(noiseSd.begin(), noiseSd.end(), noiseSpreads.begin()); 169 | vector mdlKmers = list_to_vector(kmer); 170 | sortModel(levels, levelSpreads, noises, noiseSpreads, mdlKmers, rc); 171 | return boost::shared_ptr(new Emission(levels, levelSpreads, noises, noiseSpreads, useNoise)); 172 | } 173 | 174 | 175 | boost::shared_ptr Viterbi2D_Py::dummyEmission(bp::list& kmers) { 176 | int numStates = bp::len(kmers); 177 | vector levels(numStates), levelSpreads(numStates), 
noises(numStates), noiseSpreads(numStates); 178 | return boost::shared_ptr(new Emission(levels, levelSpreads, noises, noiseSpreads, useNoise)); 179 | } 180 | 181 | 182 | void Viterbi2D_Py::sortModel(vector& levels, vector& levelSpreads, vector& noises, 183 | vector& noiseSpreads, vector& mdlKmers, bool rc) { 184 | int numKmers = int(levels.size()); 185 | vector newLvl(numKmers), newLvlSprd(numKmers), newSd(numKmers), newSdSprd(numKmers); 186 | vector newKmer(numKmers); 187 | map rcMap; 188 | rcMap['A'] = 'T'; 189 | rcMap['C'] = 'G'; 190 | rcMap['G'] = 'C'; 191 | rcMap['T'] = 'A'; 192 | for (int i = 0; i < numKmers; ++i) { 193 | string kmer = mdlKmers[i]; 194 | if (rc) { 195 | reverse(kmer.begin(), kmer.end()); 196 | for (string::iterator p = kmer.begin(); p < kmer.end(); ++p) { 197 | *p = rcMap[*p]; 198 | } 199 | } 200 | int pos = states[kmer]; 201 | newLvl[pos] = levels[i]; 202 | newLvlSprd[pos] = levelSpreads[i]; 203 | newSd[pos] = noises[i]; 204 | newSdSprd[pos] = noiseSpreads[i]; 205 | newKmer[pos] = mdlKmers[i]; 206 | } 207 | levels.swap(newLvl); 208 | levelSpreads.swap(newLvlSprd); 209 | noises.swap(newSd); 210 | noiseSpreads.swap(newSdSprd); 211 | mdlKmers.swap(newKmer); 212 | } 213 | 214 | 215 | void Viterbi2D_Py::getEvents(bp::dict& events, vector& means, vector& stdvs, 216 | vector& stayWts, vector& emWts) { 217 | bp::numeric::array mean = bp::extract(events.get("mean")); 218 | bp::numeric::array stdv = bp::extract(events.get("stdv")); 219 | bp::numeric::array stayWeight = bp::extract(events.get("stay_weight")); 220 | bp::numeric::array emWeight = bp::extract(events.get("em_weight")); 221 | VecView meanV = view_1d_array(mean); 222 | VecView stdvV = view_1d_array(stdv); 223 | VecView stwtV = view_1d_array(stayWeight); 224 | VecView emwtV = view_1d_array(emWeight); 225 | int numEvents = int(meanV.size()); 226 | means.resize(numEvents); 227 | stdvs.resize(numEvents); 228 | stayWts.resize(numEvents); 229 | emWts.resize(numEvents); 230 | copy(meanV.begin(), 
meanV.end(), means.begin()); 231 | copy(stdvV.begin(), stdvV.end(), stdvs.begin()); 232 | copy(stwtV.begin(), stwtV.end(), stayWts.begin()); 233 | copy(emwtV.begin(), emwtV.end(), emWts.begin()); 234 | 235 | } 236 | 237 | 238 | void Viterbi2D_Py::makeBands(const Alignment& alignIn, vector& bandStarts, vector& bandEnds) { 239 | int numEvents1 = emission1->NumEvents(); 240 | int numEvents2 = emission2->NumEvents(); 241 | bandStarts.resize(numEvents2); 242 | bandEnds.resize(numEvents2); 243 | fill(bandStarts.begin(), bandStarts.end(), int32_t(numEvents1 - 1)); 244 | fill(bandEnds.begin(), bandEnds.end(), int32_t(0)); 245 | int lastX = 0, lastY = numEvents2 - 1; 246 | int nPos = int(alignIn.size()); 247 | for (int p = 0; p < nPos; ++p) { 248 | int x = (alignIn[p].first == -1) ? lastX : alignIn[p].first; 249 | int y = (alignIn[p].second == -1) ? lastY : alignIn[p].second; 250 | for (int k = y - bandSize; k <= y + bandSize; ++k) { 251 | if (k < 0 || k >= numEvents2) continue; 252 | int left = min((int32_t)(x - bandSize), bandStarts[k]); 253 | int right = max((int32_t)(x + bandSize), bandEnds[k]); 254 | left = max(0, left); 255 | right = min(numEvents1 - 1, right); 256 | bandStarts[k] = left; 257 | bandEnds[k] = right; 258 | } 259 | lastX = x; 260 | lastY = y; 261 | } 262 | } 263 | 264 | 265 | bp::dict Viterbi2D_Py::makeResult(const Alignment& alignOut, const vector& statesOut) { 266 | bp::list align; 267 | bp::list kmersOut; 268 | int count = int(alignOut.size()); 269 | for (int i = 0; i < count; ++i) { 270 | kmersOut.append(kmers[statesOut[i]]); 271 | bp::tuple data = bp::make_tuple(alignOut[i].first, alignOut[i].second); 272 | align.append(data); 273 | } 274 | bp::dict results; 275 | results["alignment"] = align; 276 | results["kmers"] = kmersOut; 277 | return results; 278 | } 279 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/viterbi_2d/viterbi_2d_py.h: 
-------------------------------------------------------------------------------- 1 | #ifndef VITERBI_2D_PY_H 2 | #define VITERBI_2D_PY_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | namespace bp = boost::python; 13 | 14 | 15 | /// Viterbi 2D basecaller python wrapper. 16 | class Viterbi2D_Py { 17 | private: 18 | boost::shared_ptr viterbi; 19 | boost::shared_ptr emission1; 20 | boost::shared_ptr emission2; 21 | std::vector kmers; 22 | std::map states; 23 | std::map parms; 24 | int bandSize; 25 | bool useNoise; 26 | 27 | void setupKmers(int kmerLen); 28 | boost::shared_ptr makeEmission(bp::dict& model, bool rc); 29 | boost::shared_ptr dummyEmission(bp::list& kmers); 30 | void sortModel(std::vector& levels, std::vector& levelSpreads, std::vector& noises, 31 | std::vector& noiseSpreads, std::vector& mdlKmers, bool rc); 32 | void getEvents(bp::dict& events, std::vector& means, std::vector& stdvs, 33 | std::vector& stayWts, std::vector& emWts); 34 | void makeBands(const Alignment& alignIn, std::vector& bandStarts, std::vector& bandEnds); 35 | bp::dict makeResult(const Alignment& alignOut, const std::vector& statesOut); 36 | 37 | public: 38 | /** Constructor. 39 | * @param[in] stateInfo Dictionary containing state information. 40 | * @param[in] params Dictionary of basecalling parameters. 41 | * 42 | * The state information should either contain 'model1' and 'model2' 43 | * fields, containing the models for the template and complement data, 44 | * or a 'kmers' field containing a list of the kmers (for posterior 45 | * calling). 46 | */ 47 | Viterbi2D_Py(bp::dict& stateInfo, bp::dict& params); 48 | 49 | /** Perform the basecall. 50 | * @param[in] data1 Event sequence 1. 51 | * @param[in] data2 Event sequence 2. 52 | * @param[in] alignment Estimated alignment of sequence 1 to sequence 2. 53 | * @param[in] prior The prior kmer for the "before alignment" node. None means no prior. 
54 | * @return Dictionary contain alignment and called kmers. 55 | */ 56 | bp::dict Call(bp::dict& events1, bp::dict& events2, bp::list& alignment, bp::object& prior); 57 | 58 | /** Perform the basecall using posteriors. 59 | * @param[in] post1 Posteriors for sequence 1. 60 | * @param[in] post2 Posteriors for sequence 2. 61 | * @param[in] stayWt1 Stay weights for sequence 1. 62 | * @param[in] stayWt2 Stay weights for sequence 2. 63 | * @param[in] alignment Estimated alignment of sequence 1 to sequence 2. 64 | * @param[in] prior The prior kmer for the "before alignment" node. None means no prior. 65 | * @return Dictionary contain alignment and called kmers. 66 | */ 67 | bp::dict CallPost(bp::numeric::array& post1, bp::numeric::array& post2, 68 | bp::numeric::array& stayWt1, bp::numeric::array& stayWt2, 69 | bp::list& alignment, bp::object& prior); 70 | 71 | /// Get a list of the base transition probabilities. 72 | bp::list GetTransitionProbs() const; 73 | 74 | /// Get a list of kmers in operational order. 75 | bp::list GetKmerList() const; 76 | 77 | /// Get a list of the model levels for the first sequence. 78 | bp::numeric::array GetModelLevels1() const; 79 | 80 | /// Get a list of the model levels for the second sequence. 81 | bp::numeric::array GetModelLevels2() const; 82 | }; 83 | 84 | /// Python class wrapper. 
85 | BOOST_PYTHON_MODULE(viterbi_2d) { 86 | import_array(); 87 | bp::numeric::array::set_module_and_type("numpy", "ndarray"); 88 | bp::class_("Viterbi2D", bp::init()) 89 | .def("call", &Viterbi2D_Py::Call) 90 | .def("call_post", &Viterbi2D_Py::CallPost) 91 | .def("get_kmer_list", &Viterbi2D_Py::GetKmerList) 92 | .def("get_model_levels1", &Viterbi2D_Py::GetModelLevels1) 93 | .def("get_model_levels2", &Viterbi2D_Py::GetModelLevels2); 94 | bp::scope().attr("ZERO_PROB_SCORE") = bp::object(ZERO_PROB_SCORE); 95 | } 96 | 97 | 98 | #endif /* VITERBI_2D_PY_H */ 99 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/viterbi_2d_ocl/proxyCL.h: -------------------------------------------------------------------------------- 1 | #ifndef PROXY_CL_H 2 | #define PROXY_CL_H 3 | 4 | #include 5 | #ifdef max // it is defined in cl.hpp 6 | #undef max 7 | #endif //max 8 | 9 | #include 10 | #include 11 | 12 | enum vendor 13 | { 14 | amd, 15 | intel, 16 | nvidia, 17 | apple, 18 | other 19 | }; 20 | 21 | enum device_type 22 | { 23 | cpu, 24 | gpu, 25 | all, 26 | undefined 27 | }; 28 | 29 | struct device_info 30 | { 31 | size_t id; 32 | std::string name; 33 | device_type type; 34 | 35 | bool operator==(const device_info &data) 36 | { 37 | return id == data.id && name == data.name && type == data.type; 38 | } 39 | }; 40 | 41 | struct device_info_ex 42 | { 43 | size_t id; 44 | std::string name; 45 | device_type type; 46 | size_t max_compute_units; 47 | size_t max_work_item_dimensions; 48 | size_t max_work_group_size; 49 | size_t max_work_items_sizes_x; 50 | size_t max_work_items_sizes_y; 51 | size_t max_work_items_sizes_z; 52 | size_t max_clock_frequency; 53 | size_t max_parameter_size; 54 | size_t global_mem_cache_type; 55 | size_t global_mem_cacheline_size; 56 | size_t global_mem_cache_size; 57 | size_t global_mem_size; 58 | size_t max_constant_buffer_size; 59 | size_t local_mem_type; 60 | size_t local_mem_size; 61 | size_t 
preferred_vector_width_char; 62 | size_t preferred_vector_width_short; 63 | size_t preferred_vector_width_int; 64 | size_t preferred_vector_width_long; 65 | size_t preferred_vector_width_float; 66 | }; 67 | 68 | class proxyCL 69 | { 70 | public: 71 | proxyCL(){}; 72 | ~proxyCL(){}; 73 | 74 | bool profiling_enabled() const { return enable_profiling_; } 75 | void enable_cuda_build_cache(bool enable) const; 76 | 77 | size_t get_max_global_mem_size() const { return max_global_mem_size_; } 78 | size_t get_max_local_mem_size() const { return max_local_mem_size_; } 79 | size_t get_max_work_group_size() const { return max_work_group_size_; } 80 | size_t get_work_group_size() const { return work_group_size_; } 81 | void set_work_group_size(size_t value) { work_group_size_ = value; } 82 | 83 | std::vector available_vendors_str(std::string &error) const; 84 | std::vector available_vendors_str_ex(std::string &error) const; 85 | std::vector available_vendors(std::string &error) const; 86 | bool select_vendor(const std::string &vendor, std::string &error); 87 | bool select_vendor(vendor v, std::string &error); 88 | vendor get_selected_vendor() const { return active_vendor_; } 89 | 90 | bool create_context(device_type type, std::string &error); 91 | bool create_context(std::string &error); 92 | 93 | std::vector available_devices(std::string &error) const; 94 | bool select_device(size_t id, std::string &error); 95 | device_info_ex get_device_info_ex(size_t id, std::string &error) const; 96 | std::string get_device_info(size_t id, std::string &error) const; 97 | 98 | std::string get_device_extensions(std::string &error) const; 99 | bool fp64_extension_support(std::string &error) const; 100 | 101 | bool double_fp_support(std::string &error) const; 102 | 103 | bool load_kernel_from_source_file(const std::string &file_path, std::string &error); 104 | bool load_kernel_from_binary_file(const std::string &file_path, const std::string &build_options, std::string &error); 105 | bool 
load_kernel_from_source(const std::string &src, std::string &error); 106 | bool build_kernel(const std::string &build_options, std::string &error); 107 | bool output_binary(const std::string &path, const std::string &build_options, std::string &error); 108 | 109 | bool create_command_queue(bool enable_profiling, bool enable_out_of_order_exec_mode, std::string &error); 110 | 111 | cl::Program& get_program() { return program_; } 112 | cl::Context& get_context() { return context_; } 113 | cl::CommandQueue& get_command_queue() { return queue_; } 114 | 115 | const char* ocl_error_to_string(cl_int err) const; 116 | 117 | private: 118 | cl::Platform platform_; 119 | cl::Context context_; 120 | cl::Device device_; 121 | cl::Program program_; 122 | cl::CommandQueue queue_; 123 | size_t max_global_mem_size_{}; 124 | size_t max_local_mem_size_{}; 125 | size_t max_work_group_size_{}; 126 | size_t work_group_size_{}; 127 | bool enable_profiling_{}; 128 | vendor active_vendor_ = other; 129 | 130 | }; 131 | 132 | #endif // PROXY_CL_H 133 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/viterbi_2d_ocl/viterbi_2d.cl: -------------------------------------------------------------------------------- 1 | #ifdef ENABLE_FP64 2 | #if __OPENCL_VERSION__ <= CL_VERSION_1_1 3 | #pragma OPENCL EXTENSION cl_khr_fp64: enable 4 | #endif 5 | #endif 6 | 7 | #define MOVE_DIAG 0 8 | #define MOVE_RIGHT 1 9 | #define MOVE_UP 2 10 | #define MOVE_UNDEF 3 11 | 12 | #define ZERO_PROB_SCORE -1000000000 13 | 14 | 15 | __kernel __attribute__((reqd_work_group_size(WORK_ITEMS, 1, 1))) 16 | void ProcessNodes( 17 | int firstNode, 18 | int firstOffset, 19 | int wrapAround, 20 | int sliceSize, 21 | int index1First, 22 | int firstLeftValid, 23 | int lastDownValid, 24 | int firstDiagonalValid, 25 | int lastDiagonalValid, 26 | __global int* restrict viterbiScore, // maxSliceSize * 2 * numStates 27 | __global int* restrict transitions, // 9 (3 Stay, 3 Step, 3 
Skip) 28 | __global int* restrict stayBuf, // maxSliceSize * 3 * numStates 29 | __global short* restrict ptrs, // maxSliceSize * 3 * numStates 30 | __global int* restrict emScore1, // maxLen * numStates 31 | __global int* restrict emScore2, // maxLen * numStates 32 | #ifdef ENABLE_FP64 33 | __global double* restrict weights // 3 * numNodes 34 | #else 35 | __global int* restrict weights_stay // 3 * numNodes 36 | #endif 37 | ) 38 | { 39 | int localId = get_local_id(0); 40 | int groupId = get_group_id(0); 41 | int slicePos = groupId / 3; 42 | int nodeIndex = firstNode + slicePos; 43 | int dir = groupId % 3; 44 | 45 | int pos = firstOffset + slicePos * 2; 46 | if (dir == 1) { --pos; } 47 | else if (dir == 2) { ++pos; } 48 | pos = pos % wrapAround; 49 | if (pos < 0) pos += wrapAround; 50 | viterbiScore += pos * NUM_STATES; 51 | 52 | stayBuf += NUM_STATES * groupId; 53 | ptrs += NUM_STATES * groupId; 54 | #ifdef ENABLE_FP64 55 | int weight_stay = (int)(weights[3 * nodeIndex + dir] * transitions[dir]); 56 | #else 57 | int weight_stay = weights_stay[3 * nodeIndex + dir]; 58 | #endif 59 | int step = transitions[3 + dir]; 60 | int skip = transitions[6 + dir]; 61 | 62 | // Fill in scores from previous nodes. 63 | if ((slicePos == 0 && ((dir == MOVE_RIGHT && firstLeftValid == 0) || 64 | (dir == MOVE_DIAG && firstDiagonalValid == 0))) || 65 | (slicePos == sliceSize - 1 && ((dir == MOVE_UP && lastDownValid == 0) || 66 | (dir == MOVE_DIAG && lastDiagonalValid == 0)))) 67 | { 68 | for (int x = 0; x < NUM_STATES; x += WORK_ITEMS) { 69 | stayBuf[x + localId] = ZERO_PROB_SCORE; 70 | } 71 | return; 72 | } 73 | 74 | int index1 = index1First + slicePos; 75 | int index2 = index1First - firstOffset - slicePos; 76 | emScore1 += NUM_STATES * index1; 77 | emScore2 += NUM_STATES * index2; 78 | 79 | for (int x = 0; x < NUM_STATES; x += WORK_ITEMS) { 80 | // Add transitions scores. 
81 | int state = x + localId; 82 | int score = viterbiScore[x + localId] + weight_stay; 83 | 84 | // Set pointers for stay movement. Scores are already stay scores. 85 | int ptr = state; 86 | 87 | // Find maxima for each direction. 88 | for (int from = 0; from < NUM_STATES; from += NUM_STATES/4) { 89 | 90 | // Check the step movement scores. Update as needed. 91 | int buf = viterbiScore[from + (state / 4)] + step; 92 | if (buf > score) { 93 | score = buf; 94 | ptr = from + (state / 4); 95 | } 96 | 97 | // Check the skip movement scores. Update as needed. 98 | #pragma unroll 99 | for (int y = 0; y < 4; ++y) { 100 | int fromState = from + (y * NUM_STATES / 16) + (state / 16); 101 | int buf = viterbiScore[fromState] + skip; 102 | if (buf > score) { 103 | score = buf; 104 | ptr = fromState; 105 | } 106 | } 107 | } 108 | 109 | // Apply emission scores, depending on direction 110 | if (dir < 2) { score += emScore1[state]; } 111 | if (dir != 1) { score += emScore2[state]; } 112 | 113 | // Write result 114 | stayBuf[state] = score; 115 | ptrs[state] = ptr; 116 | } 117 | } 118 | 119 | 120 | __kernel __attribute__((reqd_work_group_size(WORK_ITEMS, 1, 1))) 121 | void PickBest( 122 | int firstNode, 123 | int firstOffset, 124 | int wrapAround, 125 | __global int* restrict stayBuf_tab, // maxSliceSize * 3 * numStates 126 | __global short* restrict ptrs_tab, // maxSliceSize * 3 * numStates 127 | __global short* restrict statePointers, // numNodes * numStates 128 | __global char* restrict dirPointers, // numNodes * numStates 129 | __global int* restrict viterbiScore // maxSliceSize * 2 * numStates 130 | ) 131 | { 132 | int localId = get_local_id(0); 133 | int groupId = get_group_id(0); 134 | int nodeIndex = firstNode + groupId; 135 | __global int *stayBuf = &stayBuf_tab[3 * NUM_STATES * groupId]; 136 | __global short *ptrs = &ptrs_tab[3 * NUM_STATES * groupId]; 137 | 138 | // Since firstOffset varies by +/-1 per slice we alternate between writing even and odd buffers 139 | int pos 
= (firstOffset + groupId * 2) % wrapAround; 140 | if (pos < 0) pos += wrapAround; 141 | 142 | // Pick the best of the three for each state. 143 | for (int j = 0; j < NUM_STATES; j += WORK_ITEMS) { 144 | int state = j + localId; 145 | char dir = MOVE_UP; 146 | int score0 = stayBuf[state]; 147 | int score1 = stayBuf[NUM_STATES + state]; 148 | int score = stayBuf[2*NUM_STATES + state]; 149 | 150 | if (score0 > score1 && score0 > score) { 151 | dir = MOVE_DIAG; 152 | score = score0; 153 | } else if (score1 > score) { 154 | dir = MOVE_RIGHT; 155 | score = score1; 156 | } 157 | viterbiScore[pos * NUM_STATES + state] = score; 158 | statePointers[nodeIndex * NUM_STATES + state] = ptrs[dir * NUM_STATES + state]; 159 | dirPointers[nodeIndex * NUM_STATES + state] = dir; 160 | } 161 | }; 162 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/viterbi_2d_ocl/viterbi_2d_ocl.h: -------------------------------------------------------------------------------- 1 | #ifndef VITERBI_2D_OCL_H 2 | #define VITERBI_2D_OCL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "proxyCL.h" 11 | 12 | namespace ublas = boost::numeric::ublas; 13 | 14 | static const int8_t MOVE_DIAG = 0; 15 | static const int8_t MOVE_RIGHT = 1; 16 | static const int8_t MOVE_UP = 2; 17 | static const int8_t MOVE_UNDEF = 3; 18 | static const int32_t ZERO_PROB_SCORE = -1000000000; 19 | static const double MIN_EMISSION_SCORE = -20.0; 20 | 21 | 22 | inline double square(double x) { 23 | return x * x; 24 | } 25 | 26 | inline int32_t prob2score(double x) { 27 | if (x < 0.0000000001) return -2400; 28 | return int32_t(100.0 * log(x)); 29 | } 30 | 31 | /// Struct representing all nodes in the HMM 32 | struct HmmNodesData { 33 | /* The NodeSlice struct represents a diagonal slice through the nodes which can be 34 | * scheduled simultaneously as they only have data dependencies on the previous two 35 | * slices (the left and down 
nodes are in the previous slice, the down-left diagonal 36 | * node is in the slice before that) and none on nodes within the current slice. 37 | * 38 | * The diagram below shows the nodes within a narrow band around an alignment of 39 | * two base sequences. The first 10 diagonals of nodes represented by the digits 0-9, 40 | * further diagonals are indicated by backslashes. 41 | * For example, slice 6 is represented as follows: 42 | * - It contains 5 nodes (.numNodes = 5) 43 | * - The first (top-left) node is at position (2, 4) (.index1 = 2, .index2 = 4) 44 | * - The top-left node is node #18 (.firstNode = 18) as the previous 45 | * slices have 1, 2, 3, 4, 4, and 4 nodes, respectively. 46 | * - The first node has no valid left node but a valid SW diagonal node 47 | * (.firstLeftValid = false, .firstDiagonalValid = true) 48 | * - The last node has no valid diagonal nor down nodes 49 | * (.lastDownValid = false, .lastDiagonalValid = false) 50 | * 51 | * ^ 52 | * | \\\\\\\\\ 53 | * s | 9\\\\\\\\\ 54 | * e 4| 6789\\\\\\ 55 | * q |3456789\\\\\ 56 | * 2 |23456789\\ 57 | * |12345678 58 | * 0|0123456 59 | * *-------------------> 60 | * 0 2 61 | * sequence1 62 | */ 63 | struct NodeSlice { 64 | int32_t numNodes; // Number of nodes in this slice 65 | int32_t firstNode; // Index of first node in slice 66 | int32_t index1; // Position of first node in slice along sequence 1 67 | int32_t index2; // Position of first node in slice along sequence 2 68 | bool firstLeftValid; // Does the first node in this slice have a left node? 69 | bool lastDownValid; // Does the last node in this slice have a down node? 70 | bool firstDiagonalValid; // Does the first node in this slice have a diagonal node? 71 | bool lastDiagonalValid; // Does the last node in this slice have a diagonal node? 72 | }; 73 | int32_t maxSliceSize; // Size of largest slice 74 | std::vector slices; // List of slices to be scheduled separately 75 | ublas::matrix statePointers; // Viterbi backtrace pointers. 
76 | ublas::matrix dirPointers; // NW alignment backtrace pointers. 77 | }; 78 | 79 | 80 | /** Helper class for emission scores. 81 | * 82 | * This class provides normal level emissions and gamma distributed noise emissions. 83 | * Note that other emission objects can be substituted by changing the Emission typedef 84 | * immediately following this class definition. 85 | */ 86 | class DefaultEmission { 87 | private: 88 | std::vector levels; 89 | std::vector noises; 90 | std::vector logNoises; 91 | std::vector stayWeights; 92 | std::vector emWeights; 93 | std::vector modelLevels; 94 | std::vector modelNoises; 95 | std::vector offsets; 96 | std::vector levelScales; 97 | std::vector noiseScales; 98 | std::vector noiseShapes; 99 | int numEvents; 100 | int numStates; 101 | bool useNoise; 102 | 103 | public: 104 | /** Constructor. 105 | * @param[in] mdlLevels Model current levels. 106 | * @param[in] mdlLevelSpreads Spreads of model current levels. 107 | * @params[in] mdlNoises Model noise levels. 108 | * @param[in] mdlNoiseSpreads Spreads of model noise levels. 109 | * @param[in] useSd Flag to specify whether to use noise levels in the basecall. 110 | */ 111 | DefaultEmission(const std::vector& mdlLevels, const std::vector& mdlLevelSpreads, 112 | const std::vector& mdlNoises, const std::vector& mdlNoiseSpreads, 113 | bool useSd); 114 | 115 | /** Assign events to the object with vectors. 116 | * @param[in] means Event current levels. 117 | * @param[in] stdvs Event noise levels. 118 | * @param[in] stayWts Event weights for modifying stay probabilities. 119 | * @param[in] emWts Event weights for modifying emission probabilities. 120 | */ 121 | void SetEvents(const std::vector& means, const std::vector& stdvs, 122 | const std::vector& stayWts, const std::vector& emWts); 123 | 124 | /// Set the number of events (for when SetEvents() will not be called. 125 | void SetNEvents(int n) {numEvents = n;} 126 | 127 | /// Returns the number of events. 
128 | int NumEvents() const {return numEvents;} 129 | 130 | /// Returns the number of model states. 131 | int NumStates() const {return numStates;} 132 | 133 | /// Returns the model levels. 134 | const std::vector GetModelLevels() const { return modelLevels; } 135 | 136 | /// Returns the stay weights. 137 | const std::vector GetStayWeights() const { return stayWeights; } 138 | 139 | /// Returns the score for event i and state j. 140 | int32_t Score(int i, int j) const { 141 | double score = offsets[j] + levelScales[j] * square(levels[i] - modelLevels[j]); 142 | if (useNoise) score += (noiseShapes[j] - 1.0) * logNoises[i] - noiseScales[j] * noises[i]; 143 | return int32_t(emWeights[i] * std::max(MIN_EMISSION_SCORE, score)); 144 | } 145 | }; 146 | 147 | 148 | typedef DefaultEmission Emission; 149 | typedef std::vector > Alignment; 150 | 151 | 152 | /// Worker class for performing 2D Viterbi basecall. 153 | class Viterbi2Docl { 154 | private: 155 | proxyCL &proxy_cl_; 156 | HmmNodesData nodes; // All HMM nodes, in the order they should be processed. 157 | std::vector transProbs; // Nine transition probabilities (stay * dir, step * dir, skip * dir). 158 | ublas::matrix emScore1; // Pre-computed emissions for sequence 1. 159 | ublas::matrix emScore2; // Pre-computed emissions for sequence 2. 160 | std::vector viterbiScore; // Viterbi scores for last node. 161 | int numStates; // Number of states in the HMM. 162 | int numNodes; // Total number of nodes to be processed. 163 | int numEvents1; // Number of events in sequence 1. 164 | int numEvents2; // Number of events in sequence 2. 
165 | bool enable_fp64_; // Whether to use double floating point 166 | cl::Kernel kernelProcessNodes; // OpenCL kernel objects 167 | cl::Kernel kernelPickBest; 168 | 169 | void initNodes(const std::vector& bandStarts, const std::vector& bandEnds); 170 | void processNodes(const std::vector& wts1, const std::vector& wts2, 171 | const std::vector& priors); 172 | void backTrace(Alignment& alignment, std::vector& states); 173 | 174 | public: 175 | /** Constructor. 176 | */ 177 | Viterbi2Docl(proxyCL& proxy_cl); 178 | 179 | /** Perform the basecall with emission objects. 180 | * @param[in] data1 Emission object for sequence 1. 181 | * @param[in] data2 Emission object for sequence 2. 182 | * @param[in] bandStarts For each event in sequence 2, the first candidate position in sequence 1. 183 | * @param[in] bandEnds For each event in sequence 2, the last candidate position in sequence 1. 184 | * @param[in] priors The prior scores for the "before alignment" node. All zeros means no prior. 185 | * @param[out] alignment The final alignment of events. 186 | * @param[out] states The final basecalled states. 187 | */ 188 | void Call(const Emission& data1, const Emission& data2, const std::vector& bandStarts, 189 | const std::vector& bandEnds, const std::vector& priors, 190 | Alignment& alignment, std::vector& states); 191 | 192 | /** Perform the basecall with precomputed emissions. 193 | * @param[in] data1 Precomputed emissions for sequence 1. 194 | * @param[in] data2 Precomputed emissions for sequence 2. 195 | * @param[in] stayWt1 Stay weights for sequence 1. 196 | * @param[in] stayWt2 Stay weights for sequence 2. 197 | * @param[in] bandStarts For each event in sequence 2, the first candidate position in sequence 1. 198 | * @param[in] bandEnds For each event in sequence 2, the last candidate position in sequence 1. 199 | * @param[in] priors The prior scores for the "before alignment" node. All zeros means no prior. 200 | * @param[out] alignment The final alignment of events. 
201 | * @param[out] states The final basecalled states. 202 | */ 203 | void Call(const MatView& data1, const MatView& data2, 204 | const VecView& stayWt1, const VecView& stayWt2, 205 | const std::vector& bandStarts, const std::vector& bandEnds, 206 | const std::vector& priors, Alignment& alignment, std::vector& states); 207 | 208 | /* Set default transition values and allocate memory 209 | * @param[in] len The maximum number of events to support for either sequence. 210 | * @param[in] states The number of states in the HMM. 211 | * @param[in] trans The six transition probabilities (stay1, step1, skip1, stay2, step2, skip2). 212 | */ 213 | void InitData(int len, int states, const std::vector& trans); 214 | 215 | bool InitCL(const std::string& srcKernelDir, const std::string& binKernelDir, 216 | std::string &error, bool enable_fp64, size_t num_states, size_t work_group_size = 0); 217 | }; 218 | 219 | #endif /* VITERBI_2D_OCL_H */ 220 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/caller_2d/viterbi_2d_ocl/viterbi_2d_ocl_py.h: -------------------------------------------------------------------------------- 1 | #ifndef VITERBI_2D_OCL_PY_H 2 | #define VITERBI_2D_OCL_PY_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "proxyCL.h" 14 | #include "viterbi_2d_ocl.h" 15 | 16 | 17 | namespace bp = boost::python; 18 | 19 | /// proxyCL python wrapper. 
20 | class proxyCL_Py : public proxyCL 21 | { 22 | public: 23 | bp::tuple available_vendors() const 24 | { 25 | std::string error; 26 | std::vector vendors = proxyCL::available_vendors(error); 27 | return bp::make_tuple(vendors, error); 28 | } 29 | 30 | bp::tuple available_vendors_str() const 31 | { 32 | std::string error; 33 | std::vector vendors = proxyCL::available_vendors_str(error); 34 | return bp::make_tuple(vendors, error); 35 | } 36 | 37 | bp::tuple available_vendors_str_ex() const 38 | { 39 | std::string error; 40 | std::vector vendors = proxyCL::available_vendors_str_ex(error); 41 | return bp::make_tuple(vendors, error); 42 | } 43 | 44 | bp::tuple select_vendor(vendor v) 45 | { 46 | std::string error; 47 | bool ret = proxyCL::select_vendor(v, error); 48 | return bp::make_tuple(ret, error); 49 | } 50 | 51 | bp::tuple select_vendor_str(const std::string &vendor) 52 | { 53 | std::string error; 54 | bool ret = proxyCL::select_vendor(vendor, error); 55 | return bp::make_tuple(ret, error); 56 | } 57 | 58 | bp::tuple create_context(device_type type = undefined) 59 | { 60 | bool ret; 61 | std::string error; 62 | if (type == undefined) 63 | { 64 | ret = proxyCL::create_context(error); 65 | } 66 | else 67 | { 68 | ret = proxyCL::create_context(type, error); 69 | } 70 | return bp::make_tuple(ret, error); 71 | } 72 | 73 | bp::tuple available_devices() const 74 | { 75 | std::string error; 76 | std::vector devices = proxyCL::available_devices(error); 77 | return bp::make_tuple(devices, error); 78 | } 79 | 80 | bp::tuple select_device(size_t id) 81 | { 82 | std::string error; 83 | bool ret = proxyCL::select_device(id, error); 84 | return bp::make_tuple(ret, error); 85 | } 86 | 87 | bp::tuple get_device_info(size_t id) const 88 | { 89 | std::string error; 90 | std::string info = proxyCL::get_device_info(id, error); 91 | return bp::make_tuple(info, error); 92 | } 93 | }; 94 | 95 | BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(create_context_overloads, create_context, 0, 1); 96 | 
97 | /// Viterbi 2D basecaller python wrapper. 98 | class Viterbi2Docl_Py { 99 | private: 100 | boost::shared_ptr viterbi; 101 | boost::shared_ptr emission1; 102 | boost::shared_ptr emission2; 103 | std::vector kmers; 104 | std::map states; 105 | std::map parms; 106 | int bandSize; 107 | bool useNoise; 108 | 109 | void setupKmers(int kmerLen); 110 | boost::shared_ptr makeEmission(bp::dict& model, bool rc); 111 | boost::shared_ptr dummyEmission(bp::list& kmers); 112 | void sortModel(std::vector& levels, std::vector& levelSpreads, std::vector& noises, 113 | std::vector& noiseSpreads, std::vector& mdlKmers, bool rc); 114 | void getEvents(bp::dict& events, std::vector& means, std::vector& stdvs, 115 | std::vector& stayWts, std::vector& emWts); 116 | void makeBands(const Alignment& alignIn, std::vector& bandStarts, std::vector& bandEnds); 117 | bp::dict makeResult(const Alignment& alignOut, const std::vector& statesOut); 118 | 119 | public: 120 | /** Constructor. 121 | * @param[in] proxy_cl Initialised proxyCL object. 122 | */ 123 | Viterbi2Docl_Py(proxyCL_Py& proxy_cl); 124 | 125 | /** Perform the basecall. 126 | * @param[in] data1 Event sequence 1. 127 | * @param[in] data2 Event sequence 2. 128 | * @param[in] alignment Estimated alignment of sequence 1 to sequence 2. 129 | * @param[in] prior The prior kmer for the "before alignment" node. None means no prior. 130 | * @return Dictionary contain alignment and called kmers. 131 | */ 132 | bp::dict Call(bp::dict& events1, bp::dict& events2, bp::list& alignment, bp::object& prior); 133 | 134 | /** Perform the basecall using posteriors. 135 | * @param[in] post1 Posteriors for sequence 1. 136 | * @param[in] post2 Posteriors for sequence 2. 137 | * @param[in] stayWt1 Stay weights for sequence 1. 138 | * @param[in] stayWt2 Stay weights for sequence 2. 139 | * @param[in] alignment Estimated alignment of sequence 1 to sequence 2. 140 | * @param[in] prior The prior kmer for the "before alignment" node. None means no prior. 
141 | * @return Dictionary contain alignment and called kmers. 142 | */ 143 | bp::dict CallPost(bp::numeric::array& post1, bp::numeric::array& post2, 144 | bp::numeric::array& stayWt1, bp::numeric::array& stayWt2, 145 | bp::list& alignment, bp::object& prior); 146 | 147 | /** Initialize OpenCL kernel and command queue. This can also be used to just build the binary kernel file. 148 | * @param[in] model1 Model to use for first sequence of events. 149 | * @param[in] model2 Model to use for second sequence of events. 150 | * @param[in] params Dictionary of basecalling parameters. 151 | */ 152 | bp::tuple InitCL(const std::string& srcKernelDir, const std::string& binKernelDir, 153 | bool enable_fp64, size_t num_states, size_t work_group_size); 154 | 155 | /** Initialize model data and basecalling parameters. This is not necessary when just creating the binary kernel file. 156 | * @param[in] stateInfo Dictionary containing state information. 157 | * @param[in] params Dictionary of basecalling parameters. 158 | * 159 | * The state information should either contain 'model1' and 'model2' 160 | * fields, containing the models for the template and complement data, 161 | * or a 'kmers' field containing a list of the kmers (for posterior 162 | * calling). 163 | */ 164 | void InitData(bp::dict& stateInfo, bp::dict& params); 165 | 166 | /// Get a list of kmers in operational order. 167 | bp::list GetKmerList() const; 168 | 169 | /// Get a list of the model levels for the first sequence. 170 | bp::numeric::array GetModelLevels1() const; 171 | 172 | /// Get a list of the model levels for the second sequence. 173 | bp::numeric::array GetModelLevels2() const; 174 | }; 175 | 176 | /// Python class wrapper. 
177 | BOOST_PYTHON_MODULE(viterbi_2d_ocl) { 178 | import_array(); 179 | bp::numeric::array::set_module_and_type("numpy", "ndarray"); 180 | 181 | bp::scope().attr("ZERO_PROB_SCORE") = bp::object(ZERO_PROB_SCORE); 182 | 183 | bp::enum_("vendor") 184 | .value("amd", vendor::amd) 185 | .value("intel", vendor::intel) 186 | .value("nvidia", vendor::nvidia) 187 | .value("apple", vendor::apple) 188 | .value("other", vendor::other) 189 | ; 190 | 191 | bp::enum_("device_type") 192 | .value("cpu", device_type::cpu) 193 | .value("gpu", device_type::gpu) 194 | .value("all", device_type::all) 195 | ; 196 | 197 | bp::class_("device_info", bp::no_init) 198 | .def_readonly("id", &device_info::id) 199 | .def_readonly("name", &device_info::name) 200 | .def_readonly("type", &device_info::type) 201 | ; 202 | 203 | bp::class_ >("vendor_vec") 204 | .def(bp::vector_indexing_suite >()) 205 | ; 206 | 207 | bp::class_ >("device_info_vec") 208 | .def(bp::vector_indexing_suite >()) 209 | ; 210 | 211 | bp::class_("proxyCL", bp::init<>()) 212 | .def("available_vendors", &proxyCL_Py::available_vendors) 213 | .def("available_vendors_str", &proxyCL_Py::available_vendors_str) 214 | .def("available_vendors_str_ex",&proxyCL_Py::available_vendors_str_ex) 215 | .def("enable_cuda_build_cache", &proxyCL_Py::enable_cuda_build_cache) 216 | .def("select_vendor", &proxyCL_Py::select_vendor) 217 | .def("select_vendor_str", &proxyCL_Py::select_vendor_str) 218 | .def("create_context", &proxyCL_Py::create_context, create_context_overloads()) 219 | .def("available_devices", &proxyCL_Py::available_devices) 220 | .def("get_device_info", &proxyCL_Py::get_device_info) 221 | .def("select_device", &proxyCL_Py::select_device) 222 | ; 223 | 224 | bp::class_("Viterbi2Docl", bp::init()) 225 | .def("call", &Viterbi2Docl_Py::Call) 226 | .def("call_post", &Viterbi2Docl_Py::CallPost) 227 | .def("init_cl", &Viterbi2Docl_Py::InitCL) 228 | .def("init_data", &Viterbi2Docl_Py::InitData) 229 | .def("get_kmer_list", 
&Viterbi2Docl_Py::GetKmerList) 230 | .def("get_model_levels1", &Viterbi2Docl_Py::GetModelLevels1) 231 | .def("get_model_levels2", &Viterbi2Docl_Py::GetModelLevels2) 232 | ; 233 | } 234 | 235 | 236 | #endif /* VITERBI_2D_OCL_PY_H */ 237 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/cmdargs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import multiprocessing 4 | 5 | 6 | class FileExist(argparse.Action): 7 | """Check if the input file exist.""" 8 | def __call__(self, parser, namespace, values, option_string=None): 9 | if values is not None and not os.path.exists(values): 10 | raise RuntimeError("File/path for '{}' does not exist, {}".format(self.dest, values)) 11 | setattr(namespace, self.dest, values) 12 | 13 | 14 | class CheckCPU(argparse.Action): 15 | """Make sure people do not overload the machine""" 16 | def __call__(self, parser, namespace, values, option_string=None): 17 | num_cpu = multiprocessing.cpu_count() 18 | if int(values) <= 0 or int(values) > num_cpu: 19 | raise RuntimeError('Number of jobs can only be in the range of {} and {}'.format(1, num_cpu)) 20 | setattr(namespace, self.dest, values) 21 | 22 | 23 | class AutoBool(argparse.Action): 24 | def __init__(self, option_strings, dest, default=None, required=False, help=None): 25 | """Automagically create --foo / --no-foo argument pairs""" 26 | 27 | if default is None: 28 | raise ValueError('You must provide a default with AutoBool action') 29 | if len(option_strings)!=1: 30 | raise ValueError('Only single argument is allowed with AutoBool action') 31 | opt = option_strings[0] 32 | if not opt.startswith('--'): 33 | raise ValueError('AutoBool arguments must be prefixed with --') 34 | 35 | opt = opt[2:] 36 | opts = ['--' + opt, '--no-' + opt] 37 | if default: 38 | default_opt = opts[0] 39 | else: 40 | default_opt = opts[1] 41 | super(AutoBool, self).__init__(opts, dest, nargs=0, 
const=None, 42 | default=default, required=required, 43 | help='{} (Default: {})'.format(help, default_opt)) 44 | def __call__(self, parser, namespace, values, option_strings=None): 45 | if option_strings.startswith('--no-'): 46 | setattr(namespace, self.dest, False) 47 | else: 48 | setattr(namespace, self.dest, True) 49 | 50 | class ParseTransitions(argparse.Action): 51 | """Handle list of exactly 3 values, check values can be coerced to float and 52 | normalise so that they sum to 1. 53 | """ 54 | def __init__(self, **kwdargs): 55 | kwdargs['metavar'] = ('stay', 'step', 'skip') 56 | super(ParseTransitions, self).__init__(**kwdargs) 57 | 58 | def __call__(self, parser, namespace, values, option_string=None): 59 | # locally import these 60 | import numpy as np 61 | from dragonet.util.assertions import checkTransitionProbabilities 62 | try: 63 | values = np.array(values, dtype='float') 64 | except: 65 | raise ValueError('Illegal value for {} ({})'.format(option_string, values)) 66 | values = values / np.sum(values) 67 | checkTransitionProbabilities(values) 68 | setattr(namespace, self.dest, values) 69 | 70 | 71 | class ParseToNamedTuple(argparse.Action): 72 | """Parse to a namedtuple 73 | """ 74 | def __init__(self, **kwdargs): 75 | assert 'metavar' in kwdargs, "Argument 'metavar' must be defined" 76 | assert 'type' in kwdargs, "Argument 'type' must be defined" 77 | assert len(kwdargs['metavar']) == kwdargs['nargs'], 'Number of arguments and descriptions inconstistent' 78 | assert len(kwdargs['type']) == kwdargs['nargs'], 'Number of arguments and types inconstistent' 79 | self._types = kwdargs['type'] 80 | kwdargs['type'] = str 81 | self.Values = namedtuple('Values', ' '.join(kwdargs['metavar'])) 82 | super(ParseToNamedTuple, self).__init__(**kwdargs) 83 | self.default = self.Values(*self.default) if self.default is not None else None 84 | 85 | def __call__(self, parser, namespace, values, option_string=None): 86 | value_dict = self.Values(*[ f(v) for f, v in 
zip(self._types, values)]) 87 | setattr(namespace, self.dest, value_dict) 88 | 89 | def TypeOrNone(mytype): 90 | """Create an argparse argument type that accepts either given type or 'None' 91 | 92 | :param mytype: Type function for type to accept, e.g. `int` or `float` 93 | """ 94 | def f(y): 95 | try: 96 | if y == 'None': 97 | res = None 98 | else: 99 | res = mytype(y) 100 | except: 101 | raise argparse.ArgumentTypeError('Argument must be None or {}'.format(mytype)) 102 | return res 103 | return f 104 | 105 | 106 | def NonNegative(mytype): 107 | """Create an argparse argument type that accepts only non-negative values 108 | 109 | :param mytype: Type function for type to accept, e.g. `int` or `float` 110 | """ 111 | def f(y): 112 | yt = mytype(y) 113 | if yt < 0: 114 | raise argparse.ArgumentTypeError('Argument must be non-negative') 115 | return yt 116 | return f 117 | 118 | 119 | def Positive(mytype): 120 | """Create an argparse argument type that accepts only positive values 121 | 122 | :param mytype: Type function for type to accept, e.g. 
`int` or `float` 123 | """ 124 | def f(y): 125 | yt = mytype(y) 126 | if yt <= 0: 127 | raise argparse.ArgumentTypeError('Argument must be positive') 128 | return yt 129 | return f 130 | 131 | 132 | def Vector(mytype): 133 | """Return an argparse.Action that will convert a list of values into a numpy 134 | array of given type 135 | """ 136 | 137 | class MyNumpyAction(argparse.Action): 138 | """Parse a list of values into numpy array""" 139 | def __call__(self, parser, namespace, values, option_string=None): 140 | import tang.numpty as np 141 | try: 142 | setattr(namespace, self.dest, np.array(values, dtype=mytype)) 143 | except: 144 | raise argparse.ArgumentTypeError('Cannot convert {} to array of {}'.format(values, mytype)) 145 | @staticmethod 146 | def value_as_string(value): 147 | return ' '.join(str(x) for x in value) 148 | return MyNumpyAction 149 | 150 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/currennt_to_pickle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import json 4 | import sys 5 | 6 | import numpy as np 7 | from RUBRIC.nanonet import nn 8 | from RUBRIC.nanonet import all_nmers 9 | from RUBRIC.nanonet.cmdargs import FileExist 10 | 11 | def get_parser(): 12 | parser = argparse.ArgumentParser( 13 | description='Convert currennt json network file into pickle. 
Makes assumptions about meta data.', 14 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 15 | ) 16 | parser.add_argument('input', action=FileExist, 17 | help='File containing current network') 18 | parser.add_argument('output', help='Output pickle file') 19 | 20 | parser.add_argument("--kmer_length", type=int, default=5, 21 | help="Length of kmers to learn.") 22 | parser.add_argument("--bases", type=str, default='ACGT', 23 | help="Alphabet of kmers to learn.") 24 | parser.add_argument("--window", type=int, nargs='+', default=[-1, 0, 1], 25 | help="The detailed list of the entire window.") 26 | parser.add_argument("--section", type=str, default='template', 27 | help="Section of read which network is trained against.") 28 | return parser 29 | 30 | 31 | def toarray(x): 32 | return np.ascontiguousarray(np.array(x, order='C', dtype=nn.dtype)) 33 | 34 | 35 | def parse_layer_input(size, weights): 36 | return None 37 | 38 | 39 | def parse_layer_feedforward(size, weights, fun): 40 | M = toarray(weights['input']) 41 | M = M.reshape((size, -1)).transpose() 42 | b = toarray(weights['bias']) 43 | return nn.FeedForward(M, b, fun) 44 | 45 | 46 | def parse_layer_feedforward_tanh(size, weights): 47 | return parse_layer_feedforward(size, weights, nn.tanh) 48 | 49 | 50 | def parse_layer_feedforward_sigmoid(size, weights): 51 | return parse_layer_feedforward(size, weights, nn.sigmoid) 52 | 53 | 54 | def parse_layer_feedforward_linear(size, weights): 55 | return parse_layer_feedforward(size, weights, nn.linear) 56 | 57 | 58 | def parse_layer_softmax(size, weights): 59 | M = toarray(weights['input']) 60 | M = M.reshape((size, -1)).transpose() 61 | b = toarray(weights['bias']) 62 | return nn.SoftMax(M, b) 63 | 64 | 65 | def parse_layer_multiclass(size, weights): 66 | return None 67 | 68 | 69 | def parse_layer_blstm(size, weights): 70 | size = size / 2 71 | wgts_input = toarray(weights['input']).reshape((4, 2, size, -1)).transpose((0, 1, 3, 2)) 72 | wgts_bias = 
toarray(weights['bias']).reshape((4, 2, -1)) 73 | wgts_internalMat = toarray(weights['internal'][: 4 * size * size * 2]).reshape((4, 2, size, size)).transpose((0, 1, 3, 2)) 74 | wgts_internalPeep = toarray(weights['internal'][4 * size * size * 2 :]).reshape((3, 2, size)) 75 | 76 | iM1 = wgts_input[:, 0, :, :] 77 | bM1 = wgts_bias[:, 0, :] 78 | lM1 = wgts_internalMat[:, 0, :, :] 79 | pM1 = wgts_internalPeep[:, 0, :] 80 | layer1 = nn.LSTM(iM1, lM1, bM1, pM1) 81 | 82 | iM2 = wgts_input[:, 1, :, :] 83 | bM2 = wgts_bias[:, 1, :] 84 | lM2 = wgts_internalMat[:, 1, :, :] 85 | pM2 = wgts_internalPeep[:, 1, :] 86 | layer2 = nn.LSTM(iM2, lM2, bM2, pM2) 87 | return nn.BiRNN(layer1, layer2) 88 | 89 | 90 | def parse_layer_lstm(size, weights): 91 | iM = toarray(weights['input']).reshape((4, size, -1)).transpose((0, 2, 1)) 92 | bM = toarray(weights['bias']).reshape((4, size)) 93 | lM = toarray(weights['internal'][ : 4 * size * size]).reshape((4, size, size)).transpose((0, 2, 1)) 94 | pM = toarray(weights['internal'][4 * size * size : ]).reshape((3, size)) 95 | return nn.LSTM(iM, lM, bM, pM) 96 | 97 | 98 | LAYER_DICT = {'input' : parse_layer_input, 99 | 'blstm' : parse_layer_blstm, 100 | 'feedforward_tanh' : parse_layer_feedforward_tanh, 101 | 'feedforward_logistic' : parse_layer_feedforward_sigmoid, 102 | 'feedforward_identity' : parse_layer_feedforward_linear, 103 | 'lstm' : parse_layer_lstm, 104 | 'blstm' : parse_layer_blstm, 105 | 'softmax' : parse_layer_softmax, 106 | 'multiclass_classification' : parse_layer_multiclass} 107 | 108 | 109 | def parse_layer(layer_type, size, weights): 110 | if not layer_type in LAYER_DICT: 111 | sys.stderr.write('Unsupported layer type {}.\n'.format(layer_type)) 112 | exit(1) 113 | return LAYER_DICT[layer_type](size, weights) 114 | 115 | 116 | def network_to_numpy(in_network): 117 | """Transform a json representation of a network into a numpy 118 | representation. 
119 | """ 120 | 121 | layers = list() 122 | for layer in in_network['layers']: 123 | wgts = in_network['weights'][layer['name']] if layer['name'] in in_network['weights'] else None 124 | layers.append(parse_layer(layer['type'], layer['size'], wgts)) 125 | layers = filter(lambda x: x is not None, layers) 126 | 127 | meta = None 128 | if 'meta' in in_network: 129 | meta = in_network['meta'] 130 | network = nn.Serial(layers) 131 | network.meta = meta 132 | return network 133 | 134 | 135 | if __name__ == '__main__': 136 | args = get_parser().parse_args() 137 | 138 | try: 139 | with open(args.input, 'r') as fh: 140 | in_network = json.load(fh) 141 | except: 142 | sys.stderr.write('Failed to read from {}.\n'.format(args.input)) 143 | exit(1) 144 | 145 | if not 'layers' in in_network: 146 | sys.stderr.write('Could not find any layers in {} -- is it a network file?\n'.format(args.network)) 147 | exit(1) 148 | if not 'weights' in in_network: 149 | sys.stderr.write('Could not find any weights in {} -- is network trained?\n'.format(args.network)) 150 | exit(1) 151 | 152 | # Build meta, taking some guesses 153 | kmers = all_nmers(args.kmer_length, alpha=args.bases) 154 | kmers.append('X'*args.kmer_length) 155 | in_network['meta'] = { 156 | 'window':args.window, 157 | 'n_features':in_network['layers'][0]['size'], 158 | 'kmers':kmers, 159 | 'section':args.section 160 | } 161 | 162 | network = network_to_numpy(in_network) 163 | np.save(args.output, network) 164 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/data/default_complement.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sandialabs/RUBRIC/2895b15ded82a85142f4d68e4feb5d5526aebb09/RUBRIC/nanonet/data/default_complement.npy -------------------------------------------------------------------------------- /RUBRIC/nanonet/data/default_model.tmpl: 
-------------------------------------------------------------------------------- 1 | { 2 | "meta": {"section": "
"}, 3 | "layers": [ 4 | { 5 | "size": , 6 | "name": "input", 7 | "type": "input" 8 | }, 9 | { 10 | "size": 128, 11 | "name": "blstm_level_0", 12 | "bias": 1.0, 13 | "type": "blstm" 14 | }, 15 | { 16 | "size": 64, 17 | "name": "subsample_level_1", 18 | "bias": 1.0, 19 | "type": "feedforward_tanh" 20 | }, 21 | { 22 | "size": 128, 23 | "name": "blstm_level_1", 24 | "bias": 1.0, 25 | "type": "blstm" 26 | }, 27 | { 28 | "size": 64, 29 | "name": "subsample_level_2", 30 | "bias": 1.0, 31 | "type": "feedforward_tanh" 32 | }, 33 | { 34 | "size": , 35 | "name": "output", 36 | "bias": 1.0, 37 | "type": "softmax" 38 | }, 39 | { 40 | "size": , 41 | "name": "postoutput", 42 | "type": "multiclass_classification" 43 | } 44 | ] 45 | } 46 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/data/default_template.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sandialabs/RUBRIC/2895b15ded82a85142f4d68e4feb5d5526aebb09/RUBRIC/nanonet/data/default_template.npy -------------------------------------------------------------------------------- /RUBRIC/nanonet/data/r9.4_template.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sandialabs/RUBRIC/2895b15ded82a85142f4d68e4feb5d5526aebb09/RUBRIC/nanonet/data/r9.4_template.npy -------------------------------------------------------------------------------- /RUBRIC/nanonet/data/r9_complement.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sandialabs/RUBRIC/2895b15ded82a85142f4d68e4feb5d5526aebb09/RUBRIC/nanonet/data/r9_complement.npy -------------------------------------------------------------------------------- /RUBRIC/nanonet/data/r9_template.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sandialabs/RUBRIC/2895b15ded82a85142f4d68e4feb5d5526aebb09/RUBRIC/nanonet/data/r9_template.npy -------------------------------------------------------------------------------- /RUBRIC/nanonet/decoding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define MODULE_API_EXPORTS 9 | #include "module.h" 10 | #include "stdint.h" 11 | 12 | #include 13 | 14 | typedef float ftype; 15 | using namespace std; 16 | 17 | 18 | static PyMethodDef DecodeMethods[] = { 19 | {NULL, NULL, 0, NULL} /* Sentinel */ 20 | }; 21 | 22 | PyMODINIT_FUNC initnanonetdecode(void) { 23 | (void) Py_InitModule("nanonetdecode", DecodeMethods); 24 | } 25 | 26 | 27 | extern "C" void viterbi_update( 28 | ftype* vit_last, ftype* vit_curr, int32_t* max_idx, 29 | const size_t num_bases, const size_t num_kmers, 30 | const ftype stay, const ftype step, const ftype skip, const ftype slip 31 | ){ 32 | 33 | for ( size_t kmer=0 ; kmer::infinity(); 36 | } 37 | 38 | // Stay 39 | for ( size_t kmer=0 ; kmervit_curr[kmer]){ 41 | vit_curr[kmer] = vit_last[kmer]+stay; 42 | max_idx[kmer] = kmer; 43 | } 44 | } 45 | // Step 46 | for ( size_t kmer=0 ; kmervit_curr[idx+i]){ 50 | vit_curr[idx+i] = vit_last[kmer]+step; 51 | max_idx[idx+i] = kmer; 52 | } 53 | } 54 | } 55 | // Skip 56 | for ( size_t kmer=0 ; kmervit_curr[idx+i]){ 60 | vit_curr[idx+i] = vit_last[kmer]+skip; 61 | max_idx[idx+i] = kmer; 62 | } 63 | } 64 | } 65 | // Slip 66 | if (slip > -std::numeric_limits::infinity()){ 67 | ftype slip_max = -std::numeric_limits::infinity(); 68 | size_t slip_idx = 0; 69 | for ( size_t kmer=0 ; kmerslip_max){ 71 | slip_max = vit_last[kmer]+slip; 72 | slip_idx = kmer; 73 | } 74 | } 75 | for ( size_t kmer=0 ; kmervit_curr[kmer]){ 77 | vit_curr[kmer] = slip_max; 78 | max_idx[kmer] = slip_idx; 79 | } 80 | } 81 | } 82 | } 83 | 84 | 85 | extern "C" MODULE_API ftype decode_path(ftype * 
logpost, const size_t num_events, const size_t num_bases, const size_t num_kmers){ 86 | assert(NULL!=logpost); 87 | assert(num_events>0); 88 | assert(num_bases>0); 89 | assert(num_kmers>0); 90 | 91 | std::vector max_idx(num_kmers); 92 | std::vector vit_last(num_kmers); 93 | std::vector vit_curr(num_kmers); 94 | 95 | // Treat all movement types equally, disallow slip (allowing slip 96 | // would simply give kmer with maximum posterioir) 97 | ftype stay = 0.0; 98 | ftype step = 0.0; 99 | ftype skip = 0.0; 100 | ftype slip = -std::numeric_limits::infinity(); 101 | 102 | // Initial values 103 | for ( size_t kmer=0 ; kmer::infinity(); 133 | int max_kmer = -1; 134 | for ( size_t kmer=0 ; kmermax_val){ 136 | max_val = vit_last[kmer]; 137 | max_kmer = kmer; 138 | } 139 | } 140 | logpost[idx] = max_kmer; 141 | // Other states by traceback 142 | for ( size_t ev=(num_events-1) ; ev>0 ; ev--){ 143 | const size_t idx = (ev-1)*num_kmers; 144 | logpost[idx] = logpost[idx+(int)logpost[idx+num_kmers]]; 145 | } 146 | 147 | return max_val; 148 | } 149 | 150 | 151 | extern "C" MODULE_API void estimate_transitions(ftype* post, ftype* trans, const size_t num_events, const size_t num_bases, const size_t num_kmers){ 152 | assert(NULL!=post); 153 | assert(num_events>0); 154 | assert(num_bases>0); 155 | assert(num_kmers>0); 156 | const size_t num_bases_sq = num_bases * num_bases; 157 | 158 | for (size_t ev = 1; ev < num_events; ++ev) { 159 | ftype stay_sum = 0.f; 160 | ftype step_sum = 0.f; 161 | ftype skip_sum = 0.f; 162 | const size_t idx1 = ev * num_kmers; 163 | const size_t idx0 = idx1 - num_kmers; 164 | for (size_t i = 0; i < num_kmers / num_bases_sq; ++i) { 165 | ftype sum16 = 0.f; 166 | for (size_t j = 0; j < num_bases; ++j) { 167 | ftype sum4 = 0.f; 168 | for (size_t k = 0; k < num_bases; ++k) { 169 | size_t kmer = i * num_bases_sq + j * num_bases + k; 170 | ftype p = post[idx1 + kmer]; 171 | stay_sum += post[idx0 + kmer] * p; 172 | sum4 += p; 173 | } 174 | for (size_t step_from = 
num_bases * i + j; step_from < num_kmers; step_from += num_kmers / num_bases) { 175 | step_sum += sum4 * post[idx0 + step_from]; 176 | } 177 | sum16 += sum4; 178 | } 179 | for (size_t skip_from = i; skip_from < num_kmers; skip_from += num_kmers / num_bases_sq) { 180 | skip_sum += sum16 * post[idx0 + skip_from]; 181 | } 182 | } 183 | step_sum *= 0.25f; 184 | skip_sum *= 0.0625f; 185 | trans[(ev-1) * 3] = stay_sum; 186 | trans[(ev-1) * 3 + 1] = step_sum; 187 | trans[(ev-1) * 3 + 2] = skip_sum; 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/eventdetection/filters.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define MODULE_API_EXPORTS 13 | #include "module.h" 14 | #include "filters.h" 15 | 16 | 17 | /** 18 | * setuptools install command doesn't play nice. We'll make this module 19 | * importable as a python module but not export anything. Importing the 20 | * module as: 21 | * import nanonetfilters 22 | * will at least allow us to find the file and continue to import it as 23 | * a CDLL and wrap with ctypes. That's fine because it means we can pass 24 | * numpy arrays as pointers and not worry about writing real python 25 | * extensions. 
26 | **/ 27 | static PyMethodDef FilterMethods[] = { 28 | {NULL, NULL, 0, NULL} /* Sentinel */ 29 | }; 30 | 31 | PyMODINIT_FUNC initnanonetfilters(void) { 32 | (void) Py_InitModule("nanonetfilters", FilterMethods); 33 | } 34 | 35 | 36 | /** 37 | * Compute cumulative sum and sum of squares for a vector of data 38 | * data double[d_length] Data to be summed over (in) 39 | * sum double[d_length] Vector to store sum (out) 40 | * sumsq double[d_length] Vector to store sum of squares (out) 41 | * d_length Length of data vector 42 | **/ 43 | MODULE_API void compute_sum_sumsq(const double * restrict data, double* restrict sum, double* restrict sumsq, size_t d_length) { 44 | size_t i; 45 | 46 | // Basic contracts 47 | assert(NULL!=data); 48 | assert(NULL!=sum); 49 | assert(NULL!=sumsq); 50 | assert(d_length>0); 51 | 52 | sum[0] = data[0]; 53 | sumsq[0] = data[0]*data[0]; 54 | for (i = 1; i < d_length; ++i) { 55 | sum[i] = sum[i - 1] + data[i]; 56 | sumsq[i] = sumsq[i - 1] + data[i]*data[i]; 57 | } 58 | } 59 | 60 | /** 61 | * Compute moving average over window, output centred on current coordinate 62 | * sum double[d_length] Input data, cumulative sum (in) 63 | * out double[d_length] Ouput data (out) 64 | * d_length Length of data vector 65 | * w_length Length of window to compute mave over. Made odd if not. 
66 | **/ 67 | MODULE_API void compute_mave(const double* restrict sum, double* restrict mave, size_t d_length, size_t w_length) { 68 | size_t i; 69 | size_t h_length; 70 | size_t ip; 71 | 72 | // Simple contracts 73 | assert(NULL!=sum); 74 | assert(NULL!=mave); 75 | assert(d_length>0); 76 | assert(w_length>0); 77 | // make window length odd 78 | if(w_length % 2 == 0){ 79 | w_length -= 1; 80 | } 81 | 82 | // quick return 83 | if (d_length < w_length || w_length < 2) { 84 | mave[0] = sum[0]; 85 | for(i = 1; i < d_length; ++i) 86 | mave[i] = sum[i] - sum[i-1]; 87 | return; 88 | } 89 | 90 | h_length = w_length/2; 91 | // fudge boundaries 92 | for(i = 0; i < h_length; ++i) { 93 | mave[i] = (sum[i+h_length]) / (i+1+h_length); 94 | ip = d_length - 1 - i; 95 | mave[ip] = (sum[d_length - 1] - sum[ip-h_length-1]) / (i+1+h_length); 96 | } 97 | // most of the data 98 | for(i = h_length; i < d_length - h_length ; ++i) { 99 | mave[i] = (sum[i+h_length] - sum[i-h_length-1]) / (w_length); 100 | } 101 | return; 102 | } 103 | 104 | 105 | /** 106 | * Compute windowed t-statistic from summary information 107 | * sum double[d_length] Cumulative sums of data (in) 108 | * sumsq double[d_length] Cumulative sum of squares of data (in) 109 | * tstat double[d_length] T-statistic (out) 110 | * d_length Length of data vector 111 | * w_length Window length to calculate t-statistic over 112 | **/ 113 | MODULE_API void compute_tstat(const double* restrict sum, const double* restrict sumsq, double* restrict tstat, size_t d_length, size_t w_length, bool pooled) { 114 | size_t i; 115 | const double eta = 1e-100; 116 | 117 | // Simple contracts 118 | assert(NULL!=sum); 119 | assert(NULL!=sumsq); 120 | assert(NULL!=tstat); 121 | 122 | // Quick return: 123 | // t-test not defined for number of points less than 2 124 | // need at least as many points as twice the window length 125 | if (d_length < 2*w_length || w_length < 2) { 126 | for(i = 0; i < d_length; ++i){ 127 | tstat[i] = 0.0; 128 | } 129 | 
return; 130 | } 131 | 132 | // fudge boundaries 133 | for (i = 0; i < w_length; ++i) { 134 | tstat[i] = 0; 135 | tstat[d_length - i - 1] = 0; 136 | } 137 | 138 | // get to work on the rest 139 | { 140 | double sum1, sum2, sumsq1, sumsq2, mean1, mean2, var1, var2; 141 | 142 | for (i = w_length; i <= d_length - w_length; ++i) { 143 | sum1 = sum[i - 1]; 144 | sumsq1 = sumsq[i - 1]; 145 | if (i > w_length) { 146 | sum1 -= sum[i - w_length - 1]; 147 | sumsq1 -= sumsq[i - w_length - 1]; 148 | } 149 | sum2 = sum[i + w_length - 1] - sum[i - 1]; 150 | sumsq2 = sumsq[i + w_length - 1] - sumsq[i - 1]; 151 | mean1 = sum1 / w_length; 152 | mean2 = sum2 / w_length; 153 | var1 = sumsq1 / w_length - mean1*mean1; 154 | var2 = sumsq2 / w_length - mean2*mean2; 155 | if(pooled){ 156 | var1 = ( var1 + var2 ) / 2.0; 157 | var2 = var1; 158 | } 159 | // Prevent problem due to very small variances 160 | var1 = fmax(var1, eta); 161 | var2 = fmax(var2, eta); 162 | 163 | //t-stat 164 | // Formula is a simplified version of Student's t-statistic for the 165 | // special case where there are two samples of equal size with 166 | // differing variance 167 | { 168 | const double delta = mean2 - mean1; 169 | const double totvar = var1 / w_length + var2 / w_length; 170 | tstat[i] = fabs(delta / sqrt(totvar)); 171 | } 172 | } 173 | } 174 | } 175 | 176 | 177 | /** 178 | * Compute windowed deltamean value from summary information 179 | * sum double[d_length] Cumulative sums of data (in) 180 | * sumsq double[d_length] Cumulative sum of squares of data (in) 181 | * deltamean double[d_length] deltamean (out) 182 | * d_length Length of data vector 183 | * w_length Window length to calculate t-statistic over 184 | **/ 185 | 186 | MODULE_API void compute_deltamean(const double* restrict sum, const double* restrict sumsq, double* restrict deltamean, size_t d_length, size_t w_length) { 187 | size_t i; 188 | double sum1, sum2, mean1, mean2; 189 | 190 | // Set boundaries to 0. 
191 | for (i = 0; i < w_length; ++i) { 192 | deltamean[i] = 0; 193 | deltamean[d_length - i - 1] = 0; 194 | } 195 | 196 | // compute deltamean for non-boundary data 197 | for (i = w_length; i <= d_length - w_length; ++i) { 198 | sum1 = sum[i - 1]; 199 | if (i > w_length) { 200 | sum1 -= sum[i - w_length - 1]; 201 | } 202 | sum2 = sum[i + w_length - 1] - sum[i - 1]; 203 | mean1 = sum1 / w_length; 204 | mean2 = sum2 / w_length; 205 | 206 | 207 | // assume variance of 1.0 - approximately correct and avoids extra division 208 | { 209 | const double delta = mean2 - mean1; 210 | deltamean[i] = fabs(delta); 211 | } 212 | } 213 | } 214 | 215 | 216 | MODULE_API void short_long_peak_detector(DetectorPtr short_detector, DetectorPtr long_detector, const double peak_height, size_t * peaks){ 217 | size_t i, k; 218 | size_t peak_count = 0; 219 | DetectorPtr detector; 220 | DetectorPtr detectors[2] = {short_detector, long_detector}; 221 | double current_value; 222 | 223 | assert(short_detector->signal_length == long_detector->signal_length); 224 | assert(NULL!=peaks); 225 | 226 | 227 | for(i=0; isignal_length; i++){ 228 | for(k=0; k<2; k++){ 229 | detector = detectors[k]; 230 | //Carry on if we've been masked out 231 | if (detector->masked_to >= i){ 232 | continue; 233 | } 234 | 235 | current_value = detector->signal[i]; 236 | 237 | if (detector->peak_pos == detector->DEF_PEAK_POS){ 238 | //CASE 1: We've not yet recorded a maximum 239 | if (current_value < detector->peak_value){ 240 | //Either record a deeper minimum... 
241 | detector->peak_value = current_value; 242 | } 243 | else if (current_value - detector->peak_value > peak_height){ 244 | // ...or we've seen a qualifying maximum 245 | detector->peak_value = current_value; 246 | detector->peak_pos = i; 247 | //otherwise, wait to rise high enough to be considered a peak 248 | } 249 | } 250 | else { 251 | //CASE 2: In an existing peak, waiting to see if it is good 252 | if (current_value > detector->peak_value){ 253 | //Update the peak 254 | detector->peak_value = current_value; 255 | detector->peak_pos = i; 256 | } 257 | 258 | //Dominate other tstat signals if we're going to fire at some point 259 | if (detector == short_detector){ 260 | if (detector->peak_value > detector->threshold){ 261 | long_detector->masked_to = detector->peak_pos + detector->window_length; 262 | long_detector->peak_pos = long_detector->DEF_PEAK_POS; 263 | long_detector->peak_value = long_detector->DEF_PEAK_VAL; 264 | long_detector->valid_peak = false; 265 | } 266 | } 267 | 268 | //Have we convinced ourselves we've seen a peak 269 | if (detector->peak_value - current_value > peak_height && detector->peak_value > detector->threshold){ 270 | detector->valid_peak = true; 271 | } 272 | 273 | //Finally, check the distance if this is a good peak 274 | if (detector->valid_peak && (i - detector->peak_pos) > detector->window_length / 2){ 275 | //Emit the boundary and reset 276 | peaks[peak_count] = detector->peak_pos; 277 | peak_count++; 278 | detector->peak_pos = detector->DEF_PEAK_POS; 279 | detector->peak_value = current_value; 280 | detector->valid_peak = false; 281 | } 282 | } 283 | } 284 | } 285 | } 286 | 287 | 288 | 289 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/eventdetection/filters.h: -------------------------------------------------------------------------------- 1 | #ifndef FILTERS_H 2 | #define FILTERS_H 3 | 4 | #include 5 | #include 6 | 7 | #if defined(_MSC_VER) 8 | # define false 0 9 | # define 
true 1 10 | # define bool int 11 | # define _Bool int 12 | # define fmax max 13 | # define fmin min 14 | #else 15 | # include 16 | #endif 17 | 18 | 19 | 20 | typedef struct { 21 | int DEF_PEAK_POS; 22 | double DEF_PEAK_VAL; 23 | double * signal; 24 | size_t signal_length; 25 | double threshold; 26 | size_t window_length; 27 | size_t masked_to; 28 | int peak_pos; 29 | double peak_value; 30 | _Bool valid_peak; 31 | } Detector; 32 | typedef Detector * DetectorPtr; 33 | 34 | 35 | MODULE_API void short_long_peak_detector( 36 | DetectorPtr short_detector, 37 | DetectorPtr long_detector, 38 | const double peak_height, 39 | size_t * peaks); 40 | 41 | 42 | #endif /* FILTERS_H */ 43 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/include/module.h: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | #ifdef _WIN32 6 | # ifdef MODULE_API_EXPORTS 7 | # define MODULE_API __declspec(dllexport) 8 | # define restrict __restrict 9 | # else 10 | # define MODULE_API __declspec(dllimport) 11 | # endif 12 | #else 13 | # define MODULE_API 14 | #endif 15 | 16 | MODULE_API int module_init(); 17 | 18 | #ifdef __cplusplus 19 | } 20 | #endif 21 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/jobqueue.py: -------------------------------------------------------------------------------- 1 | from uuid import uuid4 2 | from time import sleep 3 | import os 4 | from multiprocessing import Process 5 | import Queue 6 | from functools import partial 7 | 8 | from myriad.components import MyriadServer 9 | from myriad.managers import make_client 10 | 11 | from RUBRIC.nanonet import stderr_redirected 12 | 13 | __timeout__ = 0.5 14 | __worker_startup_sleep__ = 2 15 | 16 | class JobQueue(object): 17 | 18 | def __init__(self, jobs, functors): 19 | """A simple job queue which can be processed by various functors. 
20 | 21 | :param jobs: iterable of job items. 22 | :param functions: tuples of the form (function, n_items), if n_itmes 23 | is None then the function should accept a single job items and 24 | process it to produce a single result. if n_items >= 2, then 25 | function should process a list of items, returning a list of 26 | results. 27 | """ 28 | self.jobs = jobs 29 | self.functors = functors 30 | 31 | def __iter__(self): 32 | self.start_server() 33 | workers = [Process(target=partial(worker, f[0], f[1], self.port, self.authkey)) for f in self.functors] 34 | 35 | try: 36 | for w in workers: 37 | w.start() 38 | 39 | for result in self.server.imap_unordered(self.jobs, timeout=__timeout__): 40 | yield result 41 | 42 | for w in workers: 43 | w.terminate() 44 | except KeyboardInterrupt: 45 | for w in workers: 46 | w.terminate() 47 | self.server.manager.join() 48 | self.server.manager.shutdown() 49 | 50 | def start_server(self, ports=(5000,6000)): 51 | self.authkey = str(uuid4()) 52 | 53 | server = None 54 | for port in xrange(*ports): 55 | try: 56 | with stderr_redirected(os.devnull): 57 | server = MyriadServer(None, port, self.authkey) 58 | except EOFError: 59 | pass 60 | else: 61 | break 62 | if server is None: 63 | raise RuntimeError("Could not start myriad server.") 64 | 65 | self.server = server 66 | self.port = port 67 | 68 | 69 | # On *nix the following could be part of the class above, but not on windows: 70 | # https://docs.python.org/2/library/multiprocessing.html#windows 71 | 72 | def worker(function, take_n, port, authkey, timeout=__timeout__): 73 | """Worker function for JobQueue. Dispatches to singleton_worker or 74 | multi_worker as appropriate. 75 | 76 | :param function: function to apply in job items. 77 | :param take_n: number of items to process, should be None or >=2. Special 78 | case of None indicates function takes a single item to produce a single 79 | result. 
80 | """ 81 | sleep(__worker_startup_sleep__) # nasty, allows all workers to come up before iteration begins 82 | manager = make_client('localhost', port, authkey) 83 | job_q = manager.get_job_q() 84 | job_q_closed = manager.q_closed() 85 | result_q = manager.get_result_q() 86 | 87 | if take_n is None: 88 | _singleton_worker(function, job_q, job_q_closed, result_q, timeout=timeout) 89 | else: 90 | _multi_worker(function, take_n, job_q, job_q_closed, result_q, timeout=timeout) 91 | 92 | 93 | def _singleton_worker(function, job_q, job_q_closed, result_q, timeout=__timeout__): 94 | while True: 95 | try: 96 | job = job_q.get_nowait() 97 | result = function(job) 98 | result_q.put(result) 99 | except Queue.Empty: 100 | if job_q_closed._getvalue().value: 101 | break 102 | sleep(timeout) 103 | 104 | 105 | def _multi_worker(function, take_n, job_q, job_q_closed, result_q, timeout=__timeout__): 106 | while True: 107 | jobs = [] 108 | try: 109 | for _ in xrange(take_n): 110 | job = job_q.get_nowait() 111 | jobs.append(job) 112 | except Queue.Empty: 113 | if job_q_closed._getvalue().value: 114 | break 115 | else: 116 | for i, res in enumerate(function(jobs)): 117 | result_q.put(res) 118 | sleep(timeout) 119 | if len(jobs) > 0: 120 | for i, res in enumerate(function(jobs)): 121 | result_q.put(res) 122 | 123 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/latency_test/latency_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | from RUBRIC.read_until import ReadUntil 4 | import time 5 | import errno 6 | from socket import error as socket_error 7 | import csv 8 | 9 | class MessageStats: 10 | def __init__(self, pre_encode_time, received_time, total_channel_count, total_event_count): 11 | self.pre_encode_time = pre_encode_time 12 | self.received_time = received_time 13 | self.total_channel_count = total_channel_count 14 | self.total_event_count = 
total_event_count 15 | self.time_diff = received_time - pre_encode_time 16 | self.events_per_channel = total_event_count / total_channel_count 17 | 18 | def __repr__(self): 19 | return "MessageStats(pre_encode_time={0}, received_time={1}, total_channel_count={2}, total_event_count={3})".format( 20 | self.pre_encode_time, 21 | self.received_time, 22 | self.total_channel_count, 23 | self.total_event_count) 24 | 25 | class LatencyTestReadUntil(ReadUntil): 26 | 27 | def __init__(self, **kwargs): 28 | self.message_stats = [] 29 | super(LatencyTestReadUntil, self).__init__(**kwargs) 30 | 31 | def received_server_message(self, msg): 32 | if super(LatencyTestReadUntil, self).received_server_message(msg): 33 | print "Messages received: ", len(self.message_stats) 34 | self.message_stats.append(MessageStats(msg.pre_encode_time, 35 | int(time.time()), 36 | len(msg.channels_update), 37 | sum([len(d.events) for d in msg.channels_update.values()]))) 38 | 39 | 40 | class RunningState: 41 | def __init__(self): 42 | self.keep_running=True 43 | 44 | def closed(self, *args): 45 | self.keep_running=False 46 | 47 | def run_latency_test(): 48 | """Runs ReadUntil with particular setup conditions for a given duration, 49 | then moves onto running another set of setup conditions for a given 50 | duration. All the time accumulates statistics about each message coming 51 | back. 
When the test finishes, when all configurations have been run, the 52 | statistics are written to csv files ready for analysis/plotting.""" 53 | host = "ws://localhost:9200" 54 | 55 | 56 | time_and_setup_conditions = [ 57 | (120, {"ignore_first_events": 0, "padding_length_events": 0, "events_length": 100, "repetitions": 1}), 58 | (120, {"ignore_first_events": 0, "padding_length_events": 0, "events_length": 200, "repetitions": 1}), 59 | (120, {"ignore_first_events": 0, "padding_length_events": 0, "events_length": 500, "repetitions": 1}), 60 | (120, {"ignore_first_events": 0, "padding_length_events": 0, "events_length": 800, "repetitions": 1}), 61 | (120, {"ignore_first_events": 0, "padding_length_events": 0, "events_length": 1000, "repetitions": 1}), 62 | (240, {"ignore_first_events": 0, "padding_length_events": 0, "events_length": 1200, "repetitions": 1}), 63 | ] 64 | time_and_setup_iter = iter(time_and_setup_conditions) 65 | total_run_time = sum([x[0] for x in time_and_setup_conditions]) 66 | 67 | state=RunningState() 68 | duration, setup_conditions = time_and_setup_iter.next() 69 | with LatencyTestReadUntil(host=host, 70 | setup_conditions=setup_conditions, 71 | connection_closed=state.closed) as my_client: 72 | # Start sending stuff to our analyser 73 | my_client.start() 74 | change_time = time.time() + duration 75 | print "Client connection started. 
Will run for {0} seconds".format(total_run_time) 76 | while state.keep_running: 77 | time_now = time.time() 78 | if (time_now > change_time): 79 | try: 80 | duration, setup_conditions = time_and_setup_iter.next() 81 | except StopIteration: 82 | my_client.stop() 83 | break 84 | print "Changing to new conditions:", setup_conditions 85 | my_client.update_conditions(setup_conditions) 86 | change_time = time_now + duration 87 | make_report(my_client.message_stats) 88 | 89 | def make_report(ms): 90 | series={} 91 | for m in ms: 92 | series.setdefault(m.events_per_channel, []).append((m.time_diff, m.total_event_count)) 93 | for s, rows in series.items(): 94 | f = open("{0}_events_per_channel.csv".format(s), "ab") 95 | wr = csv.writer(f, delimiter=' ') 96 | for row in rows: 97 | wr.writerow(row) 98 | 99 | 100 | if __name__ == "__main__": 101 | try: 102 | run_latency_test() 103 | except socket_error as serr: 104 | if serr.errno != errno.ECONNREFUSED: 105 | raise serr 106 | print "Server not started?" 
# ---------------------------------------------------------------------------
# /RUBRIC/nanonet/latency_test/run_gnuplot_on_csv_files.py
import subprocess
import glob
import sys

# Sort the report files numerically by their leading events-per-channel prefix.
files = sorted(glob.glob("*_events_per_channel.csv"),
               cmp=lambda x, y: cmp(int(x.split("_")[0]), int(y.split("_")[0])))

gnuplot = subprocess.Popen(["gnuplot"], stdin=subprocess.PIPE)

def plot_args(f):
    # One plot clause per csv file: column 2 (event count) vs column 1 (time).
    return "\"{0}\" using 2:1 title '{0}' with points".format(f)

title = "Time taken in seconds to send different sized messages"
# Assemble the whole gnuplot session up front, then stream it to the process.
commands = [
    "set term dumb 72 40\n",
    "set xlabel 'message size/events'\n",
    "set ylabel 'time/s'\n",
    "set title '{0}'\n".format(title),
    "plot ",
    ", \\\n".join([plot_args(f) for f in files]),
    "\n",
    "ex\n",
]
for command in commands:
    gnuplot.stdin.write(command)
gnuplot.stdin.flush()
# ---------------------------------------------------------------------------
# /RUBRIC/nanonet/nanonettrain.py
#!/usr/bin/env python

import argparse
import json
import os
import sys
import pkg_resources
import tempfile
import numpy as np

from RUBRIC.nanonet import run_currennt_noisy
from RUBRIC.nanonet.cmdargs import FileExist, AutoBool
from RUBRIC.nanonet import iterate_fast5
from RUBRIC.nanonet import make_currennt_training_input_multi
from RUBRIC.nanonet import random_string, conf_line, tang_imap
from RUBRIC.nanonet import network_to_numpy


def get_parser():
    parser = argparse.ArgumentParser(
        description="A simple ANN training wrapper.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
parser.add_argument("--train", action=FileExist, 26 | help="Input training data, either a path to fast5 files or a single netcdf file", required=True) 27 | parser.add_argument("--train_list", action=FileExist, default=None, 28 | help="Strand list constaining training set") 29 | parser.add_argument("--section", default='template', choices=('template', 'complement'), 30 | help="Section of reads to train") 31 | 32 | parser.add_argument("--val", action=FileExist, 33 | help="Input validation data, either a path to fast5 files or a single netcdf file", required=True) 34 | parser.add_argument("--val_list", action=FileExist, default=None, 35 | help="Strand list constaining validation set") 36 | parser.add_argument("--workspace", default=tempfile.gettempdir(), 37 | help="Path for storing training and validation NetCDF files, if not specified a temporary file is used.") 38 | 39 | parser.add_argument("--output", help="Output prefix", required=True) 40 | 41 | parser.add_argument("--model", action=FileExist, 42 | default=pkg_resources.resource_filename('nanonet', 'data/default_model.tmpl'), 43 | help="ANN configuration file") 44 | parser.add_argument("--kmer_length", type=int, default=5, 45 | help="Length of kmers to learn.") 46 | parser.add_argument("--bases", type=str, default='ACGT', 47 | help="Alphabet of kmers to learn.") 48 | 49 | parser.add_argument("--device", type=int, default=0, 50 | help="ID of CUDA device to use.") 51 | parser.add_argument("--cuda", default=False, action=AutoBool, 52 | help="Use CUDA acceleration.") 53 | parser.add_argument("--window", type=int, nargs='+', default=[-1, 0, 1], 54 | help="The detailed list of the entire window.") 55 | 56 | training_parameter_group = parser.add_argument_group("Training Parameters.") 57 | training_parameter_group.add_argument("--max_epochs", type=int, default=500, 58 | help="Max training epocs, default 500") 59 | training_parameter_group.add_argument("--max_epochs_no_best", type=int, default=50, 60 | help="Stop training 
when no improvment for number of epocs, default 50" ) 61 | training_parameter_group.add_argument("--validate_every", type=int, default=5, 62 | help="Run validation data set every number of epocs.") 63 | training_parameter_group.add_argument("--parallel_sequences", type=int, default=125, 64 | help="Number of sequences in a min-batch") 65 | training_parameter_group.add_argument("--learning_rate", type=float, default=1e-5, 66 | help="Learning rate parameters of SGD." ) 67 | training_parameter_group.add_argument("--momentum", type=float, default=0.9, 68 | help="Momentum parameter of SGD." ) 69 | training_parameter_group.add_argument("--cache_path", default=tempfile.gettempdir(), 70 | help="Path for currennt temporary files.") 71 | 72 | return parser 73 | 74 | 75 | def prepare_input_file(in_out, **kwargs): 76 | path, in_list, output = in_out 77 | 78 | print "Creating training data NetCDF: {}".format(output) 79 | fast5_files = list(iterate_fast5(path, paths=True, strand_list=in_list)) 80 | return make_currennt_training_input_multi( 81 | fast5_files=fast5_files, 82 | netcdf_file=output, 83 | **kwargs 84 | ) 85 | 86 | 87 | def main(): 88 | if len(sys.argv) == 1: 89 | sys.argv.append("-h") 90 | args = get_parser().parse_args() 91 | 92 | if not args.cuda: 93 | args.nseqs = 1 94 | 95 | if not os.path.exists(args.workspace): 96 | os.makedirs(args.workspace) 97 | 98 | # file names for training 99 | tag = random_string() 100 | modelfile = os.path.abspath(args.model) 101 | outputfile = os.path.abspath(args.output) 102 | temp_name = os.path.abspath(os.path.join( 103 | args.workspace, 'nn_data_{}_'.format(tag) 104 | )) 105 | config_name = os.path.abspath(os.path.join( 106 | args.workspace, 'nn_{}.cfg'.format(tag) 107 | )) 108 | 109 | # Create currennt training input files 110 | trainfile = '{}{}'.format(temp_name, 'train.netcdf') 111 | valfile = '{}{}'.format(temp_name, 'validation.netcdf') 112 | inputs = ( 113 | (args.train, args.train_list, trainfile), 114 | (args.val, 
args.val_list, valfile), 115 | ) 116 | fix_kwargs = { 117 | 'window':args.window, 118 | 'kmer_len':args.kmer_length, 119 | 'alphabet':args.bases, 120 | 'callback_kwargs':{'section':args.section, 'kmer_len':args.kmer_length} 121 | } 122 | for results in tang_imap(prepare_input_file, inputs, fix_kwargs=fix_kwargs, threads=2): 123 | n_chunks, n_features, out_kmers = results 124 | if n_chunks == 0: 125 | raise RuntimeError("No training data written.") 126 | 127 | 128 | # fill-in templated items in model 129 | n_states = len(out_kmers) 130 | with open(modelfile, 'r') as model: 131 | mod = model.read() 132 | mod = mod.replace('
', args.section) 133 | mod = mod.replace('', str(n_features)) 134 | mod = mod.replace('', str(n_states)) 135 | try: 136 | mod_meta = json.loads(mod)['meta'] 137 | except Exception as e: 138 | mod_meta = dict() 139 | mod_meta['n_features'] = n_features 140 | mod_meta['kmers'] = out_kmers 141 | mod_meta['window'] = args.window 142 | 143 | modelfile = os.path.abspath(os.path.join( 144 | args.workspace, 'input_model.jsn' 145 | )) 146 | with open(modelfile, 'w') as model: 147 | model.write(mod) 148 | final_network = "{}_final.jsn".format(outputfile) 149 | best_network_prefix = "{}_auto".format(outputfile) 150 | # currennt appends some bits here 151 | 152 | # currennt cfg files 153 | with open(config_name, 'w') as currennt_cfg: 154 | if not args.cuda: 155 | currennt_cfg.write(conf_line('cuda', 'false')) 156 | # IO 157 | currennt_cfg.write(conf_line("cache_path", args.cache_path)) 158 | currennt_cfg.write(conf_line("network", modelfile)) 159 | currennt_cfg.write(conf_line("train_file", trainfile)) 160 | currennt_cfg.write(conf_line("val_file", valfile)) 161 | currennt_cfg.write(conf_line("save_network", final_network)) 162 | currennt_cfg.write(conf_line("autosave_prefix", best_network_prefix)) 163 | # Tunable parameters 164 | currennt_cfg.write(conf_line("max_epochs", args.max_epochs)) 165 | currennt_cfg.write(conf_line("max_epochs_no_best", args.max_epochs_no_best)) 166 | currennt_cfg.write(conf_line("validate_every", args.validate_every)) 167 | currennt_cfg.write(conf_line("parallel_sequences", args.parallel_sequences)) 168 | currennt_cfg.write(conf_line("learning_rate", args.learning_rate)) 169 | currennt_cfg.write(conf_line("momentum", args.momentum)) 170 | # Fixed parameters 171 | currennt_cfg.write(conf_line("train", "true")) 172 | currennt_cfg.write(conf_line("weights_dist", "normal")) 173 | currennt_cfg.write(conf_line("weights_normal_sigma", "0.1")) 174 | currennt_cfg.write(conf_line("weights_normal_mean", "0")) 175 | currennt_cfg.write(conf_line("stochastic", 
"true")) 176 | currennt_cfg.write(conf_line("input_noise_sigma", "0.0")) 177 | currennt_cfg.write(conf_line("shuffle_fractions", "false")) 178 | currennt_cfg.write(conf_line("shuffle_sequences", "true")) 179 | currennt_cfg.write(conf_line("autosave_best", "true")) 180 | 181 | # run currennt 182 | print "\n\nRunning currennt with: {}".format(config_name) 183 | run_currennt_noisy(config_name, device=args.device) 184 | 185 | # Currennt won't pass through our meta in the model, amend the output 186 | # and write out a numpy version of the network 187 | best_network = "{}.best.jsn".format(best_network_prefix) 188 | best_network_numpy = "{}_best.npy".format(outputfile) 189 | 190 | print "Adding model meta to currennt best network: {}".format(best_network) 191 | mod = json.load(open(best_network, 'r')) 192 | mod['meta'] = mod_meta 193 | json.dump(mod, open(best_network, 'w')) 194 | print "Transforming network to numpy pickle: {}".format(best_network_numpy) 195 | mod = network_to_numpy(mod) 196 | np.save(best_network_numpy, mod) 197 | 198 | 199 | 200 | if __name__ == '__main__': 201 | main() 202 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/resolve.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from datetime import datetime 3 | import glob 4 | import os 5 | import re 6 | import sys 7 | 8 | reads_dir = sys.argv[1] 9 | 10 | sampler_reads = {str(n+1): {} for n in range(512)} 11 | with open('read_log.csv', 'rb') as log_file: 12 | reader = csv.reader(log_file) 13 | reader.next() # skip header 14 | i = 0 15 | for row in reader: 16 | [date_str, channel_name, read_number, delay] = row 17 | date = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S.%f') 18 | sampler_reads[channel_name][read_number] = date 19 | 20 | with open('missing_files.csv', 'wb') as missing_f, open('matched_files.csv', 'wb') as matched_f: 21 | missing = csv.writer(missing_f) 22 | 
missing.writerow(["Channel", "Read", "File"]) 23 | matched = csv.writer(matched_f) 24 | matched.writerow(["Channel", "Read", "Time from seen to written", "File"]) 25 | 26 | regex = re.compile('.*_ch([0-9]+)_read([0-9]+)_.*\\.fast5') 27 | for (dirpath, dirnames, filenames) in os.walk(reads_dir): 28 | for filename in filenames: 29 | match = regex.match(filename) 30 | if match: 31 | channel_name = match.group(1) 32 | read_number = match.group(2) 33 | pretty_path = os.path.join(os.path.basename(dirpath), filename) 34 | try: 35 | date = sampler_reads[channel_name][read_number] 36 | except KeyError: 37 | missing.writerow([channel_name, read_number, pretty_path]) 38 | else: 39 | path = os.path.join(dirpath, filename) 40 | last_mod = datetime.fromtimestamp(os.path.getmtime(path)) 41 | matched.writerow([channel_name, read_number, str(last_mod - date), pretty_path]) 42 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/sample_data/904896_ch170_read104_strand.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sandialabs/RUBRIC/2895b15ded82a85142f4d68e4feb5d5526aebb09/RUBRIC/nanonet/sample_data/904896_ch170_read104_strand.fast5 -------------------------------------------------------------------------------- /RUBRIC/nanonet/sample_data/904896_ch170_read105_strand.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sandialabs/RUBRIC/2895b15ded82a85142f4d68e4feb5d5526aebb09/RUBRIC/nanonet/sample_data/904896_ch170_read105_strand.fast5 -------------------------------------------------------------------------------- /RUBRIC/nanonet/sample_data/904896_ch170_read108_strand.fast5: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sandialabs/RUBRIC/2895b15ded82a85142f4d68e4feb5d5526aebb09/RUBRIC/nanonet/sample_data/904896_ch170_read108_strand.fast5 -------------------------------------------------------------------------------- /RUBRIC/nanonet/sample_data/904896_ch170_read111_strand.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sandialabs/RUBRIC/2895b15ded82a85142f4d68e4feb5d5526aebb09/RUBRIC/nanonet/sample_data/904896_ch170_read111_strand.fast5 -------------------------------------------------------------------------------- /RUBRIC/nanonet/sample_data/904896_ch170_read114_strand.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sandialabs/RUBRIC/2895b15ded82a85142f4d68e4feb5d5526aebb09/RUBRIC/nanonet/sample_data/904896_ch170_read114_strand.fast5 -------------------------------------------------------------------------------- /RUBRIC/nanonet/test/test_nn.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from RUBRIC import nanonet as nn 4 | import numpy as np 5 | 6 | class ANNTest(unittest.TestCase): 7 | @classmethod 8 | def setUpClass(self): 9 | print '* ANN' 10 | np.random.seed(0xdeadbeef) 11 | self.prec = 6 12 | self._NSTEP = 10 13 | self._NFEATURES = 4 14 | self._SIZE = 5 15 | 16 | self.W = np.random.normal(size=(self._NFEATURES, self._SIZE)) 17 | self.b = np.random.normal(size=self._SIZE) 18 | self.x = np.random.normal(size=(self._NSTEP, self._NFEATURES)) 19 | self.res = (self.x.dot(self.W) + self.b).astype(nn.dtype) 20 | 21 | def test_000_single_layer_linear(self): 22 | network = nn.FeedForward(self.W, self.b, nn.linear) 23 | self.assertEqual(network.in_size, self._NFEATURES) 24 | self.assertEqual(network.out_size, self._SIZE) 25 | np.testing.assert_array_almost_equal(network.run(self.x), self.res) 26 | 27 | def test_001_single_layer_tanh(self): 
28 | network = nn.FeedForward(self.W, self.b, nn.tanh) 29 | self.assertEqual(network.in_size, self._NFEATURES) 30 | self.assertEqual(network.out_size, self._SIZE) 31 | np.testing.assert_array_almost_equal(network.run(self.x), np.tanh(self.res)) 32 | 33 | def test_002_parallel_layers(self): 34 | l1 = nn.FeedForward(self.W, self.b, nn.tanh) 35 | l2 = nn.FeedForward(self.W, self.b, nn.tanh) 36 | network = nn.Parallel([l1, l2]) 37 | self.assertEqual(network.in_size, self._NFEATURES) 38 | self.assertEqual(network.out_size, 2 * self._SIZE) 39 | 40 | res = network.run(self.x) 41 | np.testing.assert_array_equal(res[:,:self._SIZE], res[:,self._SIZE:]) 42 | 43 | def test_003_simple_serial(self): 44 | W2 = np.random.normal(size=(self._SIZE, self._SIZE)) 45 | res = self.x.dot(self.W).dot(W2) 46 | 47 | l1 = nn.FeedForward(self.W, fun=nn.linear) 48 | l2 = nn.FeedForward(W2, fun=nn.linear) 49 | network = nn.Serial([l1, l2]) 50 | self.assertEqual(network.in_size, self._NFEATURES) 51 | self.assertEqual(network.out_size, self._SIZE) 52 | 53 | np.testing.assert_array_almost_equal(network.run(self.x), res) 54 | 55 | def test_004_reverse(self): 56 | network1 = nn.FeedForward(self.W, self.b, nn.tanh) 57 | res1 = network1.run(self.x) 58 | network2 = nn.Reverse(network1) 59 | res2 = network2.run(self.x) 60 | self.assertEqual(network1.in_size, network2.in_size,) 61 | self.assertEqual(network1.out_size, network2.out_size) 62 | 63 | np.testing.assert_array_equal(res1, res2, self.prec) 64 | 65 | def test_005_poormans_birnn(self): 66 | layer1 = nn.FeedForward(self.W, self.b, nn.tanh) 67 | layer2 = nn.FeedForward(self.W, self.b, nn.tanh) 68 | network = nn.BiRNN(layer1, layer2) 69 | 70 | res = network.run(self.x) 71 | np.testing.assert_array_equal(res[:,:self._SIZE], res[:,self._SIZE:], self.prec) 72 | 73 | def test_006_softmax(self): 74 | network = nn.SoftMax(self.W, self.b) 75 | 76 | res = network.run(self.x) 77 | res_sum = res.sum(axis=1) 78 | self.assertTrue(np.allclose(res_sum, 1.0)) 79 | 
80 | def test_007_rnn_no_state(self): 81 | W1 = np.vstack((np.zeros((self._SIZE, self._SIZE)), self.W)) 82 | network = nn.SimpleRNN(W1, b=self.b, fun=nn.linear) 83 | 84 | res = network.run(self.x) 85 | np.testing.assert_almost_equal(res, self.res, self.prec) 86 | 87 | def test_008_rnn_no_input(self): 88 | W1 = np.random.normal(size=(self._SIZE, self._SIZE)) 89 | W2 = np.vstack((W1, np.zeros((self._NFEATURES, self._SIZE)))) 90 | network = nn.SimpleRNN(W2, fun=nn.linear) 91 | 92 | res = network.run(self.x) 93 | np.testing.assert_almost_equal(res, 0.0, self.prec) 94 | 95 | def test_009_rnn_no_input_with_bias(self): 96 | W1 = np.random.normal(size=(self._SIZE, self._SIZE)) 97 | W2 = np.vstack((W1, np.zeros((self._NFEATURES, self._SIZE)))) 98 | network = nn.SimpleRNN(W2, b=self.b, fun=nn.linear) 99 | 100 | res = network.run(self.x) 101 | res2 = np.zeros(self._SIZE, dtype=nn.dtype) 102 | for i in xrange(self._NSTEP): 103 | res2 = res2.dot(W1) + self.b 104 | np.testing.assert_allclose(res[i], res2, self.prec) 105 | 106 | def test_010_birnn_no_input_with_bias(self): 107 | W1 = np.random.normal(size=(self._SIZE, self._SIZE)) 108 | W2 = np.vstack((W1, np.zeros((self._NFEATURES, self._SIZE)))) 109 | layer1 = nn.SimpleRNN(W2, b=self.b) 110 | layer2 = nn.SimpleRNN(W2, b=self.b) 111 | network = nn.BiRNN(layer1, layer2) 112 | 113 | res = network.run(self.x) 114 | np.testing.assert_almost_equal(res[:,:self._SIZE], res[::-1,self._SIZE:], self.prec) 115 | -------------------------------------------------------------------------------- /RUBRIC/nanonet/watcher.py: -------------------------------------------------------------------------------- 1 | import time 2 | from multiprocessing import Process, Queue 3 | 4 | try: 5 | from watchdog.observers import Observer 6 | from watchdog.events import RegexMatchingEventHandler 7 | except ImportError: 8 | raise ImportError('Nanonet component error: cannot import optional watchdog module. 
Install with pip.') 9 | 10 | 11 | class Fast5Watcher(object): 12 | 13 | def __init__(self, path, timeout=10, regex='.*\.fast5$', initial_jobs=None): 14 | """Watch a path and yield modified files 15 | 16 | :param path: path to watch for files. 17 | :param timeout: timeout period for newly modified files. 18 | :param regex: regex filter for files to consifer. 19 | :param initial_jobs: pre-existing files to process. 20 | """ 21 | self.path = path 22 | self.timeout = timeout 23 | self.regex = regex 24 | self.initial_jobs = initial_jobs 25 | self.q = Queue() 26 | self.watcher = Process(target=self._watcher) 27 | self.yielded = set() 28 | 29 | def _watcher(self): 30 | handler = RegexMatchingEventHandler(regexes=[self.regex], ignore_directories=True) 31 | handler.on_modified = lambda x: self.q.put(x.src_path) 32 | observer = Observer() 33 | observer.schedule(handler, self.path) 34 | observer.start() 35 | try: 36 | while True: 37 | time.sleep(1) 38 | except KeyboardInterrupt: 39 | observer.stop() 40 | observer.join() 41 | 42 | def __iter__(self): 43 | self.watcher.start() 44 | 45 | if self.initial_jobs is not None: 46 | for item in self.initial_jobs: 47 | if item not in self.yielded: 48 | yield item 49 | self.yielded.add(item) 50 | 51 | while True: 52 | try: 53 | item = self.q.get(True, self.timeout) 54 | except: 55 | break 56 | else: 57 | if item not in self.yielded: 58 | yield item 59 | self.yielded.add(item) 60 | self.watcher.terminate() 61 | -------------------------------------------------------------------------------- /RUBRIC/setup_nanonet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import re 4 | import sys 5 | 6 | import numpy 7 | from setuptools import setup, find_packages, Extension 8 | 9 | print """ 10 | ******************************************************************* 11 | This Source Code Form is subject to the terms of the Mozilla Public 12 | License, v. 2.0. 
If a copy of the MPL was not distributed with this 13 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 14 | 15 | (c) 2016 Oxford Nanopore Technologies Ltd. 16 | ******************************************************************* 17 | """ 18 | 19 | # Get the version number from __init__.py 20 | pkg_name = 'nanonet' 21 | pkg_path = os.path.join(os.path.dirname(__file__), pkg_name) 22 | verstrline = open(os.path.join(pkg_path, '__init__.py'), 'r').read() 23 | vsre = r"^__version__ = ['\"]([^'\"]*)['\"]" 24 | mo = re.search(vsre, verstrline, re.M) 25 | if mo: 26 | version = mo.group(1) 27 | else: 28 | raise RuntimeError('Unable to find version string in "{}/__init__.py".'.format(pkg_name)) 29 | 30 | system = platform.system() 31 | print "System is {}".format(system) 32 | print "By default the 2D basecaller (standard and OpenCL) are not built." 33 | print "To enable these use 'with2d' and 'opencl2d' command line options." 34 | print 35 | 36 | with_2d = True if 'with2d' in sys.argv else False 37 | if with_2d: 38 | sys.argv.remove('with2d') 39 | 40 | opencl_2d = True if 'opencl2d' in sys.argv else False 41 | if opencl_2d: 42 | with_2d = True 43 | sys.argv.remove('opencl2d') 44 | 45 | mingw = True if "mingw" in sys.argv else False 46 | if mingw: 47 | sys.argv.remove('mingw') 48 | # patch distutils to force our compiler class. With distutils build 49 | # command we can use the commandline option to set compiler but 50 | # develop command does accept this. C++ extensions also aren't 51 | # recognised as being unchanged meaning they get built twice with 52 | # build --compiler=mingw32 develop 53 | # all very annoying. 
54 | import distutils.cygwinccompiler 55 | from nanoccompiler import Mingw64CCompiler 56 | 57 | distutils.cygwinccompiler.Mingw32CCompiler = Mingw64CCompiler 58 | distutils.ccompiler.get_default_compiler = lambda x: 'mingw32' 59 | 60 | main_include = os.path.join(os.path.dirname(__file__), 'nanonet', 'include') 61 | include_dirs = [main_include] 62 | event_detect_include = [] 63 | boost_inc = [] 64 | boost_lib_path = [] 65 | boost_libs = [] 66 | opencl_include = [] 67 | opencl_lib_path = [] 68 | opencl_libs = [] 69 | 70 | c_compile_args = ['-pedantic', '-Wall', '-std=c99'] 71 | cpp_compile_args = [] 72 | optimisation = ['-DNDEBUG'] 73 | 74 | if system == 'Darwin': 75 | print "Adding OSX compile/link options" 76 | optimisation.extend(['-O3', '-fstrict-aliasing']) 77 | cpp_compile_args.extend(['-std=c++0x', '-Wno-unused-local-typedefs']) 78 | # may wish to edit - required for 2D 79 | boost_inc = ['/opt/local/include/'] 80 | boost_libs.append('boost_python-mt') 81 | elif system == 'Windows': 82 | event_detect_include.append(os.path.join(pkg_path, 'eventdetection')) 83 | if not mingw: 84 | print "Adding windows (MSVC) compile/link options" 85 | optimisation = ['/O2', '/Gs-'] 86 | c_compile_args = ['/wd4820'] 87 | cpp_compile_args.extend(['/EHsc', '/wd4996']) 88 | include_dirs.append(os.path.join(main_include, 'extras')) 89 | boost_location = os.path.join('c:', os.sep, 'local', 'boost_1_55_0') 90 | boost_lib_name = 'lib64-msvc-9.0' 91 | if opencl_2d: 92 | raise NotImplementedError('OpenCL 2D caller not currently supported on Windows with MSVC.') 93 | else: 94 | print "Adding windows (mingw64) compile/link options" 95 | optimisation.extend(['-O3', '-fstrict-aliasing']) 96 | c_compile_args.extend(['-DMS_WIN64', '-D_hypot=hypot']) 97 | cpp_compile_args.extend(['-DMS_WIN64', '-D_hypot=hypot', '-Wno-unused-local-typedefs']) 98 | boost_location = os.environ.get( 99 | 'BOOST_ROOT', os.path.join('c:', os.sep, 'local', 'boost_1_55_0')) 100 | boost_lib_name = os.environ.get( 101 
| 'BOOST_LIB', os.path.join('stage', 'lib')) 102 | boost_libs.append( 103 | os.environ.get('BOOST_PYTHON', 'boost_python-mgw48-mt-1_55')) 104 | # may wish to edit - required for OpenCL 2D, this will compile 105 | # but likely die at runtime. 106 | if opencl_2d: 107 | raise NotImplementedError('OpenCL 2D caller not currently supported on Windows with mingw64.') 108 | # nvidia_opencl = os.path.join('c:', os.sep, 109 | # 'Program Files', 'NVIDIA GPU Computing Toolkit', 'CUDA', 'v7.5') 110 | # opencl_include = [os.environ.get('OPENCL_INC', os.path.join(nvidia_opencl, 'include'))] 111 | # opencl_lib_path = [os.environ.get('OPENCL_LIB', os.path.join(nvidia_opencl, 'lib', 'x64'))] 112 | # opencl_libs.append('OpenCL') 113 | boost_lib_path = [os.path.join(boost_location, boost_lib_name)] 114 | boost_inc = [boost_location] 115 | else: 116 | print "Adding Linux(?) compile/link options" 117 | optimisation.extend(['-O3', '-fstrict-aliasing']) 118 | cpp_compile_args.extend(['-std=c++0x', '-Wno-unused-local-typedefs']) 119 | boost_libs.append('boost_python') 120 | # may wish to edit - required for OpenCL 2D 121 | opencl_include = [os.environ.get('OPENCL_INC')] 122 | opencl_lib_path = [os.environ.get('OPENCL_LIB', os.path.join(os.sep, 'opt', 'intel', 'opencl'))] 123 | opencl_libs.append('OpenCL') 124 | c_compile_args.extend(optimisation) 125 | cpp_compile_args.extend(optimisation) 126 | 127 | extensions = [] 128 | 129 | extensions.append(Extension( 130 | 'nanonetfilters', 131 | sources=[os.path.join(pkg_path, 'eventdetection', 'filters.c')], 132 | include_dirs=include_dirs + event_detect_include, 133 | extra_compile_args=c_compile_args 134 | )) 135 | 136 | extensions.append(Extension( 137 | 'nanonetdecode', 138 | sources=[os.path.join(pkg_path, 'decoding.cpp')], 139 | include_dirs=include_dirs, 140 | extra_compile_args=cpp_compile_args 141 | )) 142 | 143 | if with_2d: 144 | caller_2d_path = os.path.join('nanonet', 'caller_2d') 145 | extensions.append(Extension( 146 | 
'nanonet.caller_2d.viterbi_2d.viterbi_2d', 147 | include_dirs=[os.path.join(caller_2d_path, x) for x in 148 | ('viterbi_2d', 'common')] + 149 | [numpy.get_include()] + boost_inc + include_dirs, 150 | sources=[os.path.join(caller_2d_path, 'viterbi_2d', x) for x in 151 | ('viterbi_2d_py.cpp', 'viterbi_2d.cpp')], 152 | depends=[os.path.join(caller_2d_path, x) for x in 153 | ('viterbi_2d_py.h', 'viterbi_2d.h')] + 154 | [os.path.join(caller_2d_path, 'common', x) for x in 155 | ('bp_tools.h', 'data_view.h', 'utils.h', 'view_numpy_arrays.h')], 156 | extra_compile_args=cpp_compile_args, 157 | library_dirs=boost_lib_path, 158 | libraries=boost_libs 159 | )) 160 | 161 | extensions.append(Extension( 162 | 'nanonet.caller_2d.pair_align.pair_align', 163 | include_dirs=[os.path.join(caller_2d_path, 'pair_align')] + 164 | boost_inc + include_dirs, 165 | sources=[os.path.join(caller_2d_path, 'pair_align', x) for x in 166 | ('pair_align_py.cpp', 'nw_align.cpp', 'mm_align.cpp')], 167 | depends=[os.path.join(caller_2d_path, 'pair_align', x) for x in 168 | ('pair_align_py.h', 'pair_align.h', 'nw_align.h', 'mm_align.h')], 169 | extra_compile_args=cpp_compile_args, 170 | library_dirs=boost_lib_path, 171 | libraries=boost_libs 172 | )) 173 | 174 | extensions.append(Extension( 175 | 'nanonet.caller_2d.common.stub', 176 | include_dirs=[os.path.join(caller_2d_path, 'common')] + 177 | [numpy.get_include()] + boost_inc + include_dirs, 178 | sources=[os.path.join(caller_2d_path, 'common', 'stub_py.cpp')], 179 | depends=[os.path.join(caller_2d_path, 'common', x) for x in 180 | ('bp_tools.h', 'data_view.h', 'utils.h', 'view_numpy_arrays.h')], 181 | extra_compile_args=cpp_compile_args, 182 | library_dirs=boost_lib_path, 183 | libraries=boost_libs 184 | )) 185 | 186 | if opencl_2d: 187 | print "Setting up OpenCL 2D basecall extension, this may need some tinkering" 188 | extensions.append(Extension( 189 | 'nanonet.caller_2d.viterbi_2d_ocl.viterbi_2d_ocl', 190 | 
include_dirs=[os.path.join(caller_2d_path, x) for x in 191 | ('viterbi_2d_ocl', 'common')] + 192 | [numpy.get_include()] + boost_inc + include_dirs + opencl_include, 193 | sources=[os.path.join(caller_2d_path, 'viterbi_2d_ocl', x) for x in 194 | ('viterbi_2d_ocl_py.cpp', 'viterbi_2d_ocl.cpp', 'proxyCL.cpp')], 195 | depends=[os.path.join(caller_2d_path, 'viterbi_2d_ocl', x) for x in 196 | ('viterbi_2d_ocl.py.h', 'viterbi_2d_ocl.h', 'proxyCL.h')] + 197 | [os.path.join(caller_2d_path, 'common', x) for x in 198 | ('bp_tools.h', 'data_view.h', 'utils.h', 'view_numpy_arrays.h')], 199 | extra_compile_args=cpp_compile_args, 200 | library_dirs=boost_lib_path + opencl_lib_path, 201 | libraries=boost_libs + opencl_libs 202 | )) 203 | 204 | requires = [ 205 | 'h5py', 206 | 'myriad >=0.1.2', 207 | 'numpy', 208 | ] 209 | extra_requires = { 210 | 'currennt': ['netCDF4'], 211 | 'watcher': ['watchdog'], 212 | 'opencl': ['pyopencl'], 213 | 'simulate': ['biopython'], 214 | } 215 | 216 | # Making a whl for windows 217 | bdist_args = dict() 218 | if system == 'Windows' and "bdist_wheel" in sys.argv: 219 | from setuptools import Distribution 220 | from distutils.spawn import find_executable 221 | from glob import glob 222 | 223 | 224 | class BinaryDistribution(Distribution): 225 | def is_pure(self): 226 | return False 227 | 228 | def has_ext_modules(self): 229 | return True 230 | 231 | 232 | blibs = [os.path.join(boost_location, boost_lib_name, 'lib{}.dll'.format(x)) for x in boost_libs] 233 | mingwdir = os.path.dirname(find_executable('gcc')) 234 | mingwlibs = glob(os.path.join(mingwdir, '*.dll')) 235 | mingwlibs = [os.path.join(mingwdir, x) for x in mingwlibs] 236 | dlls = [os.path.relpath(x) for x in blibs + mingwlibs] 237 | bdist_args = { 238 | 'scripts': dlls, 239 | 'distclass': BinaryDistribution 240 | } 241 | 242 | setup( 243 | name='nanonet', 244 | version=version, 245 | description='A simple recurrent neural network based basecaller nanopore data.', 246 | maintainer='Chris 
Wright', 247 | maintainer_email='chris.wright@nanoporetech.com', 248 | url='http://www.nanoporetech.com', 249 | packages=find_packages(exclude=["*.test", "*.test.*", "test.*", "test"]), 250 | package_data={'nanonet.data': ['nanonet/data/*']}, 251 | include_package_data=True, 252 | tests_require=requires, 253 | install_requires=requires, 254 | extras_require=extra_requires, 255 | dependency_links=[], 256 | zip_safe=True, 257 | ext_modules=extensions, 258 | test_suite='discover_tests', 259 | entry_points={ 260 | 'console_scripts': [ 261 | 'nanonetcall = nanonet.nanonetcall:main', 262 | 'nanonet2d = nanonet.nanonetcall_2d:main', 263 | 'nanonettrain = nanonet.nanonettrain:main', 264 | 'simulate_minion = nanonet.simulate.simulate_minion:main', 265 | ] 266 | }, 267 | **bdist_args 268 | ) 269 | 270 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import setup 4 | 5 | setup( 6 | name='RUBRIC', 7 | version='1.0', 8 | packages=['RUBRIC'], 9 | url='https://github.com/ragak/RUBRIC', 10 | license='MPL 2.0', 11 | author='Harrison Edwards, Raga Krishnakumar, and Michael Bartsch', 12 | author_email='harrison.edwards@mail.utoronto.ca', 13 | description='Read-Until with Basecalling and Reference-Informed Criteria (RUBRIC)', 14 | install_requires=['configargparse', 'h5py', 'numpy', 'termcolor', 'thrift==0.9.2', 'ws4py', 'biopython', 'psutil'] 15 | ) 16 | 17 | # some extra lines to run the nanonet setup file... 18 | cwd = os.getcwd() 19 | cmd_string = os.path.join(cwd, 'RUBRIC', 'setup_nanonet.py') 20 | cmd_string = 'python ' + cmd_string + ' develop' 21 | os.system(cmd_string) 22 | 23 | 24 | 25 | --------------------------------------------------------------------------------