├── MANIFEST.in ├── dev_requirements.txt ├── fast5_research ├── test │ ├── data │ │ ├── recursive │ │ │ ├── 1 │ │ │ │ └── fake.fast5 │ │ │ ├── 2 │ │ │ │ └── fake.fast5 │ │ │ ├── fake1.fast5 │ │ │ ├── fake2.fast5 │ │ │ └── fake3.fast5 │ │ ├── example_template.bc_scale │ │ ├── abf2bulkfast5.fast5 │ │ ├── elec3_example.fast5 │ │ ├── example_template.map_scale │ │ ├── additional_test_file.fast5 │ │ ├── example_basecall_squiggle_mapping.fast5 │ │ ├── example_template.bc_path │ │ ├── example_template.map_path │ │ ├── example_template.events │ │ └── example_template.map_post │ ├── test_iterate.py │ ├── test_fast5_basecalling_and_mapping.py │ ├── test_fast5.py │ └── test_fast5_bulk.py ├── __init__.py ├── util.py ├── extract.py └── fast5_bulk.py ├── .gitignore ├── requirements.txt ├── .travis.yml ├── docs ├── index.rst ├── cmdline.rst ├── examples.rst └── conf.py ├── Makefile ├── setup.py ├── CHANGELOG.md ├── README.md └── LICENSE.md /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | -------------------------------------------------------------------------------- /dev_requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools 2 | nose 3 | -------------------------------------------------------------------------------- /fast5_research/test/data/recursive/1/fake.fast5: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fast5_research/test/data/recursive/2/fake.fast5: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fast5_research/test/data/recursive/fake1.fast5: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fast5_research/test/data/recursive/fake2.fast5: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fast5_research/test/data/recursive/fake3.fast5: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *~ 3 | *.swp 4 | *.swo 5 | *.so 6 | *.egg-info 7 | .eggs 8 | build 9 | dist 10 | docs/_build/ 11 | venv* 12 | -------------------------------------------------------------------------------- /fast5_research/test/data/example_template.bc_scale: -------------------------------------------------------------------------------- 1 | -0.00211363485711 1.00484079515 -0.00629128442239 0.647664177638 0.933711803503 28.3709450329 2 | -------------------------------------------------------------------------------- /fast5_research/test/data/abf2bulkfast5.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/fast5_research/HEAD/fast5_research/test/data/abf2bulkfast5.fast5 -------------------------------------------------------------------------------- /fast5_research/test/data/elec3_example.fast5: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nanoporetech/fast5_research/HEAD/fast5_research/test/data/elec3_example.fast5 -------------------------------------------------------------------------------- /fast5_research/test/data/example_template.map_scale: -------------------------------------------------------------------------------- 1 | -0.000425261825492 1.00000660841 2.07396170589e-05 0.0288695991923 0.926918438703 579.501041091 2 | -------------------------------------------------------------------------------- /fast5_research/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.2.23' 2 | 3 | from fast5_research.fast5 import Fast5, iterate_fast5 4 | from fast5_research.fast5_bulk import BulkFast5 5 | -------------------------------------------------------------------------------- /fast5_research/test/data/additional_test_file.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/fast5_research/HEAD/fast5_research/test/data/additional_test_file.fast5 -------------------------------------------------------------------------------- /fast5_research/test/data/example_basecall_squiggle_mapping.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/fast5_research/HEAD/fast5_research/test/data/example_basecall_squiggle_mapping.fast5 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # fast5_research requirements. 2 | # Use comment lines or inline comments to document why we are using particular versions 3 | futures 4 | h5py<2.9.0 # causes some tests to fail 5 | numpy>=1.14.0 # 1.14 made some relatively big changes 6 | progressbar2 7 | pysam 8 | 9 | -------------------------------------------------------------------------------- /fast5_research/test/data/example_template.bc_path: -------------------------------------------------------------------------------- 1 | 8.280000000000000000e+02 2 | 2.410000000000000000e+02 3 | 9.660000000000000000e+02 4 | 7.930000000000000000e+02 5 | 1.000000000000000000e+02 6 | 4.030000000000000000e+02 7 | 5.910000000000000000e+02 8 | 3.190000000000000000e+02 9 | 2.520000000000000000e+02 10 | 1.009000000000000000e+03 11 | 9.650000000000000000e+02 12 | 7.900000000000000000e+02 13 | 8.800000000000000000e+01 14 | 3.520000000000000000e+02 15 | 3.840000000000000000e+02 16 | 5.130000000000000000e+02 17 | 5.000000000000000000e+00 18 | 2.300000000000000000e+01 19 | 9.400000000000000000e+01 20 | 3.780000000000000000e+02 21 | 4.900000000000000000e+02 22 | 9.370000000000000000e+02 23 | 6.760000000000000000e+02 24 | 6.560000000000000000e+02 25 | 5.790000000000000000e+02 26 | -------------------------------------------------------------------------------- /fast5_research/test/data/example_template.map_path: -------------------------------------------------------------------------------- 1 | 0.000000000000000000e+00 2 | 1.000000000000000000e+00 3 | 2.000000000000000000e+00 4 | 3.000000000000000000e+00 5 | 4.000000000000000000e+00 6 | 5.000000000000000000e+00 7 | 6.000000000000000000e+00 8 | 7.000000000000000000e+00 9 | 8.000000000000000000e+00 10 | 9.000000000000000000e+00 11 | 1.000000000000000000e+01 12 | 1.100000000000000000e+01 13 | 1.200000000000000000e+01 14 | 1.300000000000000000e+01 15 | 1.400000000000000000e+01 16 | 
1.500000000000000000e+01 17 | 1.600000000000000000e+01 18 | 1.700000000000000000e+01 19 | 1.800000000000000000e+01 20 | 1.900000000000000000e+01 21 | 2.000000000000000000e+01 22 | 2.100000000000000000e+01 23 | 2.200000000000000000e+01 24 | 2.300000000000000000e+01 25 | 2.400000000000000000e+01 26 | -------------------------------------------------------------------------------- /fast5_research/test/data/example_template.events: -------------------------------------------------------------------------------- 1 | mean stdv length start 2 | 57.307656 1.352701 1.000000 0.000000 3 | 58.422004 1.185755 1.000000 1.000000 4 | 65.627289 1.191993 1.000000 2.000000 5 | 68.845879 1.104547 1.000000 3.000000 6 | 71.449841 1.184341 1.000000 4.000000 7 | 76.549487 1.253127 1.000000 5.000000 8 | 69.556924 1.515694 1.000000 6.000000 9 | 54.685719 1.612181 1.000000 7.000000 10 | 52.595864 1.167940 1.000000 8.000000 11 | 57.696494 1.216995 1.000000 9.000000 12 | 63.905448 1.110766 1.000000 10.000000 13 | 67.273390 1.100466 1.000000 11.000000 14 | 73.397066 1.182740 1.000000 12.000000 15 | 76.326802 1.216704 1.000000 13.000000 16 | 77.069438 1.321304 1.000000 14.000000 17 | 69.062284 1.417715 1.000000 15.000000 18 | 60.748061 1.162331 1.000000 16.000000 19 | 64.499068 1.123995 1.000000 17.000000 20 | 69.120248 1.115507 1.000000 18.000000 21 | 63.973174 1.202806 1.000000 19.000000 22 | 64.451044 1.195180 1.000000 20.000000 23 | 62.669594 1.379852 1.000000 21.000000 24 | 68.471935 1.513985 1.000000 22.000000 25 | 77.861964 1.349168 1.000000 23.000000 26 | 73.011127 1.299344 1.000000 24.000000 27 | -------------------------------------------------------------------------------- /fast5_research/test/test_iterate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import unittest 4 | from uuid import uuid4 5 | 6 | from fast5_research import Fast5, iterate_fast5 7 | 8 | 9 | class IterateFiles(unittest.TestCase): 10 | def setUp(self): 11 | self.path = (os.path.join( 12 | os.path.dirname(__file__), 'data', 'recursive' 13 | )) 14 | 15 | def test_000_single_layer(self): 16 | fnames = list(iterate_fast5(self.path, paths=True)) 17 | self.assertEqual(len(fnames), 3) 18 | 19 | def test_001_recursive(self): 20 | fnames = list(iterate_fast5(self.path, paths=True, recursive=True)) 21 | self.assertEqual(len(fnames), 5) 22 | 23 | def test_002_from_file(self): 24 | tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4())) 25 | with open(tmp_file, 'w') as fh: 26 | fh.write('filename\tjunk\n') 27 | for i, fname in enumerate(iterate_fast5(self.path, paths=True)): 28 | fh.write('{}\t{}\n'.format(os.path.basename(fname), i)) 29 | fnames = list(iterate_fast5(self.path, paths=True, strand_list=tmp_file)) 30 | self.assertEqual(len(fnames), 3) 31 | 32 | 33 | if __name__ == "__main__": 34 | unittest.main() 35 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.5" 5 | dist: xenial 6 | 7 | before_install: 8 | - pip install --upgrade pip 9 | 10 | install: "pip install -r dev_requirements.txt" 11 | 12 | 13 | script: 14 | - pip install . 
15 | - nosetests 16 | 17 | 18 | before_deploy: 19 | - make docs 20 | 21 | 22 | deploy: 23 | - provider: pypi 24 | user: ontresearch 25 | password: 26 | secure: "vrNEDv4dw6FuVBRId3dC5F3FFFIgFP1AJ2PFpBNCQ2q3Qn5iUpTOUQDRHDFIcxvWMfunzOORUD/7M1f2l7x1fHSQ28L/peXVdTuVmiwHnkcPqtug+UOKysgd8K8X0cxcO2/0MOQoev+AFiOXf815Za4/GnA822NcXcyzhugTzTfyqWTyDoGXJ7b3i4Upkty+d2j+nBRpKl4N3mX040gKbDszuTUAqsjO433qJf8SXPH8SJdW2TcM0KsPWf5kvOflqMKb4CLHI5m4NpNLBjd56PnPdVA9czazlR8ZW584+zaYyW6yTtfgg+44WuxNDfXv48cnsCloqiQusCfsl3bQ4LKGk1gg8tTaVOGfD9TI7OBXpLR6dG9SPOaER9flq0gUR9AOjI6zNw2B98RzpOlm82nJIbOYrRUdVy66uZaOt7se3OeYG182k487lrfGHYw217Z3x/Ua0CFMmmp0+WXDBOkozEywEw1ScPi17oLi25nyUHOOyBRKo3Wa6pgGaOdK7SQOEGtmQxmKmB18KOhoKQWsS6sLSKLEv316YhbxDNevwFILZzQ86t3qhDYQkl6y+oYkl6Ha5DRl6jnCEQDAIMG++kmUZbGeqkfPJBy/XpsPePY3HiK0WQWwu37J1nvDWbayfdGtciuQ82bkX/XdymrCgcFSSFoMVB+z/HFqLMk=" 27 | on: 28 | tags: true 29 | python: "3.5" 30 | - provider: pages 31 | skip_cleanup: true 32 | github_token: $GHPAGES_TOKEN 33 | local_dir: docs/_build/html 34 | target_branch: gh-pages 35 | on: 36 | tags: true 37 | python: "3.5" 38 | 39 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to Fast5's documentation! 2 | ================================= 3 | 4 | This package comprises an API to HDF containers used by the research groups 5 | within Oxford Nanopore Technologies. It complements the 6 | `official API `_. Reading and 7 | writing of read files can be accomplished as well as reading of bulk .fast5 8 | files. 9 | 10 | .. admonition:: Research Release 11 | 12 | Research releases are provided as technology demonstrators to provide early 13 | access to features or stimulate Community development of tools. Support for 14 | this software will be minimal and is only provided directly by the developers. 15 | Feature requests, improvements, and discussions are welcome and can be 16 | implemented by forking and pull requests. However much as we would 17 | like to rectify every issue and piece of feedback users may have, the 18 | developers may have limited resource for support of this software. Research 19 | releases may be unstable and subject to rapid iteration by Oxford Nanopore 20 | Technologies. 21 | 22 | 23 | Installation 24 | ------------ 25 | 26 | The package can been installed via pip with: 27 | 28 | .. code-block:: bash 29 | 30 | pip install fast5_research 31 | 32 | See _`examples` for details of basic use. 33 | 34 | 35 | Contents 36 | -------- 37 | 38 | .. toctree:: 39 | :maxdepth: 2 40 | 41 | examples 42 | cmdline 43 | 44 | Full API reference 45 | ------------------ 46 | 47 | .. toctree:: 48 | :maxdepth: 3 49 | 50 | fast5_research 51 | 52 | 53 | Indices and tables 54 | ------------------ 55 | 56 | * :ref:`genindex` 57 | * :ref:`modindex` 58 | * :ref:`search` 59 | 60 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ### 2 | # This Makefile is simply for testing and making docs, to install 3 | # the project it should be sufficient to use python setup.py 4 | 5 | 6 | .PHONY: docs clean test test_py2 test_py3 7 | 8 | 9 | venv_py2/bin/activate: 10 | test -d venv_py2 || virtualenv venv_py2 --prompt '(fast5_py2) ' --python=python2 11 | . $@ && pip install pip --upgrade 12 | . $@ && pip install "setuptools<45" 13 | . $@ && pip install -r dev_requirements.txt 14 | . 
$@ && pip install -r requirements.txt; 15 | 16 | test_py2: venv_py2/bin/activate 17 | . $< && python setup.py nosetests 18 | 19 | 20 | venv_py3/bin/activate: 21 | test -d venv_py3 || virtualenv venv_py3 --prompt '(fast5_py3) ' --python=python3 22 | . $@ && pip install pip --upgrade 23 | . $@ && pip install -r dev_requirements.txt 24 | . $@ && pip install -r requirements.txt; 25 | 26 | test_py3: venv_py3/bin/activate 27 | . $< && python setup.py nosetests 28 | 29 | 30 | test: test_py2 test_py3 31 | 32 | clean: 33 | rm -rf build dist *.egg-info venv_* 34 | 35 | # You can set these variables from the command line. 36 | SPHINXOPTS = 37 | SPHINXBUILD = sphinx-build 38 | PAPER = 39 | BUILDDIR = _build 40 | 41 | # Internal variables. 42 | PAPEROPT_a4 = -D latex_paper_size=a4 43 | PAPEROPT_letter = -D latex_paper_size=letter 44 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 45 | 46 | DOCSRC = docs 47 | 48 | docs: venv_py3/bin/activate 49 | . $< && pip install sphinx sphinx_rtd_theme sphinx-argparse 50 | . $< && cd $(DOCSRC) && $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 51 | rm -rf docs/modules.rst docs/fast5_research.rst 52 | @echo 53 | @echo "Build finished. The HTML pages are in $(DOCSRC)/$(BUILDDIR)/html." 54 | touch $(DOCSRC)/$(BUILDDIR)/html/.nojekyll 55 | -------------------------------------------------------------------------------- /docs/cmdline.rst: -------------------------------------------------------------------------------- 1 | Command line Programs 2 | ===================== 3 | 4 | `fast5_research` comes with two commandline programs for conversion of sequencing 5 | read data. 6 | 7 | **extract_reads** - extracts reads from a bulk ``.fast5`` to either single- or multi-read 8 | ``.fast5``: 9 | 10 | .. code-block:: bash 11 | 12 | usage: extract_reads [-h] [--multi | --single] [--flat] [--by_id] 13 | [--prefix PREFIX] 14 | [--channel_range CHANNEL_RANGE CHANNEL_RANGE] 15 | [--workers WORKERS] [--limit LIMIT] 16 | input output 17 | 18 | Bulk .fast5 to read .fast5 conversion. 19 | 20 | positional arguments: 21 | input Bulk .fast5 file for input. 22 | output Output folder. 23 | 24 | optional arguments: 25 | -h, --help show this help message and exit 26 | --multi Output multi-read files. 27 | --single Output single-read files. 28 | --flat Create all .fast5 files in one directory 29 | --by_id Name single-read .fast5 files by read_id. 30 | --prefix PREFIX Read file prefix. 31 | --channel_range CHANNEL_RANGE CHANNEL_RANGE 32 | Channel range (inclusive). 33 | --workers WORKERS Number of worker processes. 34 | --limit LIMIT Limit reads per channel. 35 | 36 | 37 | **filter_reads** - extracts a subset of reads from a set of multi-read ``.fast5`` files. 38 | 39 | .. code-block:: bash 40 | 41 | usage: filter_reads [-h] [--tsv_field TSV_FIELD] [--multi | --single] 42 | [--prefix PREFIX] [--recursive] [--workers WORKERS] 43 | input output filter 44 | 45 | Extract reads from multi-read .fast5 files. 46 | 47 | positional arguments: 48 | input Path to input multi-read .fast5 files. 49 | output Output folder. 50 | filter A .tsv file with column `read_id` defining required 51 | reads. 52 | 53 | optional arguments: 54 | -h, --help show this help message and exit 55 | --tsv_field TSV_FIELD 56 | Field name from `filter` file to obtain read IDs. 57 | --multi Output multi-read files. 58 | --single Output single-read files. 59 | --prefix PREFIX Read file prefix. 60 | --recursive Search recursively under `input` for source files. 
61 | --workers WORKERS Number of worker processes. 62 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | Fast5 Examples 2 | ============== 3 | 4 | The following code snippets demonstrate basic IO using key features of the API. 5 | 6 | Read Files 7 | ---------- 8 | 9 | The library provides the `Fast5` class which extends `h5py.File` with methods 10 | for acquiring common datasets and attributes from files without requiring 11 | knowledge of the file structure. To read a file and obtain a useful summary: 12 | 13 | .. code-block:: python 14 | 15 | from fast5_research import Fast5 16 | 17 | filename='my.fast5' 18 | 19 | with Fast5(filename) as fh: 20 | raw = fh.get_read(raw=True) 21 | summary = fh.summary() 22 | print('Raw is {} samples long.'.format(len(raw))) 23 | print('Summary {}.'.format(summary)) 24 | 25 | Note that in this example the raw data will be provided in pA. 26 | 27 | The library also allows writing of files which are conformant with Oxford 28 | Nanopore Technologies' software. Certain metadata are required, and the 29 | library will enforce that they are present: 30 | 31 | .. code-block:: python 32 | 33 | from uuid import uuid4 34 | import numpy as np 35 | from fast5_research import Fast5 36 | filename='my_new.fast5' 37 | mean, stdv, n = 40.0, 2.0, 10000 38 | raw_data = np.random.laplace(mean, stdv/np.sqrt(2), int(n)) 39 | 40 | # example of how to digitize data 41 | start, stop = int(min(raw_data - 1)), int(max(raw_data + 1)) 42 | rng = stop - start 43 | digitisation = 8192.0 44 | bins = np.arange(start, stop, rng / digitisation) 45 | # np.int16 is required, the library will refuse to write anything else 46 | raw_data = np.digitize(raw_data, bins).astype(np.int16) 47 | 48 | # The following are required meta data 49 | channel_id = { 50 | 'digitisation': digitisation, 51 | 'offset': 0, 52 | 'range': rng, 53 | 'sampling_rate': 4000, 54 | 'channel_number': 1, 55 | } 56 | read_id = { 57 | 'start_time': 0, 58 | 'duration': len(raw_data), 59 | 'read_number': 1, 60 | 'start_mux': 1, 61 | 'read_id': str(uuid4()), 62 | 'scaling_used': 1, 63 | 'median_before': 0, 64 | } 65 | tracking_id = { 66 | 'exp_start_time': '1970-01-01T00:00:00Z', 67 | 'run_id': str(uuid4()).replace('-',''), 68 | 'flow_cell_id': 'FAH00000', 69 | } 70 | context_tags = {} 71 | 72 | with Fast5.New(filename, 'w', tracking_id=tracking_id, context_tags=context_tags, channel_id=channel_id) as h: 73 | h.set_raw(raw_data, meta=read_id, read_number=1) 74 | 75 | 76 | Bulk Files 77 | ---------- 78 | 79 | The library exposes data within bulk `.fast5` files through the `BulkFast5` class: 80 | 81 | .. code-block:: python 82 | 83 | from fast5_research import BulkFast5 84 | 85 | filename = 'my_bulk.fast5' 86 | channel = 100 87 | samples = [1000, 100000] 88 | 89 | with BulkFast5(filename) as fh: 90 | raw = fh.get_raw(channel, raw_indices=samples) 91 | multiplexer_changes = fh.get_mux_changes_in_window( 92 | channel, raw_indices=samples) 93 | 94 | The `BulkFast5` class provides in-memory caching of many intermediate results, 95 | to optimize repeated calls to the same methods.
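As a further illustration, the following is a minimal sketch combining several of the accessors exercised by the package's own test suite (`channels`, `sample_rate`, `get_states_in_window` and `get_raw`) to summarise channel states within a time window. The file name and the window values are placeholders; exact signatures can be confirmed in the full API reference:

.. code-block:: python

    from fast5_research import BulkFast5

    filename = 'my_bulk.fast5'
    window = (10.0, 20.0)  # window of interest, in seconds

    with BulkFast5(filename) as fh:
        # convert the time window to raw sample indices once
        indices = tuple(int(t * fh.sample_rate) for t in window)
        for channel in fh.channels:
            # channel states recorded in the bulk file for this window
            states = fh.get_states_in_window(channel, raw_indices=indices)
            # raw current samples for the same window
            raw = fh.get_raw(channel, raw_indices=indices)
            print('Channel {}: {} samples, states: {}'.format(
                channel, len(raw), ', '.join(states)))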
96 | 97 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | from setuptools import setup, find_packages 5 | 6 | 7 | __pkg_name__ = 'fast5_research' 8 | __author__ = 'cwright' 9 | __description__ = 'ONT Research .fast5 read/write API.' 10 | 11 | # Use readme as long description and say its github-flavour markdown 12 | from os import path 13 | this_directory = path.abspath(path.dirname(__file__)) 14 | kwargs = {'encoding':'utf-8'} if sys.version_info.major == 3 else {} 15 | with open(path.join(this_directory, 'README.md'), **kwargs) as f: 16 | __long_description__ = f.read() 17 | __long_description_content_type__ = 'text/markdown' 18 | 19 | 20 | # Get the version number from __init__.py 21 | verstrline = open(os.path.join(__pkg_name__, '__init__.py'), 'r').read() 22 | vsre = r"^__version__ = ['\"]([^'\"]*)['\"]" 23 | mo = re.search(vsre, verstrline, re.M) 24 | if mo: 25 | __version__ = mo.group(1) 26 | else: 27 | raise RuntimeError('Unable to find version string in "{}/__init__.py".'.format(__pkg_name__)) 28 | 29 | dir_path = os.path.dirname(__file__) 30 | with open(os.path.join(dir_path, 'requirements.txt')) as fh: 31 | install_requires = [ 32 | r.split('#')[0].strip() 33 | for r in fh.read().splitlines() if not r.strip().startswith('#') 34 | ] 35 | 36 | extra_requires={} 37 | 38 | py2only_requirements = ['futures'] 39 | if len(py2only_requirements) > 0: 40 | extra_requires[':python_version == "2.7"'] = [] 41 | 42 | for requirement in py2only_requirements: 43 | install_requires.remove(requirement) 44 | extra_requires[':python_version == "2.7"'].append(requirement) 45 | 46 | 47 | setup( 48 | name=__pkg_name__, 49 | version=__version__, 50 | url='https://github.com/nanoporetech/{}'.format(__pkg_name__), 51 | author=__author__, 52 | author_email='{}@nanoporetech.com'.format(__author__), 53 | description=__description__, 54 | long_description=__long_description__, 55 | long_description_content_type=__long_description_content_type__, 56 | entry_points={ 57 | 'console_scripts': [ 58 | 'index_reads = {}.extract:build_read_index'.format(__pkg_name__), 59 | 'extract_reads = {}.extract:extract_reads'.format(__pkg_name__), 60 | 'read_summary = {}.extract:extract_read_summary'.format(__pkg_name__), 61 | 'filter_reads = {}.extract:filter_multi_reads'.format(__pkg_name__), 62 | 'filter_from_bam = {}.extract:filter_file_from_bam'.format(__pkg_name__), 63 | ] 64 | }, 65 | license='Mozilla Public License 2.0', 66 | dependency_links=[], 67 | install_requires=install_requires, 68 | tests_require=['nose>=1.3.7'].extend(install_requires), 69 | extras_require=extra_requires, 70 | packages=find_packages(exclude=['*.test', '*.test.*', 'test.*', 'test']), 71 | package_data={}, 72 | zip_safe=True, 73 | classifiers=[ 74 | 'Development Status :: 4 - Beta', 75 | 'Intended Audience :: Science/Research', 76 | 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)', 77 | 'Natural Language :: English', 78 | 'Programming Language :: Python', 79 | 'Topic :: Software Development :: Libraries :: Python Modules', 80 | 'Topic :: Scientific/Engineering :: Bio-Informatics' 81 | ], 82 | keywords='ONT Research fast5 API', 83 | ) 84 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | v1.2.23 2 | ------- 3 | * Build using 
python3.5 4 | * Fix bug with shuffle option within fast5.py 5 | * Allow `None` as an input to BulkFast5.get_raw 6 | * Allow summary file to use `strand_duration` instead of `strand` as column name 7 | 8 | v1.2.22 9 | ------- 10 | * Update way reads classifications are parsed from the bulk to use modal classification as Minknow now does. 11 | 12 | v1.2.21 13 | ------- 14 | * Add filter_from_bam command to create filter tsv file from BAM and sequencing summary file. 15 | 16 | v1.2.20 17 | ------- 18 | * Add `index_reads` program to build read_id->file index tsv file. 19 | 20 | v1.2.19 21 | ------- 22 | * Add program to produce read summary text file from Bulk .fast5. 23 | 24 | v1.2.18 25 | ------- 26 | * Allow `extract_reads` to extract only reads present in a given read_summary.txt 27 | 28 | v1.2.17 29 | ------- 30 | * Allow `filter_reads` to be given full filename/read_id information 31 | 32 | v1.2.16 33 | ------- 34 | * Fix bug in `filter_reads` resulting in the last worker's reads not being written. 35 | 36 | v1.2.15 37 | ------- 38 | * Calculate drift from read table as it will no longer be present in the read table. 39 | 40 | v1.2.14 41 | ------- 42 | * Fix issue affecting conversion of |u1 dtypes 43 | 44 | v1.2.13 45 | ------- 46 | * Use filename if possible when extracting reads 47 | 48 | v1.2.12 49 | ------- 50 | * Clear up a deprecation warning: https://github.com/nanoporetech/fast5_research/issues/30 51 | 52 | v1.2.11 53 | ------- 54 | * Add `filter_reads` program to extract a subset of reads from multi-reads. 55 | 56 | v1.2.10 57 | ------- 58 | * Minor syntax fix in `extract.py` for python2 59 | 60 | v1.2.9 61 | ------ 62 | * Add basic support for creation of multi-read files from bulk files. 63 | 64 | v1.2.8 65 | ------ 66 | * Small refactor of writing of mapping data. 67 | 68 | v1.2.6 69 | ------ 70 | * Fix slow creation of mapping table 71 | 72 | v1.2.5 73 | ------ 74 | * Ensure event structures containing text data are returned as strings rather than bytes under python3. 75 | 76 | v1.2.3 77 | ------ 78 | * Fixes issue with numpy 1.15 on reading type of views of structured data. 79 | * Updated documentation (https://nanoporetech.github.io/fast5_research/) 80 | 81 | v1.2.2 82 | ------ 83 | * Conversion from bulk to reads. 84 | * Require numpy >= 1.14. 85 | * A bit more python3 bytes cleaning. 86 | * Enforce types in raw, and required tracking_id attributes. 87 | 88 | v1.1.0 89 | ------ 90 | * Python3 compatibility changes 91 | * Add data cleaning steps for stringly types 92 | * Unpin numpy version 93 | 94 | v1.0.12 95 | ------- 96 | * Enforce some typing constraints on meta data for compatibility with some basecallers. 97 | 98 | v1.0.11 99 | ------- 100 | * Ignore h5py warnings on import 101 | 102 | v1.0.10 103 | ------- 104 | * Fix bug finding attributes when EventDetection not present 105 | 106 | v1.0.8 107 | ------ 108 | * Easy import of core classes and functions: 109 | `from fast5_research import Fast5, BulkFast5, iterate_fast5` 110 | * Enable recursive (lazy) search in `iterate_fast5`. 
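  A minimal sketch of the recursive search (keyword arguments as exercised in `test_iterate.py`; the input directory is a placeholder):

      from fast5_research import iterate_fast5

      # lazily yield paths to every .fast5 file found under 'reads/',
      # descending into subdirectories
      for fname in iterate_fast5('reads/', paths=True, recursive=True):
          print(fname)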
111 | 112 | v1.0.9 113 | ------ 114 | * Fix itertools import 115 | 116 | v1.0.6 117 | ------ 118 | * Ensure returned events have same dtype 119 | * fast5.py: all returned event arrays same dtype by passing them through self._get_read_data() 120 | * requirements: use any version of numpy 121 | * bump version to 1.0.6 122 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Oxford Nanopore Technologies logo](https://github.com/nanoporetech/medaka/raw/master/images/ONT_logo_590x106.png) 2 | 3 | 4 | fast5_research 5 | ============== 6 | 7 | [![Build Status](https://travis-ci.org/nanoporetech/fast5_research.svg?branch=master)](https://travis-ci.org/nanoporetech/fast5_research) 8 | 9 | Python fast5 reading and writing functionality provided by ONT Research. 10 | 11 | © 2018 Oxford Nanopore Technologies Ltd. 12 | 13 | Features 14 | -------- 15 | 16 | * Read interface to bulk `.fast5` files for extracting reads, channel states, voltage, ... 17 | * Read/Write interface to single read files guaranteeing conformity. 18 | * Works on Linux, MacOS, and Windows. 19 | * Open source (Mozilla Public License 2.0). 20 | 21 | Documentation can be found at https://nanoporetech.github.io/fast5_research/. 22 | 23 | Installation 24 | ------------ 25 | 26 | `fast5_research` is available from PyPI and can be installed with pip: 27 | 28 | pip install fast5_research 29 | 30 | 31 | Usage 32 | ----- 33 | 34 | Full documentation can be found at the link above; below are two simple examples. 35 | 36 | To read a file: 37 | 38 | from fast5_research import Fast5 39 | 40 | filename='my.fast5' 41 | 42 | with Fast5(filename) as fh: 43 | raw = fh.get_read(raw=True) 44 | summary = fh.summary() 45 | print('Raw is {} samples long.'.format(len(raw))) 46 | print('Summary {}.'.format(summary)) 47 | 48 | When writing a file, the library will check the given metadata, ensure that all required 49 | values are present, and convert all values to their defined types. 50 | 51 | from uuid import uuid4 52 | import numpy as np 53 | from fast5_research import Fast5 54 | 55 | filename='my_new.fast5' 56 | mean, stdv, n = 40.0, 2.0, 10000 57 | raw_data = np.random.laplace(mean, stdv/np.sqrt(2), int(n)) 58 | 59 | # example of how to digitize data 60 | start, stop = int(min(raw_data - 1)), int(max(raw_data + 1)) 61 | rng = stop - start 62 | digitisation = 8192.0 63 | bins = np.arange(start, stop, rng / digitisation) 64 | # np.int16 is required, the library will refuse to write anything else 65 | raw_data = np.digitize(raw_data, bins).astype(np.int16) 66 | 67 | # The following are required meta data 68 | channel_id = { 69 | 'digitisation': digitisation, 70 | 'offset': 0, 71 | 'range': rng, 72 | 'sampling_rate': 4000, 73 | 'channel_number': 1, 74 | } 75 | read_id = { 76 | 'start_time': 0, 77 | 'duration': len(raw_data), 78 | 'read_number': 1, 79 | 'start_mux': 1, 80 | 'read_id': str(uuid4()), 81 | 'scaling_used': 1, 82 | 'median_before': 0, 83 | } 84 | tracking_id = { 85 | 'exp_start_time': '1970-01-01T00:00:00Z', 86 | 'run_id': str(uuid4()).replace('-',''), 87 | 'flow_cell_id': 'FAH00000', 88 | } 89 | context_tags = {} 90 | 91 | with Fast5.New(filename, 'w', tracking_id=tracking_id, context_tags=context_tags, channel_id=channel_id) as h: 92 | h.set_raw(raw_data, meta=read_id, read_number=1) 93 | 94 | 95 | Help 96 | ---- 97 | 98 | **Licence and Copyright** 99 | 100 | © 2018 Oxford Nanopore Technologies Ltd.
101 | 102 | `fast5_research` is distributed under the terms of the Mozilla Public License 2.0. 103 | 104 | **Research Release** 105 | 106 | Research releases are provided as technology demonstrators to provide early 107 | access to features or stimulate Community development of tools. Support for 108 | this software will be minimal and is only provided directly by the developers. 109 | Feature requests, improvements, and discussions are welcome and can be 110 | implemented by forking and pull requests. However much as we would 111 | like to rectify every issue and piece of feedback users may have, the 112 | developers may have limited resource for support of this software. Research 113 | releases may be unstable and subject to rapid iteration by Oxford Nanopore 114 | Technologies. 115 | -------------------------------------------------------------------------------- /fast5_research/test/test_fast5_basecalling_and_mapping.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import os 3 | import sys 4 | import tempfile 5 | import unittest 6 | 7 | import numpy as np 8 | import numpy.testing as nptest 9 | 10 | from fast5_research import Fast5 11 | from fast5_research.util import _sanitize_data_for_reading 12 | 13 | class Fast5BasecallerAndMapper(unittest.TestCase): 14 | 15 | @classmethod 16 | def get_file_path(self,filename): 17 | file_path = os.path.join(os.path.dirname(__file__), 'data', filename) 18 | return file_path 19 | 20 | @classmethod 21 | def setUpClass(self): 22 | """Create a read fast5 from scratch with previously simulated mapping and basecall 1D data""" 23 | print('* Fast5 Basecaller and Mapper') 24 | 25 | self.seq = 'CATTACGCATTTACCGAAACCTGGGCAAA' 26 | self.qstring = '!'*len(self.seq) 27 | self.model_file = 'example_template.model' 28 | self.events_file = 'example_template.events' 29 | self.model_file = 'example_template.model' 30 | self.bc_scale_file = 'example_template.bc_scale' 31 | self.bc_path_file = 'example_template.bc_path' 32 | self.map_scale_file = 'example_template.map_scale' 33 | self.map_path_file = 'example_template.map_path' 34 | self.map_post_file = 'example_template.map_post' 35 | self.ref_name = 'test_seq' 36 | self.npstr_dtype = 'U' 37 | if sys.version_info[0] < 3: 38 | self.npstr_dtype = 'S' 39 | 40 | # Open new file 41 | header = ['channel_number', 'offset', 'range', 'digitisation', 'sampling_rate'] 42 | channel_id = {x:0 for x in header} 43 | tracking_id = { 44 | 'exp_start_time': '1970-01-00T00:00:00Z', 45 | 'run_id': 'a'*32, 46 | 'flow_cell_id': 'FAH00000', 47 | } 48 | fakefile = tempfile.NamedTemporaryFile() 49 | self.fh = Fast5.New(fakefile.name, channel_id=channel_id, tracking_id=tracking_id, read='a') 50 | 51 | # load data to set within fast5 file 52 | self.model = np.genfromtxt(self.get_file_path(self.model_file), dtype=None, delimiter='\t', names=True, encoding='utf8') 53 | 54 | self.model['kmer'] = self.model['kmer'].astype(str) 55 | 56 | self.events = np.genfromtxt(self.get_file_path(self.events_file), dtype=None, delimiter='\t', names=True) 57 | 58 | # use namedtuple to imitate a Scale object 59 | Scale = namedtuple('Scale', ['shift', 'scale', 'drift', 'var', 'scale_sd', 'var_sd']) 60 | 61 | bc_scale = Scale(*np.genfromtxt(self.get_file_path(self.bc_scale_file), dtype=None, delimiter='\t')) 62 | bc_path = np.genfromtxt(self.get_file_path(self.bc_path_file), dtype=np.int32, delimiter='\t') 63 | 64 | self.fh.set_basecall_data(self.events, bc_scale, bc_path, self.model,
self.seq) 65 | 66 | map_scale = Scale(*np.genfromtxt(self.get_file_path(self.map_scale_file), dtype=None, delimiter='\t')) 67 | map_path = np.genfromtxt(self.get_file_path(self.map_path_file), dtype=np.int32, delimiter='\t') 68 | map_post = np.genfromtxt(self.get_file_path(self.map_post_file), delimiter='\t') 69 | 70 | n_states = len(self.seq) - len(self.model['kmer'][0]) + 1 71 | self.fh.set_mapping_data(self.events, map_scale, map_path, self.model, self.seq, self.ref_name) 72 | self.fh.set_mapping_data(self.events, map_scale, map_path, self.model, self.seq, self.ref_name, post=map_post) 73 | 74 | @classmethod 75 | def tearDownClass(self): 76 | self.fh.close() 77 | 78 | def test_000_basic_folder_structure(self): 79 | """Test root folder structure creation""" 80 | 81 | self.assertEqual(list(self.fh.keys()), ['Analyses', 'UniqueGlobalKey']) 82 | self.assertEqual(list(self.fh['/Analyses'].keys()), ['Basecall_1D_000', 'Squiggle_Map_000', 'Squiggle_Map_001']) 83 | 84 | def test_005_basecall_1d_folder_structure(self): 85 | """Test basecall 1d folder structure creation""" 86 | 87 | self.assertEqual(list(self.fh['/Analyses/Basecall_1D_000'].keys()), ['BaseCalled_template', 'Summary']) 88 | self.assertEqual(list(self.fh['/Analyses/Basecall_1D_000/BaseCalled_template'].keys()), ['Events', 'Fastq', 'Model']) 89 | 90 | def test_010_mapping_folder_structure(self): 91 | """Test mapping structure creation""" 92 | 93 | self.assertEqual(list(self.fh['/Analyses/Squiggle_Map_000'].keys()), ['SquiggleMapped_template', 'Summary']) 94 | self.assertEqual(list(self.fh['/Analyses/Squiggle_Map_000/SquiggleMapped_template'].keys()), ['Events', 'Model']) 95 | self.assertEqual(list(self.fh['/Analyses/Squiggle_Map_000/Summary'].keys()), ['squiggle_map_template']) 96 | 97 | self.assertEqual(list(self.fh['/Analyses/Squiggle_Map_001'].keys()), ['SquiggleMapped_template', 'Summary']) 98 | self.assertEqual(list(self.fh['/Analyses/Squiggle_Map_001/SquiggleMapped_template'].keys()), ['Events', 'Model']) 99 | self.assertEqual(list(self.fh['/Analyses/Squiggle_Map_001/Summary'].keys()), ['squiggle_map_template']) 100 | 101 | def test_015_fastq(self): 102 | """ Test fastq assembly and writing """ 103 | 104 | fastq = '@unknown\n{}\n+\n{}\n'.format(self.seq, self.qstring) 105 | self.assertEqual(_sanitize_data_for_reading(self.fh['/Analyses/Basecall_1D_000/BaseCalled_template/Fastq'][()]), fastq) 106 | 107 | def test_020_basecall_1d_event_writing(self): 108 | """Test basecall event writing""" 109 | 110 | input_events = self.events['mean'] 111 | output_events = self.fh['/Analyses/Basecall_1D_000/BaseCalled_template/Events']['mean'][()] 112 | nptest.assert_array_equal(input_events, output_events) 113 | 114 | def test_025_basecall_1d_event_reading(self): 115 | """Test basecall event reading with the getter function""" 116 | 117 | input_events = self.events['mean'] 118 | output_events = self.fh.get_basecall_data()['mean'] 119 | nptest.assert_array_equal(input_events, output_events) 120 | 121 | def test_030_mapping_event_writing(self): 122 | """Test mapping event writing""" 123 | 124 | input_events = self.events['mean'] 125 | output_events = self.fh['/Analyses/Squiggle_Map_000/SquiggleMapped_template/Events']['mean'][()] 126 | output_events_with_post = self.fh['/Analyses/Squiggle_Map_001/SquiggleMapped_template/Events']['mean'][()] 127 | 128 | nptest.assert_array_equal(input_events, output_events) 129 | nptest.assert_array_equal(input_events, output_events_with_post) 130 | 131 | def test_035_mapping_event_reading(self): 132 | """Test 
mapping event reading with the getter function""" 133 | 134 | input_means = self.events['mean'] 135 | events = self.fh.get_mapping_data() 136 | nptest.assert_array_equal(input_means, events['mean']) 137 | self.assertEqual(events['kmer'].dtype, np.dtype('|{}5'.format(self.npstr_dtype))) 138 | 139 | def test_036_mapping_event_reading_any(self): 140 | """Test mapping event reading with the I don't care function""" 141 | 142 | input_means = self.events['mean'] 143 | events = self.fh.get_mapping_data() 144 | nptest.assert_array_equal(input_means, events['mean']) 145 | self.assertEqual(events['kmer'].dtype, np.dtype('|{}5'.format(self.npstr_dtype))) 146 | 147 | 148 | if __name__ == '__main__': 149 | unittest.main() 150 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is execfile()d with the current directory set to its containing dir. 4 | # 5 | # Note that not all possible configuration values are present in this 6 | # autogenerated file. 7 | # 8 | # All configuration values have a default; values that are commented out 9 | # serve to show the default. 10 | 11 | import sys, os, re, subprocess 12 | import sphinx_rtd_theme 13 | 14 | # If extensions (or modules to document with autodoc) are in another directory, 15 | # add these directories to sys.path here. If the directory is relative to the 16 | # documentation root, use os.path.abspath to make it absolute, like shown here. 17 | sys.path.insert(0, os.path.abspath('..')) 18 | 19 | # -- General configuration ----------------------------------------------------- 20 | 21 | # If your documentation needs a minimal Sphinx version, state it here. 22 | #needs_sphinx = '1.0' 23 | 24 | # Add any Sphinx extension module names here, as strings. They can be extensions 25 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 26 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.intersphinx', 27 | 'sphinx.ext.mathjax'] 28 | mathjax_path = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | __pkg_name__ = u'fast5_research' 44 | project = __pkg_name__.capitalize() 45 | copyright = u'2017, Oxford Nanopore Technologies' 46 | 47 | # Generate API documentation: 48 | if subprocess.call(['sphinx-apidoc', '-o', './', "../{}".format(__pkg_name__)]) != 0: 49 | sys.stderr.write('Failed to generate API documentation!\n') 50 | 51 | # The version info for the project you're documenting, acts as replacement for 52 | # |version| and |release|, also used in various other places throughout the 53 | # built documents. 54 | # 55 | 56 | # Get the version number from __init__.py 57 | verstrline = open(os.path.join('..', __pkg_name__, '__init__.py'), 'r').read() 58 | vsre = r"^__version__ = ['\"]([^'\"]*)['\"]" 59 | mo = re.search(vsre, verstrline, re.M) 60 | if mo: 61 | __version__ = mo.group(1) 62 | else: 63 | raise RuntimeError('Unable to find version string in "{}/__init__.py".'.format(__pkg_name__)) 64 | 65 | # The short X.Y version. 
66 | version = __version__ 67 | # The full version, including alpha/beta/rc tags. 68 | release = __version__ 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | #language = None 73 | 74 | # There are two options for replacing |today|: either, you set today to some 75 | # non-false value, then it is used: 76 | #today = '' 77 | # Else, today_fmt is used as the format for a strftime call. 78 | #today_fmt = '%B %d, %Y' 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | exclude_patterns = ['_build','*test*'] 83 | 84 | # The reST default role (used for this markup: `text`) to use for all documents. 85 | #default_role = None 86 | 87 | # If true, '()' will be appended to :func: etc. cross-reference text. 88 | #add_function_parentheses = True 89 | 90 | # If true, the current module name will be prepended to all description 91 | # unit titles (such as .. function::). 92 | #add_module_names = True 93 | 94 | # If true, sectionauthor and moduleauthor directives will be shown in the 95 | # output. They are ignored by default. 96 | #show_authors = False 97 | 98 | # The name of the Pygments (syntax highlighting) style to use. 99 | pygments_style = 'sphinx' 100 | 101 | # A list of ignored prefixes for module index sorting. 102 | #modindex_common_prefix = [] 103 | 104 | 105 | # -- Options for HTML output --------------------------------------------------- 106 | 107 | # The theme to use for HTML and HTML Help pages. See the documentation for 108 | # a list of builtin themes. 109 | html_theme = 'sphinx_rtd_theme' 110 | 111 | # Theme options are theme-specific and customize the look and feel of a theme 112 | # further. For a list of options available for each theme, see the 113 | # documentation. 114 | #html_theme_options = {} 115 | 116 | # Add any paths that contain custom themes here, relative to this directory. 117 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 118 | 119 | # The name for this set of Sphinx documents. If None, it defaults to 120 | # " v documentation". 121 | #html_title = None 122 | 123 | # A shorter title for the navigation bar. Default is the same as html_title. 124 | #html_short_title = None 125 | 126 | # The name of an image file (relative to this directory) to place at the top 127 | # of the sidebar. 128 | #html_logo = None 129 | 130 | # The name of an image file (within the static path) to use as favicon of the 131 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 132 | # pixels large. 133 | #html_favicon = None 134 | 135 | # Add any paths that contain custom static files (such as style sheets) here, 136 | # relative to this directory. They are copied after the builtin static files, 137 | # so a file named "default.css" will overwrite the builtin "default.css". 138 | #html_static_path = ['_static'] 139 | 140 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 141 | # using the given strftime format. 142 | #html_last_updated_fmt = '%b %d, %Y' 143 | 144 | # If true, SmartyPants will be used to convert quotes and dashes to 145 | # typographically correct entities. 146 | #html_use_smartypants = True 147 | 148 | # Custom sidebar templates, maps document names to template names. 149 | #html_sidebars = {} 150 | 151 | # Additional templates that should be rendered to pages, maps page names to 152 | # template names. 
153 | #html_additional_pages = {} 154 | 155 | # If false, no module index is generated. 156 | #html_domain_indices = True 157 | 158 | # If false, no index is generated. 159 | #html_use_index = True 160 | 161 | # If true, the index is split into individual pages for each letter. 162 | #html_split_index = False 163 | 164 | # If true, links to the reST sources are added to the pages. 165 | #html_show_sourcelink = True 166 | 167 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 168 | #html_show_sphinx = True 169 | 170 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 171 | #html_show_copyright = True 172 | 173 | # If true, an OpenSearch description file will be output, and all pages will 174 | # contain a tag referring to it. The value of this option must be the 175 | # base URL from which the finished HTML is served. 176 | #html_use_opensearch = '' 177 | 178 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 179 | #html_file_suffix = None 180 | 181 | # Output file base name for HTML help builder. 182 | htmlhelp_basename = '{}doc'.format(project) 183 | 184 | 185 | # -- Options for LaTeX output -------------------------------------------------- 186 | 187 | latex_elements = { 188 | # The paper size ('letterpaper' or 'a4paper'). 189 | #'papersize': 'letterpaper', 190 | 191 | # The font size ('10pt', '11pt' or '12pt'). 192 | #'pointsize': '10pt', 193 | 194 | # Additional stuff for the LaTeX preamble. 195 | #'preamble': '', 196 | } 197 | 198 | # Grouping the document tree into LaTeX files. List of tuples 199 | # (source start file, target name, title, author, documentclass [howto/manual]). 200 | latex_documents = [ 201 | ('index', '{}.tex'.format(project), u'{} Documentation'.format(project), 202 | u'Oxford Nanopore Technologies', 'manual'), 203 | ] 204 | 205 | # The name of an image file (relative to this directory) to place at the top of 206 | # the title page. 207 | #latex_logo = None 208 | 209 | # For "manual" documents, if this is true, then toplevel headings are parts, 210 | # not chapters. 211 | #latex_use_parts = False 212 | 213 | # If true, show page references after internal links. 214 | #latex_show_pagerefs = False 215 | 216 | # If true, show URL addresses after external links. 217 | #latex_show_urls = False 218 | 219 | # Documents to append as an appendix to all manuals. 220 | #latex_appendices = [] 221 | 222 | # If false, no module index is generated. 223 | #latex_domain_indices = True 224 | 225 | 226 | # -- Options for manual page output -------------------------------------------- 227 | 228 | # One entry per manual page. List of tuples 229 | # (source start file, name, description, authors, manual section). 230 | man_pages = [ 231 | ('index', project, u'{} Documentation'.format(project), 232 | [u'Oxford Nanopore Technologies'], 1) 233 | ] 234 | 235 | # If true, show URL addresses after external links. 236 | #man_show_urls = False 237 | 238 | 239 | # -- Options for Texinfo output ------------------------------------------------ 240 | 241 | # Grouping the document tree into Texinfo files. List of tuples 242 | # (source start file, target name, title, author, 243 | # dir menu entry, description, category) 244 | texinfo_documents = [ 245 | ('index', project, u'{} Documentation'.format(project), 246 | u'Oxford Nanopore Technologies', project, 'One line description of project.', 247 | 'Miscellaneous'), 248 | ] 249 | 250 | # Documents to append as an appendix to all manuals. 
251 | #texinfo_appendices = [] 252 | 253 | # If false, no module index is generated. 254 | #texinfo_domain_indices = True 255 | 256 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 257 | #texinfo_show_urls = 'footnote' 258 | 259 | 260 | # -- Options for Epub output --------------------------------------------------- 261 | 262 | # Bibliographic Dublin Core info. 263 | epub_title = project 264 | epub_author = u'Oxford Nanopore Technologies' 265 | epub_publisher = u'Oxford Nanopore Technologies' 266 | epub_copyright = u'2017, Oxford Nanopore Technologies' 267 | 268 | # The language of the text. It defaults to the language option 269 | # or en if the language is not set. 270 | #epub_language = '' 271 | 272 | # The scheme of the identifier. Typical schemes are ISBN or URL. 273 | #epub_scheme = '' 274 | 275 | # The unique identifier of the text. This can be a ISBN number 276 | # or the project homepage. 277 | #epub_identifier = '' 278 | 279 | # A unique identification for the text. 280 | #epub_uid = '' 281 | 282 | # A tuple containing the cover image and cover page html template filenames. 283 | #epub_cover = () 284 | 285 | # HTML files that should be inserted before the pages created by sphinx. 286 | # The format is a list of tuples containing the path and title. 287 | #epub_pre_files = [] 288 | 289 | # HTML files shat should be inserted after the pages created by sphinx. 290 | # The format is a list of tuples containing the path and title. 291 | #epub_post_files = [] 292 | 293 | # A list of files that should not be packed into the epub file. 294 | #epub_exclude_files = [] 295 | 296 | # The depth of the table of contents in toc.ncx. 297 | #epub_tocdepth = 3 298 | 299 | # Allow duplicate toc entries. 300 | #epub_tocdup = True 301 | -------------------------------------------------------------------------------- /fast5_research/test/test_fast5.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import types 4 | import unittest 5 | from uuid import uuid4 6 | 7 | import h5py 8 | import numpy as np 9 | 10 | from fast5_research import Fast5 11 | 12 | class Fast5API(unittest.TestCase): 13 | test_file = 'example_basecall_squiggle_mapping.fast5' 14 | additional_file = 'additional_test_file.fast5' 15 | 16 | def setUp(self): 17 | self.h = Fast5(os.path.join( 18 | os.path.dirname(__file__), 'data', self.test_file 19 | )) 20 | 21 | 22 | self.additional_h = Fast5(os.path.join( 23 | os.path.dirname(__file__), 'data', self.additional_file 24 | )) 25 | 26 | # Use to create new temp files 27 | self.tmp_events_float = np.array( 28 | [(0.0, 1.0, 10.0, 2.0)], 29 | dtype=[(x, 'float') for x in ['start','length', 'mean', 'stdv']] 30 | ) 31 | self.tmp_events_int = np.array( 32 | [(0, 5000, 10.0, 2.0)], 33 | dtype=[ 34 | ('start', 'uint32'), ('length', 'uint32'), 35 | ('mean', 'float'), ('stdv', 'float') 36 | ] 37 | ) 38 | self.tmp_raw = np.ones(15, dtype=np.int16) 39 | 40 | self.tmp_channel_id = { 41 | 'channel_number': 1, 42 | 'range': 1.0, 43 | 'digitisation': 1.0, 44 | 'offset': 0.0, 45 | 'sample_rate': 5000.0, 46 | 'sampling_rate': 5000.0 47 | } 48 | self.tmp_read_id = { 49 | 'start_time': 0.0, 50 | 'duration': 1.0, 51 | 'read_number': 1, 52 | 'start_mux': 1, 53 | 'read_id': str(uuid4()), 54 | 'scaling_used': 1, 55 | 'median_before': 0 56 | } 57 | self.tmp_tracking_id = { 58 | 'exp_start_time': '1970-01-01T00:00:00Z', 59 | 'run_id': str(uuid4()).replace('-',''), 60 | 'flow_cell_id': 'FAH00000', 61 | } 62 | 63 | 64 | def 
tearDown(self): 65 | self.h.close() 66 | self.additional_h.close() 67 | 68 | @classmethod 69 | def setUpClass(self): 70 | print('* Fast5 API') 71 | 72 | 73 | def test_000_basic_functions(self): 74 | # Just test an inherited member 75 | self.assertEqual( 76 | os.path.basename(self.h.filename), self.test_file, 77 | 'Inherited member attribute not correct.' 78 | ) 79 | 80 | # We shouldn't be writable by default 81 | self.assertFalse(self.h.writable, 'File is not non-writable by default.') 82 | 83 | def test_010_get_meta(self): 84 | self.assertSetEqual( 85 | set(self.h.attributes.keys()), 86 | { 87 | 'scaling_used', 'median_before', 88 | 'start_time', 'read_number', 89 | 'abasic_found', 'duration', 'start_mux' 90 | }, 91 | '.attributes does not contain expected fields.' 92 | ) 93 | 94 | self.assertSetEqual( 95 | set(self.h.channel_meta.keys()), 96 | { 97 | 'channel_number', 'range', 'offset', 98 | 'digitisation', 'sampling_rate', 99 | }, 100 | '.channel_meta does not contain expected fields.' 101 | ) 102 | 103 | self.assertTrue( 104 | { 105 | 'strand_duration', 'pore_before', 'abasic', 106 | 'start_time', 'mux', 'channel', 'filename' 107 | }.issubset(self.h.summary().keys()), 108 | '.summary does not contain expected fields.' 109 | ) 110 | 111 | # Duration and start_time should be int, not float (samples, not times) 112 | for key in ['duration', 'start_time']: 113 | self.assertIsInstance( 114 | self.h.attributes[key], int 115 | ) 116 | 117 | def test_020_get_reads_et_al(self): 118 | reads = self.h.get_reads() 119 | try: 120 | read = reads.next() 121 | except AttributeError: 122 | read = next(reads) 123 | self.assertIsInstance( 124 | reads, types.GeneratorType, 125 | '.get_reads() does not give generator.' 126 | ) 127 | self.assertIsInstance( 128 | read, np.ndarray, 129 | '.get_reads().next() does not give numpy array by default.' 130 | ) 131 | self.assertSequenceEqual( 132 | read.dtype.names, ['start', 'length', 'mean', 'stdv'], 133 | '.get_reads().next() does not give "event data".' 134 | ) 135 | reads = self.h.get_reads(group=True) 136 | try: 137 | read = reads.next() 138 | except AttributeError: 139 | read = next(reads) 140 | self.assertIsInstance( 141 | read, h5py._hl.group.Group, 142 | '.get_reads().next() does not give h5py group when asked.' 143 | ) 144 | 145 | def test_030_analysis_locations(self): 146 | path = self.h.get_analysis_latest('Basecall_1D') 147 | self.assertEqual( 148 | '/Analyses/Basecall_1D_000', path, 149 | '.get_analysis_latest() does not return correct.' 150 | ) 151 | 152 | path = self.h.get_analysis_new('Basecall_1D') 153 | self.assertEqual( 154 | '/Analyses/Basecall_1D_001', path, 155 | '.get_analysis_new() does not return correct.' 
156 | ) 157 | 158 | def test_040_split_data_legacy(self): 159 | indices = self.h.get_section_indices() 160 | self.assertIsInstance( 161 | indices, tuple, 162 | '.get_section_indices() does not give tuple' 163 | ) 164 | 165 | for i in range(2): 166 | self.assertIsInstance( 167 | indices[i], tuple, 168 | '.get_section_indices() does not give tuple of tuple, item {}'.format(i) 169 | ) 170 | 171 | def test_042_split_data_linear(self): 172 | indices = self.additional_h.get_section_indices() 173 | self.assertIsInstance( 174 | indices, tuple, 175 | '.get_section_indices() does not give tuple' 176 | ) 177 | 178 | for i in range(2): 179 | self.assertIsInstance( 180 | indices[i], tuple, 181 | '.get_section_indices() does not give tuple of tuple, item {}'.format(i) 182 | ) 183 | 184 | def test_045_split_data_events(self): 185 | for section in ('template', 'complement'): 186 | read = self.h.get_section_events(section) 187 | self.assertIsInstance( 188 | read, np.ndarray, 189 | '.get_section_events({}) does not give numpy array by default.'.format(section) 190 | ) 191 | 192 | 193 | def test_050_sequence_data(self): 194 | for section in ('template', 'complement'): 195 | call = self.h.get_fastq( 196 | 'Basecall_1D', section 197 | ) 198 | self.assertIsInstance(call, str, '{} call is not str.'.format(section)) 199 | 200 | # Check ValueError raised when requesting absent data 201 | self.assertRaises( 202 | ValueError, self.h.get_fastq, 'Basecall_1D', '2D' 203 | ) 204 | 205 | 206 | def test_060_construct_new_file_checks(self): 207 | tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4())) 208 | 209 | with self.assertRaises(IOError): 210 | fh = Fast5.New(tmp_file, 'r') 211 | fh = Fast5.New(tmp_file, 'a', channel_id = self.tmp_channel_id) 212 | fh = Fast5.New(tmp_file, 'a', tracking_id=self.tmp_tracking_id) 213 | 214 | # This should be fine 215 | with Fast5.New(tmp_file, 'a', channel_id = self.tmp_channel_id, tracking_id=self.tmp_tracking_id) as h: 216 | h.set_read(self.tmp_events_float, self.tmp_read_id) 217 | 218 | 219 | def test_061_write_read_float_data(self): 220 | tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4())) 221 | 222 | with Fast5.New(tmp_file, 'a', channel_id = self.tmp_channel_id, tracking_id=self.tmp_tracking_id) as h: 223 | h.set_read(self.tmp_events_float, self.tmp_read_id) 224 | 225 | # Metadata duration and start_time should be integers, not floats 226 | with Fast5(tmp_file, 'r') as h: 227 | for key in ['duration', 'start_time']: 228 | self.assertIsInstance( 229 | h.attributes[key], int 230 | ) 231 | 232 | 233 | with Fast5(tmp_file) as h: 234 | events = h.get_read() 235 | self.assertEqual(events['start'].dtype.descr[0][1], ' 0)[0][0] 237 | unscaled_voltage = self.fh.get_voltage(use_scaling=False) 238 | self.assertNotEqual(voltage[index], unscaled_voltage[index]) 239 | 240 | def test_parse_state_data(self): 241 | """Test parsing of state data""" 242 | states = self.fh.get_state_changes(self.fh.channels[0]) 243 | self.assertEqual(len(states), 43) 244 | 245 | def test_get_state_by_raw_index(self): 246 | """Test channel state at a give raw index""" 247 | 248 | state = self.fh.get_state(self.fh.channels[0], raw_index=100) 249 | self.assertEqual(state, 'unclassified') 250 | 251 | state = self.fh.get_state(self.fh.channels[0], raw_index=61000) 252 | self.assertEqual(state, 'inrange') 253 | 254 | # now test another channel - this might fail if caching has gone wrong 255 | state = self.fh.get_state(self.fh.channels[1], raw_index=61000) 256 | self.assertEqual(state, 'saturated') 257 | 258 
| def test_get_state_by_time(self): 259 | """Test channel state at a given time""" 260 | state = self.fh.get_state(self.fh.channels[0], time=100/self.fh.sample_rate) 261 | self.assertEqual(state, 'unclassified') 262 | 263 | state = self.fh.get_state(self.fh.channels[0], time=61000/self.fh.sample_rate) 264 | self.assertEqual(state, 'inrange') 265 | 266 | state = self.fh.get_state(self.fh.channels[1], time=61000/self.fh.sample_rate) 267 | self.assertEqual(state, 'saturated') 268 | 269 | def test_get_states_in_window_by_raw_index(self): 270 | """Test get_states_in_window using a window specified in raw indices""" 271 | inds = (3045000, 3930001) 272 | states = self.fh.get_states_in_window(self.fh.channels[0], raw_indices=inds) 273 | expected = np.array(['above', 'inrange', 'unclassified_following_reset', 'unusable_pore'], dtype='U28') 274 | 275 | assert np.all(states == expected) 276 | states = self.fh.get_states_in_window(self.fh.channels[1], raw_indices=inds) 277 | expected = np.array(['above', 'inrange', 'unclassified_following_reset'], dtype='U28') 278 | assert np.all(states == expected) 279 | 280 | def test_get_states_in_window_by_times(self): 281 | """Test get_states_in_window using a window specified in times""" 282 | times = (3045000.0 / self.fh.sample_rate, 3930001.0 / self.fh.sample_rate) 283 | states = self.fh.get_states_in_window(self.fh.channels[0], times=times) 284 | expected = np.array(['above', 'inrange', 'unclassified_following_reset', 'unusable_pore'], dtype='U28') 285 | assert np.all(states == expected) 286 | states = self.fh.get_states_in_window(self.fh.channels[1], times=times) 287 | expected = np.array(['above', 'inrange', 'unclassified_following_reset'], dtype='U28') 288 | assert np.all(states == expected) 289 | 290 | 291 | class BulkABFFast5Test(BulkFast5Test): 292 | 293 | example_file = 'abf2bulkfast5.fast5' 294 | 295 | def setUp(self): 296 | self.filepath = os.path.join( 297 | os.path.dirname(__file__), 'data', self.example_file 298 | ) 299 | self.fh = BulkFast5(self.filepath) 300 | 301 | def tearDown(self): 302 | self.fh.close() 303 | 304 | @classmethod 305 | def setUpClass(self): 306 | print('\n* Bulk ABF Fast5') 307 | 308 | # tests to skip 309 | @unittest.skip("Skipping test_parse_experimental_metadata") 310 | def test_parse_experimental_metadata(self): 311 | pass 312 | 313 | @unittest.skip("Skipping test_parse_temperature") 314 | def test_parse_temperature(self): 315 | pass 316 | 317 | @unittest.skip("Skipping test_parse_waveform_timings") 318 | def test_parse_waveform_timings(self): 319 | pass 320 | 321 | @unittest.skip("Skipping test_parse_read_data") 322 | def test_parse_read_data(self): 323 | pass 324 | 325 | @unittest.skip("Skipping test_parse_state_data") 326 | def test_parse_state_data(self): 327 | """Test parsing of state data""" 328 | pass 329 | 330 | @unittest.skip("Skipping test_get_state_by_raw_index") 331 | def test_get_state_by_raw_index(self): 332 | """Test channel state at a given raw index""" 333 | pass 334 | 335 | @unittest.skip("Skipping test_get_state_by_time") 336 | def test_get_state_by_time(self): 337 | """Test channel state at a given time""" 338 | pass 339 | 340 | @unittest.skip("Skipping test_get_states_in_window_by_raw_index") 341 | def test_get_states_in_window_by_raw_index(self): 342 | """Test get_states_in_window using a window specified in raw indices""" 343 | pass 344 | 345 | @unittest.skip("Skipping test_get_states_in_window_by_times") 346 | def test_get_states_in_window_by_times(self): 347 | """Test get_states_in_window
using a window specified in times""" 348 | pass 349 | 350 | def test_raw_data_raises_exception_if_absent(self): 351 | """Test parsing raw data from a channel without raw data raises an exception.""" 352 | with self.assertRaises(KeyError): 353 | self.fh.get_raw(2) 354 | 355 | def test_parse_raw_data(self): 356 | """Test parsing the whole raw dataset""" 357 | raw = self.fh.get_raw(self.fh.channels[0]) 358 | self.assertEqual(len(raw), 10000) 359 | 360 | def test_parse_event_data_len(self): 361 | """Test parsing the whole event dataset""" 362 | events = self.fh.get_events(self.fh.channels[0]) 363 | self.assertEqual(len(events), 5) 364 | 365 | def test_get_mux_changes(self): 366 | """Test parsing of mux changes""" 367 | mux_changes = list(self.fh.get_mux_changes(self.fh.channels[0])) 368 | self.assertEqual(len(mux_changes), 1) 369 | self.assertTupleEqual((0, 1), tuple(mux_changes[0])) 370 | 371 | 372 | if __name__ == "__main__": 373 | unittest.main() 374 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | This Source Code Form is subject to the terms of the Mozilla Public 2 | License, v. 2.0. If a copy of the MPL was not distributed with this 3 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | (c) 2016 Oxford Nanopore Technologies Ltd. 6 | 7 | 8 | Mozilla Public License Version 2.0 9 | ================================== 10 | 11 | ### 1. Definitions 12 | 13 | **1.1. “Contributor”** 14 | means each individual or legal entity that creates, contributes to 15 | the creation of, or owns Covered Software. 16 | 17 | **1.2. “Contributor Version”** 18 | means the combination of the Contributions of others (if any) used 19 | by a Contributor and that particular Contributor's Contribution. 20 | 21 | **1.3. “Contribution”** 22 | means Covered Software of a particular Contributor. 23 | 24 | **1.4. “Covered Software”** 25 | means Source Code Form to which the initial Contributor has attached 26 | the notice in Exhibit A, the Executable Form of such Source Code 27 | Form, and Modifications of such Source Code Form, in each case 28 | including portions thereof. 29 | 30 | **1.5. “Incompatible With Secondary Licenses”** 31 | means 32 | 33 | * **(a)** that the initial Contributor has attached the notice described 34 | in Exhibit B to the Covered Software; or 35 | * **(b)** that the Covered Software was made available under the terms of 36 | version 1.1 or earlier of the License, but not also under the 37 | terms of a Secondary License. 38 | 39 | **1.6. “Executable Form”** 40 | means any form of the work other than Source Code Form. 41 | 42 | **1.7. “Larger Work”** 43 | means a work that combines Covered Software with other material, in 44 | a separate file or files, that is not Covered Software. 45 | 46 | **1.8. “License”** 47 | means this document. 48 | 49 | **1.9. “Licensable”** 50 | means having the right to grant, to the maximum extent possible, 51 | whether at the time of the initial grant or subsequently, any and 52 | all of the rights conveyed by this License. 53 | 54 | **1.10. “Modifications”** 55 | means any of the following: 56 | 57 | * **(a)** any file in Source Code Form that results from an addition to, 58 | deletion from, or modification of the contents of Covered 59 | Software; or 60 | * **(b)** any new file in Source Code Form that contains any Covered 61 | Software. 62 | 63 | **1.11. 
“Patent Claims” of a Contributor** 64 | means any patent claim(s), including without limitation, method, 65 | process, and apparatus claims, in any patent Licensable by such 66 | Contributor that would be infringed, but for the grant of the 67 | License, by the making, using, selling, offering for sale, having 68 | made, import, or transfer of either its Contributions or its 69 | Contributor Version. 70 | 71 | **1.12. “Secondary License”** 72 | means either the GNU General Public License, Version 2.0, the GNU 73 | Lesser General Public License, Version 2.1, the GNU Affero General 74 | Public License, Version 3.0, or any later versions of those 75 | licenses. 76 | 77 | **1.13. “Source Code Form”** 78 | means the form of the work preferred for making modifications. 79 | 80 | **1.14. “You” (or “Your”)** 81 | means an individual or a legal entity exercising rights under this 82 | License. For legal entities, “You” includes any entity that 83 | controls, is controlled by, or is under common control with You. For 84 | purposes of this definition, “control” means **(a)** the power, direct 85 | or indirect, to cause the direction or management of such entity, 86 | whether by contract or otherwise, or **(b)** ownership of more than 87 | fifty percent (50%) of the outstanding shares or beneficial 88 | ownership of such entity. 89 | 90 | 91 | ### 2. License Grants and Conditions 92 | 93 | #### 2.1. Grants 94 | 95 | Each Contributor hereby grants You a world-wide, royalty-free, 96 | non-exclusive license: 97 | 98 | * **(a)** under intellectual property rights (other than patent or trademark) 99 | Licensable by such Contributor to use, reproduce, make available, 100 | modify, display, perform, distribute, and otherwise exploit its 101 | Contributions, either on an unmodified basis, with Modifications, or 102 | as part of a Larger Work; and 103 | * **(b)** under Patent Claims of such Contributor to make, use, sell, offer 104 | for sale, have made, import, and otherwise transfer either its 105 | Contributions or its Contributor Version. 106 | 107 | #### 2.2. Effective Date 108 | 109 | The licenses granted in Section 2.1 with respect to any Contribution 110 | become effective for each Contribution on the date the Contributor first 111 | distributes such Contribution. 112 | 113 | #### 2.3. Limitations on Grant Scope 114 | 115 | The licenses granted in this Section 2 are the only rights granted under 116 | this License. No additional rights or licenses will be implied from the 117 | distribution or licensing of Covered Software under this License. 118 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 119 | Contributor: 120 | 121 | * **(a)** for any code that a Contributor has removed from Covered Software; 122 | or 123 | * **(b)** for infringements caused by: **(i)** Your and any other third party's 124 | modifications of Covered Software, or **(ii)** the combination of its 125 | Contributions with other software (except as part of its Contributor 126 | Version); or 127 | * **(c)** under Patent Claims infringed by Covered Software in the absence of 128 | its Contributions. 129 | 130 | This License does not grant any rights in the trademarks, service marks, 131 | or logos of any Contributor (except as may be necessary to comply with 132 | the notice requirements in Section 3.4). 133 | 134 | #### 2.4. 
Subsequent Licenses 135 | 136 | No Contributor makes additional grants as a result of Your choice to 137 | distribute the Covered Software under a subsequent version of this 138 | License (see Section 10.2) or under the terms of a Secondary License (if 139 | permitted under the terms of Section 3.3). 140 | 141 | #### 2.5. Representation 142 | 143 | Each Contributor represents that the Contributor believes its 144 | Contributions are its original creation(s) or it has sufficient rights 145 | to grant the rights to its Contributions conveyed by this License. 146 | 147 | #### 2.6. Fair Use 148 | 149 | This License is not intended to limit any rights You have under 150 | applicable copyright doctrines of fair use, fair dealing, or other 151 | equivalents. 152 | 153 | #### 2.7. Conditions 154 | 155 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 156 | in Section 2.1. 157 | 158 | 159 | ### 3. Responsibilities 160 | 161 | #### 3.1. Distribution of Source Form 162 | 163 | All distribution of Covered Software in Source Code Form, including any 164 | Modifications that You create or to which You contribute, must be under 165 | the terms of this License. You must inform recipients that the Source 166 | Code Form of the Covered Software is governed by the terms of this 167 | License, and how they can obtain a copy of this License. You may not 168 | attempt to alter or restrict the recipients' rights in the Source Code 169 | Form. 170 | 171 | #### 3.2. Distribution of Executable Form 172 | 173 | If You distribute Covered Software in Executable Form then: 174 | 175 | * **(a)** such Covered Software must also be made available in Source Code 176 | Form, as described in Section 3.1, and You must inform recipients of 177 | the Executable Form how they can obtain a copy of such Source Code 178 | Form by reasonable means in a timely manner, at a charge no more 179 | than the cost of distribution to the recipient; and 180 | 181 | * **(b)** You may distribute such Executable Form under the terms of this 182 | License, or sublicense it under different terms, provided that the 183 | license for the Executable Form does not attempt to limit or alter 184 | the recipients' rights in the Source Code Form under this License. 185 | 186 | #### 3.3. Distribution of a Larger Work 187 | 188 | You may create and distribute a Larger Work under terms of Your choice, 189 | provided that You also comply with the requirements of this License for 190 | the Covered Software. If the Larger Work is a combination of Covered 191 | Software with a work governed by one or more Secondary Licenses, and the 192 | Covered Software is not Incompatible With Secondary Licenses, this 193 | License permits You to additionally distribute such Covered Software 194 | under the terms of such Secondary License(s), so that the recipient of 195 | the Larger Work may, at their option, further distribute the Covered 196 | Software under the terms of either this License or such Secondary 197 | License(s). 198 | 199 | #### 3.4. Notices 200 | 201 | You may not remove or alter the substance of any license notices 202 | (including copyright notices, patent notices, disclaimers of warranty, 203 | or limitations of liability) contained within the Source Code Form of 204 | the Covered Software, except that You may alter any license notices to 205 | the extent required to remedy known factual inaccuracies. 206 | 207 | #### 3.5. 
Application of Additional Terms 208 | 209 | You may choose to offer, and to charge a fee for, warranty, support, 210 | indemnity or liability obligations to one or more recipients of Covered 211 | Software. However, You may do so only on Your own behalf, and not on 212 | behalf of any Contributor. You must make it absolutely clear that any 213 | such warranty, support, indemnity, or liability obligation is offered by 214 | You alone, and You hereby agree to indemnify every Contributor for any 215 | liability incurred by such Contributor as a result of warranty, support, 216 | indemnity or liability terms You offer. You may include additional 217 | disclaimers of warranty and limitations of liability specific to any 218 | jurisdiction. 219 | 220 | 221 | ### 4. Inability to Comply Due to Statute or Regulation 222 | 223 | If it is impossible for You to comply with any of the terms of this 224 | License with respect to some or all of the Covered Software due to 225 | statute, judicial order, or regulation then You must: **(a)** comply with 226 | the terms of this License to the maximum extent possible; and **(b)** 227 | describe the limitations and the code they affect. Such description must 228 | be placed in a text file included with all distributions of the Covered 229 | Software under this License. Except to the extent prohibited by statute 230 | or regulation, such description must be sufficiently detailed for a 231 | recipient of ordinary skill to be able to understand it. 232 | 233 | 234 | ### 5. Termination 235 | 236 | **5.1.** The rights granted under this License will terminate automatically 237 | if You fail to comply with any of its terms. However, if You become 238 | compliant, then the rights granted under this License from a particular 239 | Contributor are reinstated **(a)** provisionally, unless and until such 240 | Contributor explicitly and finally terminates Your grants, and **(b)** on an 241 | ongoing basis, if such Contributor fails to notify You of the 242 | non-compliance by some reasonable means prior to 60 days after You have 243 | come back into compliance. Moreover, Your grants from a particular 244 | Contributor are reinstated on an ongoing basis if such Contributor 245 | notifies You of the non-compliance by some reasonable means, this is the 246 | first time You have received notice of non-compliance with this License 247 | from such Contributor, and You become compliant prior to 30 days after 248 | Your receipt of the notice. 249 | 250 | **5.2.** If You initiate litigation against any entity by asserting a patent 251 | infringement claim (excluding declaratory judgment actions, 252 | counter-claims, and cross-claims) alleging that a Contributor Version 253 | directly or indirectly infringes any patent, then the rights granted to 254 | You by any and all Contributors for the Covered Software under Section 255 | 2.1 of this License shall terminate. 256 | 257 | **5.3.** In the event of termination under Sections 5.1 or 5.2 above, all 258 | end user license agreements (excluding distributors and resellers) which 259 | have been validly granted by You or Your distributors under this License 260 | prior to termination shall survive termination. 261 | 262 | 263 | ### 6. 
Disclaimer of Warranty 264 | 265 | > Covered Software is provided under this License on an “as is” 266 | > basis, without warranty of any kind, either expressed, implied, or 267 | > statutory, including, without limitation, warranties that the 268 | > Covered Software is free of defects, merchantable, fit for a 269 | > particular purpose or non-infringing. The entire risk as to the 270 | > quality and performance of the Covered Software is with You. 271 | > Should any Covered Software prove defective in any respect, You 272 | > (not any Contributor) assume the cost of any necessary servicing, 273 | > repair, or correction. This disclaimer of warranty constitutes an 274 | > essential part of this License. No use of any Covered Software is 275 | > authorized under this License except under this disclaimer. 276 | 277 | ### 7. Limitation of Liability 278 | 279 | > Under no circumstances and under no legal theory, whether tort 280 | > (including negligence), contract, or otherwise, shall any 281 | > Contributor, or anyone who distributes Covered Software as 282 | > permitted above, be liable to You for any direct, indirect, 283 | > special, incidental, or consequential damages of any character 284 | > including, without limitation, damages for lost profits, loss of 285 | > goodwill, work stoppage, computer failure or malfunction, or any 286 | > and all other commercial damages or losses, even if such party 287 | > shall have been informed of the possibility of such damages. This 288 | > limitation of liability shall not apply to liability for death or 289 | > personal injury resulting from such party's negligence to the 290 | > extent applicable law prohibits such limitation. Some 291 | > jurisdictions do not allow the exclusion or limitation of 292 | > incidental or consequential damages, so this exclusion and 293 | > limitation may not apply to You. 294 | 295 | 296 | ### 8. Litigation 297 | 298 | Any litigation relating to this License may be brought only in the 299 | courts of a jurisdiction where the defendant maintains its principal 300 | place of business and such litigation shall be governed by laws of that 301 | jurisdiction, without reference to its conflict-of-law provisions. 302 | Nothing in this Section shall prevent a party's ability to bring 303 | cross-claims or counter-claims. 304 | 305 | 306 | ### 9. Miscellaneous 307 | 308 | This License represents the complete agreement concerning the subject 309 | matter hereof. If any provision of this License is held to be 310 | unenforceable, such provision shall be reformed only to the extent 311 | necessary to make it enforceable. Any law or regulation which provides 312 | that the language of a contract shall be construed against the drafter 313 | shall not be used to construe this License against a Contributor. 314 | 315 | 316 | ### 10. Versions of the License 317 | 318 | #### 10.1. New Versions 319 | 320 | Mozilla Foundation is the license steward. Except as provided in Section 321 | 10.3, no one other than the license steward has the right to modify or 322 | publish new versions of this License. Each version will be given a 323 | distinguishing version number. 324 | 325 | #### 10.2. Effect of New Versions 326 | 327 | You may distribute the Covered Software under the terms of the version 328 | of the License under which You originally received the Covered Software, 329 | or under the terms of any subsequent version published by the license 330 | steward. 331 | 332 | #### 10.3. 
Modified Versions 333 | 334 | If you create software not governed by this License, and you want to 335 | create a new license for such software, you may create and use a 336 | modified version of this License if you rename the license and remove 337 | any references to the name of the license steward (except to note that 338 | such modified license differs from this License). 339 | 340 | #### 10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses 341 | 342 | If You choose to distribute Source Code Form that is Incompatible With 343 | Secondary Licenses under the terms of this version of the License, the 344 | notice described in Exhibit B of this License must be attached. 345 | 346 | ## Exhibit A - Source Code Form License Notice 347 | 348 | This Source Code Form is subject to the terms of the Mozilla Public 349 | License, v. 2.0. If a copy of the MPL was not distributed with this 350 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 351 | 352 | If it is not possible or desirable to put the notice in a particular 353 | file, then You may include the notice in a location (such as a LICENSE 354 | file in a relevant directory) where a recipient would be likely to look 355 | for such a notice. 356 | 357 | You may add additional accurate notices of copyright ownership. 358 | 359 | ## Exhibit B - “Incompatible With Secondary Licenses” Notice 360 | 361 | This Source Code Form is "Incompatible With Secondary Licenses", as 362 | defined by the Mozilla Public License, v. 2.0. 363 | 364 | 365 | -------------------------------------------------------------------------------- /fast5_research/util.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from itertools import tee 3 | from math import pow, log10 4 | import os 5 | import sys 6 | 7 | import numpy as np 8 | import numpy.lib.recfunctions as nprf 9 | 10 | 11 | def qstring_to_phred(quality): 12 | """Compute standard phred scores from a quality string.""" 13 | qscores = [ord(q) - 33 for q in quality] 14 | return qscores 15 | 16 | 17 | def mean_qscore(scores): 18 | """Returns the phred score corresponding to the mean of the probabilities 19 | associated with the phred scores provided. Taken from chimaera.common.utilities. 20 | 21 | 22 | :param scores: Iterable of phred scores. 23 | 24 | :returns: Phred score corresponding to the average error rate, as 25 | estimated from the input phred scores. 26 | """ 27 | if len(scores) == 0: 28 | return 0.0 29 | sum_prob = 0.0 30 | for val in scores: 31 | sum_prob += pow(10, -0.1 * val) 32 | mean_prob = sum_prob / len(scores) 33 | return -10.0 * log10(mean_prob) 34 | 35 | 36 | def kmer_overlap_gen(kmers, moves=None): 37 | """From a list of kmers return the character shifts between them. 38 | (Movement from i to i+1 entry, e.g. [AATC,ATCG] returns [0,1]). 39 | Allowed moves may be specified in moves argument in order of preference. 40 | Taken from dragonet.bio.seq_tools 41 | 42 | :param kmers: sequence of kmer strings. 43 | :param moves: allowed movements, if None all movements to length of kmer 44 | are allowed. 
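    A transition that cannot be explained by any allowed move is reported as a move of the full kmer length (see the for/else clause in the body below).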
45 | """ 46 | 47 | first = True 48 | yield 0 49 | for last_kmer, this_kmer in window(kmers, 2): 50 | if first: 51 | if moves is None: 52 | l = len(this_kmer) 53 | moves = range(l + 1) 54 | first = False 55 | 56 | l = len(this_kmer) 57 | for j in moves: 58 | if j < 0: 59 | if last_kmer[:j] == this_kmer[-j:]: 60 | yield j 61 | break 62 | elif j > 0 and j < l: 63 | if last_kmer[j:l] == this_kmer[0:-j]: 64 | yield j 65 | break 66 | elif j == 0: 67 | if last_kmer == this_kmer: 68 | yield 0 69 | break 70 | else: 71 | yield l 72 | break 73 | 74 | 75 | def build_mapping_table(events, ref_seq, post, scale, path, model): 76 | """Build a mapping table based on output of a dragonet.mapper style object. 77 | Taken from chimaera.common.utilities. 78 | 79 | :param events: Numpy record array of events. Must contain the mean, 80 | stdv, start and length fields. 81 | :param ref_seq: String representation of the reference sequence. 82 | :param post: Numpy 2D array containing the posteriors (event, state). 83 | :param scale: Scaling object. 84 | :param path: Numpy 1D array containing position in reference. May contain 85 | negative values, which will be interpreted as "bad emissions". 86 | :param model: Model object to use. 87 | 88 | :returns: numpy record array containing summary fields. One record per event. 89 | 90 | ==================== ===================================================== 91 | Output Field Description 92 | ==================== ===================================================== 93 | *mean* mean value of event samples (level) 94 | *scaled_mean* *mean* scaled to the bare level emission (mean/mode) 95 | *stdv* standard deviation of event samples (noise) 96 | *scaled_stdv* *stdv* scaled to the bare stdv emission (mode) 97 | *start* start time of event /s 98 | *length* length of event /s 99 | *model_level* modelled event level, i.e. the level emission 100 | associated with the kmer *kmer*, scaled to the data 101 | *model_scaled_level* bare level emission 102 | *model_sd* modelled event noise, i.e. 
the sd emission associated 103 | with the kmer *kmer*, scaled to the data 104 | *model_scaled_sd* bare noise emission 105 | *seq_pos* aligned sequence position, position on Viterbi path 106 | *p_seq_pos* posterior probability of states on Viterbi path 107 | *kmer* kmer identity of *seq_pos* 108 | *mp_pos* aligned sequence position, position with highest 109 | posterior 110 | *p_mp_pos* posterior probability of most probable states 111 | *mp_kmer* kmer identity of *mp_pos* 112 | *good_emission* whether or not the HMM has tagged event as fitting 113 | the model 114 | ==================== ===================================================== 115 | 116 | """ 117 | kmer_len = len(model['kmer'][0]) 118 | 119 | kmer_index = seq_to_kmers(ref_seq, kmer_len) 120 | label_index = dict((j,i) for i,j in enumerate(model['kmer'])) 121 | kmer_dtype = '|S{}'.format(kmer_len) 122 | 123 | column_names = ['mean', 'scaled_mean', 'stdv', 'scaled_stdv', 'start', 'length', 124 | 'model_level', 'model_scaled_level', 'model_sd', 'model_scaled_sd', 125 | 'p_seq_pos', 'p_mp_pos', 'seq_pos', 'mp_pos', 'move', 'good_emission', 126 | 'kmer', 'mp_kmer'] 127 | column_types = [float] * 12 + [int] * 3 + [bool] + [kmer_dtype] * 2 128 | table = np.zeros(events.size, dtype=list(zip(column_names, column_types))) 129 | 130 | zero_start = events['start'] - events['start'][0] 131 | 132 | # Sequence position 133 | seq_pos = np.where(path >= 0, path, np.abs(path) - 1) 134 | seq_kmer = [kmer_index[x] for x in seq_pos] 135 | seq_kmer_i = [label_index[i] for i in seq_kmer] 136 | 137 | table['seq_pos'] = seq_pos 138 | table['kmer'] = seq_kmer 139 | table['p_seq_pos'] = post[range(post.shape[0]), seq_pos] 140 | table['move'] = np.ediff1d(seq_pos, to_begin=[0]) 141 | # Highest posterior positions 142 | mp_pos = np.argmax(post, axis=1) 143 | table['mp_pos'] = mp_pos 144 | table['mp_kmer'] = [kmer_index[x] for x in mp_pos] 145 | table['p_mp_pos'] = post[range(post.shape[0]), table['mp_pos']] 146 | # The data 147 | for x in ('mean', 'start','length', 'stdv'): 148 | table[x] = events[x] 149 | # scaling data to model 150 | table['scaled_mean'] = (table['mean'] - scale.shift - scale.drift * zero_start) / scale.scale 151 | table['scaled_stdv'] = table['stdv'] / scale.scale_sd 152 | # The model 153 | table['model_scaled_level'] = model['level_mean'][seq_kmer_i] 154 | table['model_scaled_sd'] = model['sd_mean'][seq_kmer_i] 155 | # The model scaled to the data 156 | table['model_level'] = scale.shift + scale.drift * zero_start + scale.scale * table['model_scaled_level'] 157 | table['model_sd'] = scale.scale_sd * table['model_scaled_sd'] 158 | # Tag ignore and outlier states 159 | table['good_emission'] = [x >= 0 for x in path] 160 | return table 161 | 162 | 163 | def build_mapping_summary_table(mapping_summary): 164 | """Build a mapping summary table 165 | 166 | :param mapping_summary: List of curr_map dictionaries 167 | 168 | :returns: Numpy record array containing summary contents.
One record per array element of mapping_summary 169 | 170 | """ 171 | # Set memory allocation for variable length strings 172 | # This works, but there must be a better way 173 | max_len_name = 1 174 | max_len_direction = 1 175 | max_len_seq = 1 176 | for summary_line in mapping_summary: 177 | len_name = len(summary_line['name']) 178 | if len_name > max_len_name: 179 | max_len_name = len_name 180 | 181 | len_direction = len(summary_line['direction']) 182 | if len_direction > max_len_direction: 183 | max_len_direction = len_direction 184 | 185 | len_seq = len(summary_line['seq']) 186 | if len_seq > max_len_seq: 187 | max_len_seq = len_seq 188 | 189 | column_names = ['name', 'direction', 'is_best', 'score', 'scale', 'shift', 'drift', 'scale_sd', 'var_sd', 'var', 'seq'] 190 | column_types = ['|S{}'.format(max_len_name)] + ['|S{}'.format(max_len_direction)] + [bool] + [float] * 7 + ['|S{}'.format(max_len_seq)] 191 | 192 | table = np.zeros(len(mapping_summary), dtype=list(zip(column_names, column_types))) 193 | for table_line, summary_line, in zip(table,mapping_summary): 194 | table_line['name'] = summary_line['name'] 195 | table_line['direction'] = summary_line['direction'] 196 | table_line['score'] = summary_line['score'] 197 | table_line['scale'] = summary_line['scale'].scale 198 | table_line['shift'] = summary_line['scale'].shift 199 | table_line['drift'] = summary_line['scale'].drift 200 | table_line['scale_sd'] = summary_line['scale'].scale_sd 201 | table_line['var_sd'] = summary_line['scale'].var_sd 202 | table_line['var'] = summary_line['scale'].var 203 | table_line['seq'] = summary_line['seq'] 204 | 205 | table['is_best'] = False 206 | is_best = np.argmin([line['score'] for line in mapping_summary]) 207 | table[is_best]['is_best'] = True 208 | 209 | return table 210 | 211 | 212 | def create_basecall_1d_output(raw_events, scale, path, model, post=None): 213 | """Create the annotated event table and basecalling summaries similar to chimaera. 214 | 215 | :param raw_events: :class:`np.ndarray` with fields mean, stdv, start and 216 | length. 217 | :param scale: :class:`dragonet.basecall.scaling.Scaler` object (or object 218 | with attributes `shift`, `scale`, `drift`, `var`, `scale_sd`, 219 | and `var_sd`). 220 | :param path: list containing state indices with respect to `model`. 221 | :param model: `:class:dragonet.util.model.Model` object. 222 | :param post: Two-dimensional :class:`np.ndarray` containing posteriors (event, state). 223 | :param quality_data: :class:np.ndarray Array containing quality_data, used to annotate events.
224 | 225 | :returns: A tuple of: 226 | 227 | * the annotated input event table 228 | * a dict of result 229 | """ 230 | 231 | events = raw_events.copy() 232 | model_state = np.array([model[x]['kmer'] for x in path]) 233 | raw_model_level = np.array([model[x]['level_mean'] for x in path]) 234 | move = np.array(list(kmer_overlap_gen(model_state))) 235 | counts = np.bincount(move) 236 | stays = counts[0] 237 | skips = counts[2] if len(counts) > 2 else 0 238 | 239 | # Extend the event table 240 | read_start = events[0]['start'] 241 | model_level = scale.shift + scale.scale * raw_model_level +\ 242 | scale.drift * (events['start'] - read_start) 243 | new_columns = ['model_state', 'model_level', 'move'] 244 | column_data = [model_state, model_level, move] 245 | 246 | if post is not None: 247 | weights = np.sum(post, axis=1) 248 | new_columns.append('weights') 249 | column_data.append(weights) 250 | 251 | drop_first = set(new_columns) & set(events.dtype.names) 252 | events = nprf.drop_fields(events, drop_first) 253 | table = nprf.append_fields(events, new_columns, data=column_data, asrecarray=True) 254 | 255 | # Compile the results 256 | results = { 257 | 'num_events': events.size, 258 | 'called_events': events.size, 259 | 'shift': scale.shift, 260 | 'scale': scale.scale, 261 | 'drift': scale.drift, 262 | 'var': scale.var, 263 | 'scale_sd': scale.scale_sd, 264 | 'var_sd': scale.var_sd, 265 | 'num_stays': stays, 266 | 'num_skips': skips 267 | } 268 | 269 | return table, results 270 | 271 | 272 | def create_mapping_output(raw_events, scale, path, model, seq, post=None, n_states=None, is_reverse=False, substates=False): 273 | """Create the annotated event table and summaries similar to chimaera 274 | 275 | :param raw_events: :class:`np.ndarray` with fields `mean`, `stdv`, `start`, 276 | and `length`. 277 | :param scale: :class:`dragonet.basecall.scaling.Scaler` object (or object 278 | with attributes `shift`, `scale`, `drift`, `var`, `scale_sd`, 279 | and `var_sd`). 280 | :param path: list containing state indices with respect to `model`. 281 | :param model: `:class:dragonet.util.model.Model` object. 282 | :param seq: String representation of the reference sequence. 283 | :param post: Two-dimensional :class:`np.ndarray` containing posteriors (event, state). 284 | :param is_reverse: Mapping refers to '-' strand (bool). 285 | :param substates: Mapping contains substates? 286 | 287 | :returns: A tuple of: 288 | * the annotated input event table, 289 | * a dict of result. 290 | 291 | """ 292 | 293 | events = raw_events.copy() 294 | direction = '+' if not is_reverse else '-' 295 | has_post = True 296 | 297 | # If we don't have a posterior, pass a mock object 298 | if post is None: 299 | if n_states is None: 300 | raise ValueError('n_states is required if post is None.') 301 | has_post = False 302 | post = MockZeroArray((len(events), n_states)) 303 | table = build_mapping_table(events, seq, post, scale, path, model) 304 | 305 | # Delete mocked out columns 306 | if not has_post: 307 | to_delete = ['p_seq_pos', 'mp_pos', 'mp_kmer', 'p_mp_pos'] 308 | table = nprf.drop_fields(table, to_delete) 309 | 310 | if direction == '-': 311 | events['seq_pos'] = len(seq) - table['seq_pos'] 312 | ref_start = table['seq_pos'][-1] 313 | ref_stop = table['seq_pos'][0] 314 | else: 315 | ref_start = table['seq_pos'][0] 316 | ref_stop = table['seq_pos'][-1] 317 | 318 | # Compute movement stats.
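    # For example (toy illustration): a path of [0, 0, 1, 3] gives per-event
    # moves of [0, 0, 1, 2], i.e. stays=2 (moves of zero) and skips=1 (moves of two).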
318 | _, stays, skips = compute_movement_stats(path) 319 | 320 | results = { 321 | 'direction': direction, 322 | 'reference': seq, 323 | 'ref_start': ref_start, 324 | 'ref_stop': ref_stop, 325 | 'shift': scale.shift, 326 | 'scale': scale.scale, 327 | 'drift': scale.drift, 328 | 'var': scale.var, 329 | 'scale_sd': scale.scale_sd, 330 | 'var_sd': scale.var_sd, 331 | 'num_stays': stays, 332 | 'num_skips': skips 333 | } 334 | 335 | return table, results 336 | 337 | 338 | class MockZeroArray(np.ndarray): 339 | def __init__(self, shape): 340 | """Mock enough of ndarray interface to be passable as a posterior matrix 341 | to chimaera build_mapping_table 342 | 343 | :param shape: tuple, shape of array 344 | 345 | """ 346 | self.shape = shape 347 | 348 | def argmax(self, axis=0): 349 | """Fake argmax values of an array.""" 350 | return np.zeros(self.shape[1-axis], dtype=int) 351 | 352 | 353 | def validate_event_table(table): 354 | """Check if an object contains all columns of a basic event array.""" 355 | 356 | if not isinstance(table, np.ndarray): 357 | raise TypeError('Table is not a ndarray.') 358 | 359 | req_fields = ['mean', 'stdv', 'start', 'length'] 360 | if not set(req_fields).issubset(table.dtype.names): 361 | raise KeyError( 362 | 'Array does not contain fields for event array: {}, got {}.'.format( 363 | req_fields, table.dtype.names 364 | ) 365 | ) 366 | 367 | 368 | def validate_model_table(table): 369 | """Check if an object contains all columns of a dragonet Model.""" 370 | if not isinstance(table, np.ndarray): 371 | raise TypeError('Table is not a ndarray.') 372 | 373 | req_fields = ['kmer', 'level_mean', 'level_stdv', 'sd_mean', 'sd_stdv'] 374 | if not set(req_fields).issubset(table.dtype.names): 375 | raise KeyError( 376 | 'Object does not contain fields required for Model: {}, got {}.'.format( 377 | req_fields, table.dtype.names 378 | ) 379 | ) 380 | 381 | 382 | def validate_scale_object(obj): 383 | """Check if an object contains all attributes of dragonet Scaler.""" 384 | 385 | req_attributes = ['shift', 'scale', 'drift', 'var', 'scale_sd', 'var_sd'] 386 | msg = 'Object does not contain attributes required for Scaler: {}'.format(req_attributes) 387 | assert all([hasattr(obj, attr) for attr in req_attributes]), msg 388 | 389 | 390 | def compute_movement_stats(path): 391 | """Compute movement stats from a mapping state path 392 | 393 | :param path: :class:`np.ndarry` containing position in reference. 394 | Negative values are interpreted as "bad emissions". 395 | """ 396 | 397 | vitstate_indices = np.where(path >= 0, path, np.abs(path) - 1) 398 | move = np.ediff1d(vitstate_indices, to_begin=0) 399 | counts = np.bincount(move) 400 | stays = counts[0] 401 | skips = counts[2] if len(counts) > 2 else 0 402 | 403 | return move, stays, skips 404 | 405 | 406 | def seq_to_kmers(seq, length): 407 | """Turn a string into a list of (overlapping) kmers. 408 | 409 | e.g. 
perform the transformation: 410 | 411 | 'ATATGCG' => ['ATA','TAT', 'ATG', 'TGC', 'GCG'] 412 | 413 | :param seq: character string 414 | :param length: length of kmers in output 415 | 416 | :returns: A list of overlapping kmers 417 | """ 418 | return [seq[x:x + length] for x in range(0, len(seq) - length + 1)] 419 | 420 | 421 | def window(iterable, size): 422 | """Create an iterator returning a sliding window from another iterator 423 | 424 | :param iterable: Iterator 425 | :param size: Size of window 426 | 427 | :returns: an iterator returning a tuple containing the data in the window 428 | 429 | """ 430 | assert size > 0, "Window size for iterator should be strictly positive, got {0}".format(size) 431 | iters = tee(iterable, size) 432 | for i in range(1, size): 433 | for each in iters[i:]: 434 | next(each, None) 435 | return list(zip(*iters)) 436 | 437 | 438 | def readtsv(fname, fields=None, **kwargs): 439 | """Read a tsv file into a numpy array with required field checking 440 | 441 | :param fname: filename to read. If the filename extension is 442 | gz or bz2, the file is first decompressed. 443 | :param fields: list of required fields. 444 | """ 445 | 446 | if not file_has_fields(fname, fields): 447 | raise KeyError('File {} does not contain requested required fields {}'.format(fname, fields)) 448 | 449 | for k in ['names', 'delimiter', 'dtype']: 450 | kwargs.pop(k, None) 451 | table = np.genfromtxt(fname, names=True, delimiter='\t', dtype=None, encoding='utf8', **kwargs) 452 | # Numpy tricks to force single element to be array of one row 453 | return table.reshape(-1) 454 | 455 | 456 | def docstring_parameter(*sub): 457 | """Allow docstrings to contain parameters.""" 458 | def dec(obj): 459 | obj.__doc__ = obj.__doc__.format(*sub) 460 | return obj 461 | return dec 462 | 463 | 464 | def med_mad(data, factor=None, axis=None, keepdims=False): 465 | """Compute the Median Absolute Deviation, i.e., the median 466 | of the absolute deviations from the median, and the median 467 | 468 | :param data: A :class:`ndarray` object 469 | :param factor: Factor to scale MAD by. Default (None) is to be consistent 470 | with the standard deviation of a normal distribution 471 | (i.e. mad( N(0,\sigma^2) ) = \sigma). 472 | :param axis: For multidimensional arrays, which axis to calculate over 473 | :param keepdims: If True, axis is kept as dimension of length 1 474 | 475 | :returns: a tuple containing the median and MAD of the data 476 | 477 | """ 478 | if factor is None: 479 | factor = 1.4826 480 | dmed = np.median(data, axis=axis, keepdims=True) 481 | dmad = factor * np.median(abs(data - dmed), axis=axis, keepdims=True) 482 | if axis is None: 483 | dmed = dmed.flatten()[0] 484 | dmad = dmad.flatten()[0] 485 | elif not keepdims: 486 | dmed = dmed.squeeze(axis) 487 | dmad = dmad.squeeze(axis) 488 | return dmed, dmad 489 | 490 | 491 | def mad(data, factor=None, axis=None, keepdims=False): 492 | """Compute the Median Absolute Deviation, i.e., the median 493 | of the absolute deviations from the median, and (by default) 494 | adjust by a factor for asymptotically normal consistency. 495 | 496 | :param data: A :class:`ndarray` object 497 | :param factor: Factor to scale MAD by. Default (None) is to be consistent 498 | with the standard deviation of a normal distribution 499 | (i.e. mad( N(0,\sigma^2) ) = \sigma). 500 | :param axis: For multidimensional arrays, which axis to calculate the median over. 
501 | :param keepdims: If True, axis is kept as dimension of length 1 502 | 503 | :returns: the (scaled) MAD 504 | 505 | """ 506 | _ , dmad = med_mad(data, factor=factor, axis=axis, keepdims=keepdims) 507 | return dmad 508 | 509 | 510 | def file_has_fields(fname, fields=None): 511 | """Check that a tsv file has given fields 512 | 513 | :param fname: filename to read. If the filename extension is 514 | gz or bz2, the file is first decompressed. 515 | :param fields: list of required fields. 516 | 517 | :returns: boolean 518 | """ 519 | 520 | # Allow a quick return 521 | req_fields = deepcopy(fields) 522 | if isinstance(req_fields, str): 523 | req_fields = [fields] 524 | if req_fields is None or len(req_fields) == 0: 525 | return True 526 | req_fields = set(req_fields) 527 | 528 | inspector = open 529 | ext = os.path.splitext(fname)[1] 530 | if ext == '.gz': 531 | inspector = gzopen 532 | elif ext == '.bz2': 533 | inspector = bzopen 534 | 535 | has_fields = None 536 | with inspector(fname, 'r') as fh: 537 | present_fields = set(fh.readline().rstrip('\n').split('\t')) 538 | has_fields = req_fields.issubset(present_fields) 539 | return has_fields 540 | 541 | 542 | def get_changes(data, ignore_cols=None, use_cols=None): 543 | """Return only rows of a structured array which are not equal to the previous row. 544 | 545 | :param data: Numpy record array. 546 | :param ignore_cols: iterable of column names to ignore in checking for equality between rows. 547 | :param use_cols: iterable of column names to include in checking for equality between rows (only used if ignore_cols is None). 548 | 549 | :returns: Numpy record array. 550 | """ 551 | cols = list(data.dtype.names) 552 | if ignore_cols is not None: 553 | for col in ignore_cols: 554 | cols.remove(col) 555 | elif use_cols is not None: 556 | cols = list(use_cols) 557 | changed_inds = np.where(data[cols][1:] != data[cols][:-1])[0] + 1 558 | changed_inds = [0] + [i for i in changed_inds] 559 | return data[(changed_inds,)] 560 | 561 | 562 | def _clean(value): 563 | """Convert numpy numeric types to their python equivalents.""" 564 | if isinstance(value, np.ndarray): 565 | if value.dtype.kind == 'S': 566 | return np.char.decode(value).tolist() 567 | else: 568 | return value.tolist() 569 | elif type(value).__module__ == np.__name__: 570 | conversion = value.item() 571 | if sys.version_info.major == 3 and isinstance(conversion, bytes): 572 | conversion = conversion.decode() 573 | return conversion 574 | elif sys.version_info.major == 3 and isinstance(value, bytes): 575 | return value.decode() 576 | else: 577 | return value 578 | 579 | 580 | def _clean_attrs(attrs): 581 | return {_clean(k): _clean(v) for k, v in attrs.items()} 582 | 583 | 584 | def _sanitize_data_for_writing(data): 585 | if isinstance(data, str): 586 | return data.encode() 587 | elif isinstance(data, np.ndarray) and data.dtype.kind == np.dtype(np.unicode): 588 | return data.astype('S') 589 | elif isinstance(data, np.ndarray) and len(data.dtype) > 1: 590 | dtypes = dtype_descr(data) 591 | for index, entry in enumerate(dtypes): 592 | type_check = entry[1] 593 | if isinstance(type_check, tuple): 594 | # an enum? 595 | return data 596 | if type_check.startswith(' 1: 615 | dtypes = list(dtype_descr(data)) 616 | for index, entry in enumerate(dtypes): 617 | type_check = entry[1] 618 | if isinstance(type_check, tuple): 619 | # an enum? 620 | return data 621 | if entry[1].startswith('|S'): 622 | # numpy.astype can't handle empty datafields for some 623 | # reason, so we'll explicitly state that. 
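                # e.g. a dtype entry such as ('name', '|S') or ('name', '|S0')
                # describes a zero-width field and is rejected below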
624 | if len(entry[1]) <= 2 or (len(entry[1]) == 3 and 625 | entry[1][2] == '0'): 626 | raise TypeError('Empty datafield {} cannot be converted' 627 | ' by np.astype.'.format(entry[0])) 628 | dtypes[index] = (entry[0], ' 1: 151 | with ProcessPoolExecutor(args.workers) as executor: 152 | futures = [executor.submit(worker, c, summary=summary_by_ch[c]) for c in channels] 153 | for future in as_completed(futures): 154 | try: 155 | n_reads, channel = future.result() 156 | except Exception as e: 157 | logger.warning("Error processing channel.") 158 | print(e) 159 | else: 160 | logger.info("Extracted {} reads from channel {}.".format(n_reads, channel)) 161 | else: 162 | for channel in channels: 163 | worker(channel, summary=summary_by_ch[channel]) 164 | logger.info("Finished.") 165 | 166 | 167 | def time_cast(time, sample_rate): 168 | """ 169 | Convert a float time to sample index, or return time unmodified 170 | """ 171 | if isinstance(time, float): 172 | return int(time * sample_rate) 173 | else: 174 | return time 175 | 176 | 177 | def extract_channel_reads(source, output, prefix, flat, by_id, max_files, multi, channel, summary=None): 178 | if flat: 179 | out_path = output 180 | # give multi files a channel prefix else they will 181 | # conflict between channels. Singles already get 182 | # a "ch" component in their name 183 | if multi: 184 | extra = 'ch{}'.format(channel) 185 | if prefix == '': 186 | prefix = extra 187 | else: 188 | prefix = '{}_{}'.format(prefix, extra) 189 | else: 190 | out_path = os.path.join(output, str(channel)) 191 | os.makedirs(out_path) 192 | 193 | with BulkFast5(source) as src: 194 | meta = src.get_metadata(channel) 195 | tracking_id = src.get_tracking_meta() 196 | context_tags = src.get_context_meta() 197 | channel_id = { 198 | 'channel_number': channel, 199 | 'range': meta['range'], 200 | 'digitisation': meta['digitisation'], 201 | 'offset': meta['offset'], 202 | 'sampling_rate': meta['sample_rate'] 203 | } 204 | 205 | Writer = MultiWriter if multi else SingleWriter 206 | with Writer(out_path, by_id, prefix=prefix) as writer: 207 | 208 | median_before = None 209 | counter = 1 210 | raw_data = src.get_raw(channel, use_scaling=False) 211 | 212 | if summary is not None: 213 | # convert array into stream of dicts 214 | reads = ({k: row[k] for k in row.dtype.names} for row in summary) 215 | class_field = 'class' 216 | start_field = 'start_time' 217 | duration_field = 'duration' 218 | # if start_time is a float (seconds) we need to convert to 219 | # samples 220 | time_cols = ['start_time', 'duration'] 221 | else: 222 | reads = src.get_reads(channel) 223 | class_field = 'classification' 224 | start_field = 'read_start' 225 | duration_field = 'read_length' 226 | 227 | for read_number, read in enumerate(reads): 228 | 229 | if summary is not None: 230 | if 'median_current_before' in read: 231 | median_before = read['median_current_before'] 232 | else: 233 | median_before = 0.0 234 | elif median_before is None: 235 | median_before = read['median'] 236 | continue 237 | 238 | if summary is None and read[class_field] != 'strand': 239 | median_before = read['median'] 240 | else: 241 | counter += 1 242 | start = time_cast(read[start_field], meta['sample_rate']) 243 | length = time_cast(read[duration_field], meta['sample_rate']) 244 | read_id = { 245 | 'start_time': start, 246 | 'duration': length, 247 | 'read_number': read_number, 248 | 'start_mux': src.get_mux(channel, raw_index=start, wells_only=True), 249 | 'read_id': str(read['read_id']) if 'read_id' in read else str(uuid4()), 
250 | 'scaling_used': 1, 251 | 'median_before': median_before 252 | } 253 | 254 | raw_slice = raw_data[start:start+length] 255 | read = Read(read_id, read_number, tracking_id, channel_id, context_tags, raw_slice) 256 | writer.write_read(read) 257 | if counter == max_files: 258 | break 259 | return counter, channel 260 | 261 | 262 | def build_read_index(): 263 | logging.basicConfig( 264 | format='[%(asctime)s - %(name)s] %(message)s', 265 | datefmt='%H:%M:%S', level=logging.INFO 266 | ) 267 | logger = logging.getLogger('Index Reads') 268 | 269 | parser = argparse.ArgumentParser(description='Build index of reads within .fast5s. Output to stdout.') 270 | parser.add_argument('input', help='.fast5 directory') 271 | parser.add_argument('--recursive', action='store_true', 272 | help='Search recursively under `input` for source files.') 273 | parser.add_argument('--workers', type=int, default=8, 274 | help='Number of worker processes.') 275 | args = parser.parse_args() 276 | 277 | src_files = list(iterate_fast5(args.input, paths=True, recursive=args.recursive)) 278 | logger.info("Found {} files.".format(len(src_files))) 279 | 280 | with ProcessPoolExecutor(args.workers) as executor: 281 | n_reads = 0 282 | for i, (src, read_ids) in enumerate( 283 | zip(src_files, executor.map(reads_in_multi, src_files, chunksize=10))): 284 | n_reads += len(read_ids) 285 | for read in read_ids: 286 | print('\t'.join((read, os.path.abspath(src)))) 287 | if i % 10 == 0: 288 | logger.info("Indexed {}/{} files. {} reads".format(i, len(src_files), n_reads)) 289 | 290 | def filter_file_from_bam(): 291 | logging.basicConfig( 292 | format='[%(asctime)s - %(name)s] %(message)s', 293 | datefmt='%H:%M:%S', level=logging.INFO 294 | ) 295 | logger = logging.getLogger('Filter') 296 | parser = argparse.ArgumentParser( 297 | description='Create filter file from BAM and sequencing summary') 298 | parser.add_argument('--seperator', 299 | dest="SEP", 300 | default='\t', 301 | help="Separator in sequencing summary files") 302 | parser.add_argument('--id-col', 303 | dest="READID_COL", 304 | default='read_id', 305 | help="Column name for read_id in sequencing summary files") 306 | parser.add_argument('--fname-col', 307 | dest="FNAME_COL", 308 | default='filename', 309 | help="Column name for fast5 filename in sequencing summary files") 310 | parser.add_argument('-r', '--region', 311 | dest="REGION", 312 | default=None, 313 | help="Print reads only from this region") 314 | parser.add_argument('--workers', type=int, default=4, 315 | help='Number of worker processes.') 316 | parser.add_argument('-p', '--primary-only', 317 | dest="PRIMARY", 318 | action='store_true', 319 | help="Ignore secondary and supplementary alignments") 320 | 321 | parser.add_argument('BAM', help='Path to BAM file') 322 | parser.add_argument("SUMMARY", 323 | type=str, 324 | nargs='+', 325 | help="Sequencing summary files") 326 | 327 | args = parser.parse_args() 328 | 329 | region = args.REGION 330 | primary_only = args.PRIMARY 331 | bam_in = args.BAM 332 | summary_files = args.SUMMARY 333 | threads = args.workers 334 | readid_col = args.READID_COL 335 | fast5_col = args.FNAME_COL 336 | sep = args.SEP 337 | 338 | if not region: 339 | logger.info("No region specified.
Extracting all reads from BAM file") 340 | else: 341 | logger.info("Extracting read ids from {}".format(region)) 342 | 343 | read_ids = {} 344 | with pysam.AlignmentFile(bam_in, "rb", threads=threads) as infile: 345 | for read in infile.fetch(region=region): 346 | if read.is_unmapped or (primary_only and (read.is_secondary or read.is_supplementary)): 347 | continue 348 | read_ids[read.query_name] = None 349 | 350 | n = len(read_ids) 351 | logger.info("Reads found in BAM file: {}".format(n)) 352 | if n == 0: 353 | return 354 | 355 | # Print header 356 | print("read_id", "filename", sep='\t') 357 | 358 | n_print = 0 359 | for summary_file in summary_files: 360 | logging.info("Opening: {}".format(summary_file)) 361 | with gzip.open(summary_file) as fh: 362 | header = fh.readline().decode().strip() 363 | header_cols = header.split(sep) 364 | readid_idx = header_cols.index(readid_col) 365 | path_idx = header_cols.index(fast5_col) 366 | 367 | for line in fh: 368 | line = line.decode().strip() 369 | if not line: 370 | continue 371 | cols = line.split(sep) 372 | readid = cols[readid_idx] 373 | f5_path = cols[path_idx] 374 | if readid not in read_ids: 375 | continue 376 | 377 | if read_ids[readid]: 378 | logging.error("Two entries found for {} ({} and {})".format(readid, read_ids[readid], f5_path)) 379 | continue 380 | 381 | n_print += 1 382 | read_ids[readid] = f5_path 383 | print(readid, read_ids[readid], sep='\t') 384 | logging.info("Filename found for {} reads ({}%)".format(n_print, round(n_print * 100.0 / n))) 385 | 386 | def filter_multi_reads(): 387 | logging.basicConfig( 388 | format='[%(asctime)s - %(name)s] %(message)s', 389 | datefmt='%H:%M:%S', level=logging.INFO 390 | ) 391 | logger = logging.getLogger('Filter') 392 | parser = argparse.ArgumentParser( 393 | description='Extract reads from multi-read .fast5 files.') 394 | parser.add_argument('input', 395 | help='Path to input multi-read .fast5 files (or list of files).') 396 | parser.add_argument('output', 397 | help='Output folder.') 398 | parser.add_argument('filter', 399 | help='A .tsv file with column `read_id` defining required reads. 
' 400 | 'If a `filename` column is present, this will be used as the ' 401 | 'location of the read.') 402 | parser.add_argument('--tsv_field', default='read_id', 403 | help='Field name from `filter` file to obtain read IDs.') 404 | parser.add_argument('--prefix', default="", 405 | help='Read file prefix.') 406 | parser.add_argument('--recursive', action='store_true', 407 | help='Search recursively under `input` for source files.') 408 | parser.add_argument('--workers', type=int, default=4, 409 | help='Number of worker processes.') 410 | 411 | out_format = parser.add_mutually_exclusive_group() 412 | out_format.add_argument('--multi', action='store_true', default=True, 413 | help='Output multi-read files.') 414 | out_format.add_argument('--single', action='store_false', dest='multi', 415 | help='Output single-read files.') 416 | 417 | #parser.add_argument('--limit', type=int, default=None, help='Limit reads per channel.') 418 | args = parser.parse_args() 419 | 420 | if not args.multi: 421 | raise NotImplementedError('Extraction of reads to single read files is on the TODO list.') 422 | 423 | if not os.path.exists(args.output): 424 | os.makedirs(args.output) 425 | else: 426 | raise IOError('The output directory must not exist.') 427 | 428 | # grab list of source files 429 | logger.info("Searching for input files.") 430 | try: 431 | src_files = list(set(readtsv(args.input)['filename'])) 432 | except Exception as e: 433 | logger.info('Failed to read `input` as filelist, assuming path to search. {}'.format(e)) 434 | src_files = list(iterate_fast5(args.input, paths=True, recursive=args.recursive)) 435 | n_files = len(src_files) 436 | logger.info("Found {} source files.".format(n_files)) 437 | 438 | logger.info("Reading filter file.") 439 | read_table = readtsv(args.filter, fields=[args.tsv_field]) 440 | logger.info("Found {} reads in filter.".format(len(read_table))) 441 | 442 | try: 443 | # try to build index from the filter file with 'filename' column 444 | if 'filename' not in read_table.dtype.names: 445 | raise ValueError("'filename' column not present in filter.") 446 | logger.info("Attempting to build read index from input filter.") 447 | src_path_files = { 448 | os.path.basename(x):x for x in src_files 449 | } 450 | if len(src_path_files) != len(src_files): 451 | raise ValueError('Found non-uniquely named source files') 452 | read_index = dict() 453 | for fname, indices in group_vector(read_table['filename']).items(): 454 | fpath = src_path_files[os.path.basename(fname)] 455 | read_index[fpath] = read_table[args.tsv_field][indices] 456 | logger.info("Successfully built read index from input filter.") 457 | except Exception as e: 458 | logger.info("Failed to build read index from summary: {}".format(e)) 459 | read_index = None 460 | required_reads = set(read_table[args.tsv_field]) 461 | logger.info("Finding reads within {} source files.".format(n_files)) 462 | index_worker = functools.partial(reads_in_multi, filt=required_reads) 463 | read_index = dict() 464 | n_reads = 0 465 | with ProcessPoolExecutor(args.workers) as executor: 466 | i = 0 467 | for src_file, read_ids in zip(src_files, executor.map(index_worker, src_files, chunksize=10)): 468 | i += 1 469 | n_reads += len(read_ids) 470 | read_index[src_file] = read_ids 471 | if i % 10 == 0: 472 | logger.info("Indexed {}/{} files.
{}/{} reads".format(i, n_files, n_reads, len(required_reads))) 473 | 474 | n_reads = sum(len(x) for x in read_index.values()) 475 | # We don't go via creating Read objects, copying the data verbatim 476 | # likely quicker and nothing should need the verification that the APIs 477 | # provide (garbage in, garbage out). 478 | logger.info("Extracting {} reads.".format(n_reads)) 479 | if args.prefix != '': 480 | args.prefix = '{}_'.format(args.prefix) 481 | 482 | with ProcessPoolExecutor(args.workers) as executor: 483 | reads_per_process = np.ceil(n_reads / args.workers) 484 | proc_n_reads = 0 485 | proc_reads = dict() 486 | job = 0 487 | futures = list() 488 | for src in read_index.keys(): 489 | proc_reads[src] = read_index[src] 490 | proc_n_reads += len(proc_reads[src]) 491 | if proc_n_reads > reads_per_process: 492 | proc_prefix = "{}{}_".format(args.prefix, job) 493 | futures.append(executor.submit(_subset_reads_to_file, proc_reads, args.output, proc_prefix, worker_id=job)) 494 | job += 1 495 | proc_n_reads = 0 496 | proc_reads = dict() 497 | if proc_n_reads > 0: # processing remaining reads 498 | proc_prefix = "{}{}_".format(args.prefix, job) 499 | futures.append(executor.submit(_subset_reads_to_file, proc_reads, args.output, proc_prefix, worker_id=job)) 500 | 501 | 502 | for fut in as_completed(futures): 503 | try: 504 | reads_written, prefix = fut.result() 505 | logger.info("Written {} reads to {}.".format(reads_written, prefix)) 506 | except Exception as e: 507 | logger.warning("Error: {}".format(e)) 508 | logger.info("Done.") 509 | 510 | 511 | def _subset_reads_to_file(read_index, output, prefix, worker_id=0): 512 | logger = logging.getLogger('Worker-{}'.format(worker_id)) 513 | n_reads = sum(len(x) for x in read_index.values()) 514 | reads_written = 0 515 | t0 = now() 516 | with MultiWriter(output, None, prefix=prefix) as writer: 517 | for src_file, read_ids in read_index.items(): 518 | reads_written += len(read_ids) 519 | t1 = now() 520 | if t1 - t0 > 30: # log update every 30 seconds 521 | logger.info("Written {}/{} reads ({:.0f}% done)".format( 522 | reads_written, n_reads, 100 * reads_written / n_reads 523 | )) 524 | t0 = t1 525 | with h5py.File(src_file, 'r') as src_fh: 526 | for read_id in read_ids: 527 | try: 528 | read_grp = src_fh["read_{}".format(read_id)] 529 | except: 530 | logger.warning("Did not find {} in {}.".format(read_id, src_fh.filename)) 531 | else: 532 | writer.write_read(read_grp) 533 | return reads_written, prefix 534 | 535 | 536 | def reads_in_multi(src, filt=None): 537 | """Get list of read IDs contained within a multi-read file. 538 | 539 | :param src: source file. 540 | :param filt: perform filtering by given set. 541 | :returns: set of read UUIDs (as string and recorded in hdf group name). 
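
    Example (an illustrative sketch; the filename is hypothetical and the
    read id is abbreviated)::

        all_ids = reads_in_multi('batch_0.fast5')
        wanted = reads_in_multi('batch_0.fast5', filt={'0a1b2c3d-...'})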
542 | """ 543 | logger = logging.getLogger(os.path.splitext(os.path.basename(src))[0]) 544 | logger.debug("Finding reads.") 545 | prefix = 'read_' 546 | with h5py.File(src, 'r') as fh: 547 | read_ids = set(grp[len(prefix):] for grp in fh if grp.startswith(prefix)) 548 | logger.debug("Found {} reads.".format(len(read_ids))) 549 | if filt is not None: 550 | read_ids = read_ids.intersection(filt) 551 | logger.debug("Filtered to {} reads.".format(len(read_ids))) 552 | return read_ids 553 | 554 | 555 | class Read(object): 556 | # Just a sketch to help interchange of format 557 | def __init__(self, read_id, read_number, tracking_id, channel_id, context_tags, raw): 558 | self.read_id = read_id 559 | self.read_number = read_number 560 | self.tracking_id = tracking_id 561 | self.channel_id = channel_id 562 | self.context_tags = context_tags 563 | self.raw = raw 564 | 565 | # ensure typing and required fields 566 | self.channel_id = Fast5.convert_channel_id(self.channel_id) 567 | self.tracking_id = Fast5.convert_tracking_id(self.tracking_id) 568 | 569 | 570 | class ReadWriter(object): 571 | def __init__(self, out_path, by_id, prefix=""): 572 | self.out_path = out_path 573 | self.by_id = by_id 574 | if prefix != "": 575 | prefix = "{}_".format(prefix) 576 | self.prefix = prefix 577 | 578 | def write_read(self): 579 | raise NotImplementedError() 580 | 581 | def __enter__(self): 582 | return self 583 | 584 | def __exit__(self, exception_type, exception_value, traceback): 585 | pass 586 | 587 | 588 | class SingleWriter(ReadWriter): 589 | def write_read(self, read): 590 | if self.by_id: 591 | filename = '{}.fast5'.format(read.read_id['read_id']) 592 | else: 593 | filename = '{}read_ch{}_file{}.fast5'.format( 594 | self.prefix, read.channel_id['channel_number'], read.read_number 595 | ) 596 | filename = os.path.join(self.out_path, filename) 597 | with Fast5.New(filename, 'a', tracking_id=read.tracking_id, context_tags=read.context_tags, channel_id=read.channel_id) as h: 598 | h.set_raw(read.raw, meta=read.read_id, read_number=read.read_number) 599 | 600 | 601 | MULTI_READ_FILE_VERSION = "2.0" 602 | 603 | class MultiWriter(ReadWriter): 604 | def __init__(self, out_path, by_id, prefix="", reads_per_file=4000): 605 | super(MultiWriter, self).__init__(out_path, by_id, prefix=prefix) 606 | self.reads_per_file = reads_per_file 607 | self.current_reads = 0 # reads in open file, used to signal new file condition 608 | self.file_counter = 0 609 | self.current_file = None 610 | self.closed = False 611 | 612 | 613 | def __exit__(self, exception_type, exception_value, traceback): 614 | self.close() 615 | 616 | 617 | def close(self): 618 | if isinstance(self.current_file, h5py.File): 619 | self.current_file.close() 620 | 621 | 622 | def write_read(self, read): 623 | """Write a read. 624 | 625 | :param read: either a `Read` object or an hdf group handle from a 626 | source multi-read file. 
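
        Example (a minimal sketch; the source filename and output directory
        are hypothetical, and the output directory must already exist)::

            with MultiWriter('out_dir', None, prefix='subset') as writer:
                with h5py.File('batch_0.fast5', 'r') as src:
                    for name, grp in src.items():
                        if name.startswith('read_'):
                            writer.write_read(grp)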
627 | """ 628 | if self.closed: 629 | raise RuntimeError('Cannot write after closed.') 630 | 631 | if self.current_reads == 0: 632 | # start a new file 633 | self.close() 634 | filename = '{}mreads_file{}.fast5'.format( 635 | self.prefix, self.file_counter 636 | ) 637 | filename = os.path.join(self.out_path, filename) 638 | self.current_file = h5py.File(filename, 'w') 639 | self.current_file.attrs[_sanitize_data_for_writing('file_version')] = _sanitize_data_for_writing("2.0") 640 | self.file_counter += 1 641 | 642 | # write data 643 | if isinstance(read, Read): 644 | self._write_read(read) 645 | elif isinstance(read, h5py.Group): 646 | self._copy_read_group(read) 647 | else: 648 | raise TypeError("Cannot write type {} to output file.") 649 | self.current_reads += 1 650 | 651 | # update 652 | if self.current_reads == self.reads_per_file: 653 | self.current_reads = 0 654 | 655 | 656 | def _write_read(self, read): 657 | if read.raw.dtype != np.int16: 658 | raise TypeError('Raw data must be of type int16.') 659 | 660 | read_group = '/read_{}'.format(read.read_id['read_id']) 661 | Fast5._add_attrs_to_fh(self.current_file, {'run_id': read.tracking_id['run_id']}, read_group, convert=str) 662 | 663 | # add all attributes 664 | for grp_name in ('tracking_id', 'context_tags'): 665 | # spec has all of these as str 666 | data = getattr(read, grp_name) 667 | Fast5._add_attrs_to_fh(self.current_file, data, '{}/{}'.format(read_group, grp_name), convert=str) 668 | Fast5._add_attrs_to_fh(self.current_file, read.channel_id, '{}/channel_id'.format(read_group)) 669 | 670 | # add the data (and some more attrs) 671 | data_path = '{}/Raw'.format(read_group) 672 | read_id = Fast5._convert_meta_times(read.read_id, read.channel_id['sampling_rate']) 673 | read_id = Fast5.convert_raw_meta(read_id) 674 | Fast5._add_attrs_to_fh(self.current_file, read_id, data_path) 675 | signal_path = '{}/Signal'.format(data_path) 676 | self.current_file.create_dataset( 677 | signal_path, data=read.raw, compression='gzip', shuffle=True, dtype='i2') 678 | 679 | 680 | def _copy_read_group(self, read): 681 | self.current_file.copy(read, read.name) 682 | -------------------------------------------------------------------------------- /fast5_research/fast5_bulk.py: -------------------------------------------------------------------------------- 1 | import ast 2 | from collections import defaultdict 3 | from fast5_research.util import dtype_descr 4 | import itertools 5 | import re 6 | from sys import version_info 7 | from xml.dom import minidom 8 | import warnings 9 | 10 | with warnings.catch_warnings(): 11 | warnings.simplefilter("ignore", category=FutureWarning) 12 | import h5py 13 | 14 | import numpy as np 15 | from numpy.lib.recfunctions import append_fields 16 | 17 | 18 | from fast5_research.util import get_changes, _clean_attrs, _sanitize_data_for_writing, _sanitize_data_for_reading 19 | 20 | if version_info[0] < 3: 21 | from StringIO import StringIO 22 | else: 23 | from io import StringIO 24 | 25 | 26 | class BulkFast5(h5py.File): 27 | """Class for reading data from a bulk fast5 file""" 28 | 29 | __tracking_path__ = '/UniqueGlobalKey/tracking_id' 30 | __pore_model_old__ = 'Meta/User/pore_model' 31 | __pore_model_new__ = 'Meta/User/analysis_conf' 32 | __context_path__ = '/UniqueGlobalKey/context_tags/' 33 | __intermediate_data__ = '/IntermediateData/' 34 | __voltage_meta__ = '/Device/VoltageMeta' 35 | __voltage_data__ = '/Device/MetaData' 36 | __channel_meta__ = '/IntermediateData/Channel_{}/Meta' 37 | __multiplex_data__ = 
'/MultiplexData/Channel_{}/Multiplex' 38 | 39 | __raw_data__ = "Raw/Channel_{}/Signal" 40 | __raw_meta__ = "Raw/Channel_{}/Meta" 41 | __event_data__ = "/IntermediateData/Channel_{}/Events" 42 | __read_data__ = "/IntermediateData/Channel_{}/Reads" 43 | __state_data__ = "/StateData/Channel_{}/States" 44 | 45 | # The below refers to MinION Mk1 ASIC, may change in future 46 | __mk1_asic_mux_states__ = { 47 | 'common_voltage_1': 1, 48 | 'common_voltage_2': 2, 49 | 'common_voltage_3': 3, 50 | 'common_voltage_4': 4, 51 | 'gnd': 15, 52 | 'gnd_through_resistor': 14, 53 | 'open_pore': 0, 54 | 'test_current_1': 10, 55 | 'test_current_2': 11, 56 | 'test_current_3': 12, 57 | 'test_current_4': 13, 58 | 'test_current_open_pore': 5, 59 | 'unblock_voltage_1': 6, 60 | 'unblock_voltage_2': 7, 61 | 'unblock_voltage_3': 8, 62 | 'unblock_voltage_4': 9 63 | } 64 | 65 | def __init__(self, filename, mode='r'): 66 | """Create an BulkFast5 instance. 67 | 68 | :param filename: path to a bulk fast5 file. 69 | :param mode: h5py opening mode. 70 | """ 71 | 72 | super(BulkFast5, self).__init__(filename, mode) 73 | if mode == 'r': 74 | data = self[self.__intermediate_data__] 75 | self.channels = sorted([int(name.strip('Channel_')) for name in data.keys()]) 76 | self.parsed_exp_history = None # we parse the history lazily 77 | 78 | # Parse experimental metadata 79 | self.exp_metadata = dict() 80 | for path in (self.__tracking_path__, self.__context_path__): 81 | try: 82 | self.exp_metadata.update(_clean_attrs(self[path].attrs)) 83 | except KeyError: 84 | raise KeyError('Cannot read summary from {}'.format(path)) 85 | 86 | # This should be safe 87 | try: 88 | self.sample_rate = float(self['Meta'].attrs['sample_rate']) 89 | except: 90 | self.sample_rate = float(self.get_metadata(self.channels[0])['sample_rate']) 91 | 92 | 93 | def get_metadata(self, channel): 94 | """Get the metadata for the specified channel. 
95 | 96 | Look for first for events metadata, and fall-back on raw metadata, returning an empty dict if neither could be found.""" 97 | if hasattr(self, '_cached_metadata'): 98 | if channel in self._cached_metadata: 99 | return self._cached_metadata[channel] 100 | else: 101 | self._cached_metadata = {} 102 | 103 | if self.__channel_meta__.format(channel) in self: 104 | meta = _clean_attrs(self[self.__channel_meta__.format(channel)].attrs) 105 | elif self.has_raw(channel): # use raw meta data 106 | meta = _clean_attrs(self[self.__raw_meta__.format(channel)].attrs) 107 | else: 108 | meta = {} 109 | 110 | self._cached_metadata[channel] = meta 111 | return meta 112 | 113 | 114 | def get_event_detection_parameters(self): 115 | """Get the full set of parameters related to event detection """ 116 | if self.__pore_model_old__ in self: # Old Minknow file 117 | xmldoc = minidom.parseString("".join(self[self.__pore_model_old__].value)) 118 | return dict(xmldoc.getElementsByTagName('event_detection')[0].attributes.items()) 119 | elif self.__pore_model_new__ in self: # New Minknow file 120 | result = "".join(self[self.__pore_model_new__].value) 121 | result = result.replace('true', 'True').replace('false', 'False') 122 | return ast.literal_eval(result)['event_detection'] 123 | 124 | 125 | def get_tracking_meta(self): 126 | """Get tracking meta data""" 127 | return _clean_attrs(self[self.__tracking_path__].attrs) 128 | 129 | 130 | def get_context_meta(self): 131 | """Get context meta""" 132 | return _clean_attrs(self[self.__context_path__].attrs) 133 | 134 | 135 | def has_raw(self, channel): 136 | """Return True if there is raw data for this channel.""" 137 | raw_location = self.__raw_data__.format(channel) 138 | return self._has_data(raw_location) 139 | 140 | 141 | def has_reads(self, channel): 142 | """Return True if there is read data for this channel.""" 143 | read_location = self.__read_data__.format(channel) 144 | return self._has_data(read_location) 145 | 146 | 147 | def has_states(self, channel): 148 | """Return True if there is State data for this channel.""" 149 | state_location = self.__state_data__.format(channel) 150 | return self._has_data(state_location) 151 | 152 | 153 | def _has_data(self, location): 154 | """Return true if the given data path exists 155 | 156 | :param location: str, path with fast5. 157 | """ 158 | if hasattr(self, '_cached_paths'): 159 | if location in self._cached_paths: 160 | return self._cached_paths[location] 161 | else: 162 | self._cached_paths = {} 163 | 164 | location_split = location.split('/') 165 | folder = '/'.join(location_split[:-1]) 166 | name = location_split[-1] 167 | present = folder in self and name in self[folder].keys() 168 | self._cached_paths[location] = present 169 | return present 170 | 171 | 172 | def _time_interval_to_index(self, channel, times): 173 | """Translate a tuple of (start_sec, end_sec) to an index.""" 174 | start_sec, end_sec = times 175 | start = self._seconds_to_index(channel, start_sec) 176 | end = self._seconds_to_index(channel, end_sec) 177 | return (start, end) 178 | 179 | 180 | def _seconds_to_index(self, channel, time): 181 | """Translate a point in time to an index.""" 182 | if time is None: 183 | return None 184 | 185 | return int(time * float(self.sample_rate)) 186 | 187 | 188 | def _scale(self, channel, data): 189 | """Scale event data if necessary, else return unchanged. 
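
        The conversion applied when scaling is required is
        ``pA = (raw + offset) * range / digitisation``; for example, with
        offset=10, range=2048 and digitisation=8192 (illustrative values),
        a stored mean of 390 becomes (390 + 10) * 0.25 = 100 pA.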
190 | 191 | If event metadata can't be found, assume events don't need scaling.""" 192 | 193 | meta_data = self.get_metadata(channel) 194 | 195 | if 'scaling_used' not in meta_data or meta_data.get('scaling_used'): 196 | return data 197 | else: 198 | channel_scale = meta_data['range'] / meta_data['digitisation'] 199 | channel_offset = meta_data['offset'] 200 | data['mean'] = (data['mean'] + channel_offset) * channel_scale 201 | return data 202 | 203 | 204 | def get_raw(self, channel, times=None, raw_indices=(None, None), use_scaling=True): 205 | """If available, parse channel raw data. 206 | 207 | :param channel: channel number int 208 | :param times: tuple of floats (start_second, end_second) 209 | :param raw_indices: tuple of ints (start_index, end_index) 210 | :param use_scaling: if True, scale the current level 211 | 212 | .. note:: 213 | Exactly one of the slice keyword arguments needs to be specified, 214 | as the method will override them in the order of times 215 | > raw_indices. 216 | """ 217 | 218 | if not self.has_raw(channel): 219 | raise KeyError('Channel {} does not contain raw data.'.format(channel)) 220 | 221 | if times is not None: 222 | raw_indices = self._time_interval_to_index(channel, times) 223 | 224 | raw_data = self.__raw_data__.format(channel) 225 | data = self[raw_data][raw_indices[0]:raw_indices[1]] 226 | 227 | if use_scaling: 228 | meta_data = self.get_metadata(channel) 229 | raw_unit = meta_data['range'] / meta_data['digitisation'] 230 | data = (data + meta_data['offset']) * raw_unit 231 | 232 | return data 233 | 234 | 235 | def get_events(self, channel, times=None, raw_indices=None, event_indices=(None, None), 236 | use_scaling=True): 237 | """Parse channel event data. 238 | 239 | :param channel: channel number int 240 | :param times: tuple of floats (start_second, end_second) 241 | :param raw_indices: tuple of ints (start_index, end_index) 242 | :param event_indices: tuple of ints (start_index, end_index) 243 | :param use_scaling: if True, scale the current level 244 | 245 | .. note:: 246 | Exactly one of the slice keyword arguments needs to be specified, 247 | as the method will override them in the order of times 248 | > raw_indices > event_indices. 249 | """ 250 | 251 | event_data = self.__event_data__.format(channel) 252 | ev = self[event_data] 253 | 254 | if times is not None: 255 | raw_indices = self._time_interval_to_index(channel, times) 256 | if raw_indices is not None: 257 | event_indices = np.searchsorted(ev['start'], raw_indices) 258 | data = _sanitize_data_for_reading(ev[event_indices[0]:event_indices[1]]) 259 | 260 | # Change variance to stdv column 261 | data['variance'] = np.sqrt(data['variance']) 262 | data.dtype.names = ['stdv' if n == 'variance' else n for n in data.dtype.names] 263 | 264 | if use_scaling: 265 | return self._scale(channel, data) 266 | else: 267 | return data 268 | 269 | 270 | def _get_reads_data(self, channel): 271 | """Parse channel read data exactly as it is in the bulk fast5 file. 272 | 273 | :param channel: channel number int 274 | 275 | .. note:: 276 | No processing is done - reads might span several rows. 277 | """ 278 | if not self.has_reads(channel): 279 | raise KeyError('Channel {} does not contain read data.'.format(channel)) 280 | 281 | return self[self.__read_data__.format(channel)] 282 | 283 | 284 | def get_reads(self, channel, transitions=False, multi_row_class='auto'): 285 | """Parse channel read data to yield details of reads. 
286 | 
287 |         :param channel: channel number int
288 |         :param transitions: if True, include transition reads
289 |         :param multi_row_class: options: 'auto', 'modal', 'penultimate', 'final'.
290 |             For reads which span multiple rows, use the classification from:
291 |             'auto': modal class if present, penultimate row if not
292 |             'modal': modal class if present
293 |             'penultimate': penultimate row
294 |             'final': final row.
295 |             Modal classification is not supported by very old versions of MinKNOW.
296 |         """
297 | 
298 |         multi_row_choices = {'auto', 'modal', 'penultimate', 'final'}
299 |         if multi_row_class not in multi_row_choices:
300 |             raise ValueError('''{} is not one of the permitted choices for
301 |                 multi_row_class. Permitted choices: {}.'''.format(multi_row_class, multi_row_choices))
302 | 
303 |         read_data = self._get_reads_data(channel)
304 | 
305 |         return_keys = {
306 |             'read_start', 'read_length',
307 |             'event_index_start', 'event_index_end', 'classification', 'read_id',
308 |             'median', 'median_sd', 'median_dwell', 'range', 'drift'
309 |         }
310 |         additional_keys = {'flags'}
311 |         computed_keys = {'drift'}
312 |         required_keys = return_keys.union(additional_keys).difference(computed_keys)
313 |         for key in required_keys:
314 |             if key not in read_data.dtype.names:
315 |                 raise KeyError('The read data did not contain the required key {}.'.format(key))
316 | 
317 |         if multi_row_class == 'modal':
318 |             if 'modal_classification' not in read_data.dtype.names:
319 |                 raise KeyError("The read data did not contain the key 'modal_classification'.")
320 | 
321 |         # classification is enumerated
322 |         enum_map = h5py.check_dtype(enum=read_data.dtype['classification'])
323 |         classes = _clean_attrs({v:k for k, v in enum_map.items()})
324 |         # read dataset into memory, lest we return h5py objects
325 |         read_data = read_data[()]
326 | 
327 |         # We need to combine 'event_index_start' and 'read_start' from the first
328 |         # row of the read with the sum of 'read_length' over all rows, and all
329 |         # other columns from the final row; the classification is chosen according
330 |         # to multi_row_class. We also calculate drift, the absolute difference
331 |         # between the 'local_median' of the first and last rows of a read.
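        # A rough illustration ('auto' mode, hypothetical values, no
        # 'modal_classification' column, classifications shown already decoded):
        #
        #   row  read_start  read_length  local_median  classification  flags & 0x1
        #   0    1000        50           80.1          'unknown'       1
        #   1    1050        30           81.0          'unknown'       1
        #   2    1080        20           79.5          'strand'        0  (read ends)
        #
        # yields a single read with read_start=1000, read_length=100,
        # classification 'unknown' (taken from the penultimate row) and
        # drift = |79.5 - 80.1| = 0.6.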
332 |         accum_stats = None
333 |         accum_names = ('event_index_start', 'read_start', 'read_length', 'classification')
334 |         for n, row in enumerate(read_data):
335 |             if accum_stats is None:
336 |                 accum_stats = {k:row[k] for k in accum_names}
337 |                 accum_stats['drift'] = 0
338 |                 first_local_median = row['local_median']
339 |             else:
340 |                 accum_stats['read_length'] += row['read_length']
341 | 
342 |             if multi_row_class == 'auto':  # use modal classification if column present, use penultimate if not
343 |                 if 'modal_classification' in read_data.dtype.names:
344 |                     accum_stats['classification'] = row['modal_classification']
345 |                 else:
346 |                     accum_stats['classification'] = read_data[n - 1]['classification']
347 | 
348 |             if multi_row_class == 'modal':  # use modal classification if column present
349 |                 accum_stats['classification'] = row['modal_classification']
350 | 
351 |             if multi_row_class == 'penultimate':  # use classification from previous row
352 |                 accum_stats['classification'] = read_data[n - 1]['classification']
353 | 
354 |             if multi_row_class == 'final':  # use classification from current row
355 |                 accum_stats['classification'] = row['classification']
356 | 
357 |             accum_stats['drift'] = abs(row['local_median'] - first_local_median)
358 | 
359 |             # pick out only the columns we want
360 |             row_details = {k:row[k] for k in return_keys - computed_keys}
361 | 
362 |             if row['flags'] & 0x1 == 0:
363 |                 # read has ended
364 |                 if classes[row['classification']] == 'transition' and not transitions:
365 |                     accum_stats = None  # prepare for next read
366 |                 else:
367 |                     for k in accum_stats:  # replace / add computed keys
368 |                         row_details[k] = accum_stats[k]
369 |                     row_details['classification'] = classes[row_details['classification']]
370 |                     yield _clean_attrs(row_details)
371 |                     accum_stats = None
372 | 
373 | 
374 |     def get_state_changes(self, channel):
375 |         """Parse channel state changes.
376 | 
377 |         :param channel: channel number int
378 |         """
379 |         if not self.has_states(channel):
380 |             raise KeyError('Channel {} does not contain state data.'.format(channel))
381 | 
382 |         if hasattr(self, '_cached_state_changes'):
383 |             if channel in self._cached_state_changes:
384 |                 return self._cached_state_changes[channel]
385 |         else:
386 |             self._cached_state_changes = {}
387 | 
388 |         # state data is enumerated
389 |         col = 'summary_state'
390 |         data = self[self.__state_data__.format(channel)]
391 |         enum_map = h5py.check_dtype(enum=data.dtype[col])
392 |         enum_to_state = _clean_attrs({v:k for k, v in enum_map.items()})
393 | 
394 |         # translate ints into strings
395 |         states = np.array([enum_to_state[key] for key in data[col]])
396 | 
397 |         try:
398 |             data = np.array(data['approx_raw_index'])
399 |         except ValueError:  # not a KeyError: see h5py/_hl/dataset.pyc in readtime_dtype(basetype, names)
400 |             data = np.array(data['acquisition_raw_index'])
401 |         if len(data) > 0:
402 |             data = data.astype([('approx_raw_index', data.dtype)], copy=False)
403 |             data = append_fields(data,
404 |                 ['approx_raw_index_end','summary_state'],
405 |                 [np.roll(data['approx_raw_index'], -1), states], usemask=False)
406 |             # set end of last state to something approximating infinity (the largest u64 int).
407 |             data['approx_raw_index_end'][-1] = -1
408 |         else:  # some channels don't contain channel state data, just create a dummy array
409 |             data = np.array([], dtype=[('approx_raw_index', '<u8'), ('approx_raw_index_end', '<u8'), ('summary_state', 'O')])
410 | 
411 |         self._cached_state_changes[channel] = data
412 |         return data
413 | 
414 | 
415 |     def get_state(self, channel, raw_index=None, time=None):
416 |         """Find the channel state at a given time or raw index.
417 | 
418 |         :param channel: channel number int
419 |         :param raw_index: sample index
420 |         :param time: time in seconds
421 |         :returns: the summary state name.
422 | 
423 |         .. note::
424 |             Exactly one of the slice keyword arguments needs to be specified,
425 |             as the method will override them in the order of times
426 |             > raw_indices.
427 | """ 428 | assert (time is not None) or (raw_index is not None), 'Need either a time or a raw_index argument' 429 | if time is not None: 430 | raw_index = self._seconds_to_index(channel, time) 431 | 432 | data = self.get_state_changes(channel) 433 | 434 | # Check if the requested index is before the first state entry 435 | if raw_index < data['approx_raw_index'][0]: 436 | msg = 'No state data at index {}, which is before first state at {}' 437 | raise RuntimeError(msg.format(raw_index, data['approx_raw_index'][0])) 438 | 439 | # Now get last record before requested sample, handling the special case 440 | # where there is no last record (i.e. if raw_index == 0) 441 | if raw_index == 0: 442 | i = 0 443 | else: 444 | i = np.searchsorted(data['approx_raw_index'], raw_index) - 1 445 | 446 | state = data['summary_state'][i] 447 | 448 | return state 449 | 450 | 451 | def get_states_in_window(self, channel, times=None, raw_indices=None): 452 | """Find all channel states within a time window. 453 | 454 | :param channel: channel number int 455 | :param times: tuple of floats (start_second, end_second) 456 | :param raw_indices: tuple of ints (start_index, end_index) 457 | 458 | .. note:: 459 | Exactly one of the slice keyword arguments needs to be specified, 460 | as the method will override them in the order of times 461 | > raw_indices. 462 | """ 463 | 464 | assert (times is not None) or (raw_indices is not None), 'Need either a time or a raw_index argument' 465 | if times is not None: 466 | raw_indices = self._seconds_to_index(channel, times[0]), self._seconds_to_index(channel, times[1]) 467 | states = self.get_state_changes(channel) 468 | first_state, last_state = np.searchsorted(states['approx_raw_index'], raw_indices, side='right') 469 | return np.unique(states['summary_state'][first_state-1:last_state]) 470 | 471 | 472 | def get_mux(self, channel, raw_index=None, time=None, wells_only=False, return_raw_index=False): 473 | """Find the multiplex well_id ("the mux") at a given time 474 | 475 | :param channel: channel number int 476 | :param raw_index: sample index 477 | :param time: time in seconds 478 | :wells_only: bool, if True, ignore changes to mux states not in [1,2,3,4] 479 | and hence return the last well mux. 480 | :return_raw_index: bool, if True, return tuple (mux, raw_index), raw_index being 481 | raw index when the mux was set. 482 | 483 | .. note:: 484 | There are multiple mux states associated with each well (e.g. common_voltage_1 and unblock_volage_1). 485 | Here, we return the well_id associated with the mux state (using self.enum_to_mux), i.e. 1 in both these cases. 486 | 487 | Exactly one of the slice keyword arguments needs to be specified, 488 | as the method will override them in the order of times 489 | > raw_indices. 490 | """ 491 | assert (time is not None) or (raw_index is not None), 'Need either a time or a raw_index argument' 492 | if time is not None: 493 | raw_index = self._seconds_to_index(channel, time) 494 | 495 | data = self.get_mux_changes(channel, wells_only=wells_only) 496 | 497 | # Check if the requested index is before the first mux entry 498 | if raw_index < data['approx_raw_index'][0]: 499 | msg = 'No mux data at index {}, which is before first mux at {}' 500 | raise RuntimeError(msg.format(raw_index, data['approx_raw_index'][0])) 501 | 502 | # Now get last record before requested sample, handling the special case 503 | # where there is no last record (i.e. 
if raw_index == 0) 504 | if raw_index == 0: 505 | i = 0 506 | else: 507 | i = np.searchsorted(data['approx_raw_index'], raw_index) - 1 508 | 509 | mux = self.enum_to_mux[data[i]['well_id']] 510 | 511 | if return_raw_index: 512 | raw_index = data[i]['approx_raw_index'] # when the mux was set 513 | return mux, raw_index 514 | else: 515 | return mux 516 | 517 | 518 | @staticmethod 519 | def _strip_metadata(data): 520 | """Strip dtype.metadata dicts from enumerated arrays. 521 | 522 | :param data: structured np.array 523 | :returns: view of the same data with the metadata removed. 524 | 525 | .. note:: 526 | since h5py v 2.3, enumerated dtypes come with a dtype.metadata dict 527 | see https://github.com/numpy/numpy/issues/6771 and 528 | https://github.com/h5py/h5py/pull/355/commits/5da2e96942218ffb1c9b614be9be8409bea219f8 529 | This can stop functions like recfunctions.append_fields working on 530 | these arrays, so strip out this dict. as it's not writeable, just 531 | create a view with the appropriate data type 532 | """ 533 | d = [] 534 | for col, str_type in dtype_descr(data): 535 | if not isinstance(str_type, str) and isinstance(str_type[1], dict) and 'enum' in str_type[1]: 536 | str_type = str_type[0] 537 | d.append((col, str_type)) 538 | return data.view(np.dtype(d)) 539 | 540 | 541 | def get_mux_changes(self, channel, wells_only=False): 542 | """Get changes in multiplex settings for given channel. 543 | 544 | :param channel: channel for which to fetch data 545 | :wells_only: bool, if True, ignore changes to mux states not in [1,2,3,4] 546 | 547 | .. note:: 548 | There are multiple mux states associated with each well (e.g. 1:common_voltage_1 and 6:unblock_voltage_1). 549 | Here, we return mux state numbers, e.g. 1 and 6, which can be linked to the well_id using self.enum_to_mux 550 | """ 551 | if hasattr(self, '_cached_mux_changes'): 552 | if channel in self._cached_mux_changes[wells_only]: 553 | return self._cached_mux_changes[wells_only][channel] 554 | else: 555 | # cache mux changes separately for well_only True and False 556 | self._cached_mux_changes = {True: {}, False: {}} 557 | 558 | enum_col = 'well_id' 559 | multiplex_data = self.__multiplex_data__.format(channel) 560 | data = self[multiplex_data] 561 | enum = _clean_attrs(h5py.check_dtype(enum=data.dtype[enum_col])) 562 | assert enum == self.__mk1_asic_mux_states__, 'Got unexpected multiplex states' 563 | 564 | if not hasattr(self, "enum_to_mux"): 565 | # Build a dict which relates enum values to mux. 566 | self.enum_to_mux = {} 567 | for k, v in enum.items(): 568 | mux = 0 569 | mo = re.search(r'(\d)$', k) 570 | if mo is not None: 571 | mux = int(mo.group(0)) 572 | self.enum_to_mux[v] = mux 573 | data = data[()] # load into memory 574 | data = self._strip_metadata(data) # remove dtype.metadata dict present with h5py>=2.3.0 575 | 576 | # remove any rows where the mux state has not changed 577 | data = get_changes(data, ignore_cols=('approx_raw_index',)) 578 | 579 | if wells_only: # only consider changes to wells in [1,2,3,4] 580 | wells = [1, 2, 3, 4] 581 | mask = np.in1d(data['well_id'], wells) 582 | mask[0] = True # keep first mux, whatever it is 583 | data = data[mask] 584 | self._cached_mux_changes[wells_only][channel] = data 585 | return data 586 | 587 | 588 | def get_mux_changes_in_window(self, channel, times=None, raw_indices=None): 589 | """Find all mux changes within a time window. 
590 | 591 | :param channel: channel number int 592 | :param times: tuple of floats (start_second, end_second) 593 | :param raw_indices: tuple of ints (start_index, end_index) 594 | 595 | .. note:: 596 | There are multiple mux values associated with each well (e.g. 1:common_voltage_1 and 6:unblock_voltage_1). 597 | Here, we return mux values, e.g. 1 and 6, which can be linked to the well_id using self.enum_to_mux. 598 | 599 | Exactly one of the slice keyword arguments needs to be specified, 600 | as the method will override them in the order of times 601 | > raw_indices. 602 | """ 603 | 604 | assert (times is not None) or (raw_indices is not None), 'Need either a time or a raw_index argument' 605 | if times is not None: 606 | raw_indices = self._seconds_to_index(channel, times[0]), self._seconds_to_index(channel, times[1]) 607 | muxes = self.get_mux_changes(channel) 608 | first_mux, last_mux = np.searchsorted(muxes['approx_raw_index'], raw_indices, side='right') 609 | return muxes[first_mux-1:last_mux] 610 | 611 | 612 | def get_waveform_timings(self): 613 | """Extract the timings of the waveforms (if any). 614 | 615 | :returns: list of tuples of start and end times 616 | """ 617 | mux_timings = [] 618 | on_index = None 619 | for i in range(0, len(self["Device"]["AsicCommands"])): 620 | if self._waveform_enabled(i): 621 | on_index = self["Device"]["AsicCommands"][i]["frame_number"] 622 | elif on_index is not None: 623 | # when _waveform_enabled(i) returns to False, save on and off 624 | # timings 625 | off_index = self["Device"]["AsicCommands"][i]["frame_number"] 626 | on_time = on_index / self.sample_rate 627 | off_time = off_index / self.sample_rate 628 | mux_timings.append((on_time, off_time)) 629 | on_index = None 630 | return mux_timings 631 | 632 | 633 | def _waveform_enabled(self, cmd_index): 634 | """Checks AsicCommand history to see if the waveform command was issued. 635 | 636 | .. note:: 637 | Here is the relevant section of the engineering documentation. 638 | engineering documentation (July 2015 version) 639 | 640 | Settings from PC: 512 bytes 641 | 1. Equals 17 otherwise FPGA drops the parcel 642 | 2. Command for FPGA: 643 | =1 load configuration data in ASIC 644 | =2 begin reading data from ASIC 645 | =3 reset ASIC chip 646 | =5 load configuration and begin/continue reading - used for 647 | real-time re-loading ASIC configuration 648 | 3. 4bit: enable zero supply voltage for Fan ('1'- Fan can be switched 649 | off completely, '0'- Fan is always On) 650 | 3bit: temperature control On/Off ('1' - On, '0' - Off) 651 | 2-1 bits: Fan speed control ('00' - Off, '11' - On 652 | (only when temperature control is off)) 653 | 0 bit: soft temperature control ('1' - On, '0' - Off) 654 | 4. 0bit: On/Off ASIC analogue supply voltage ('0' - off, '1' - on) 655 | 5. ASIC clock: '000' - 64MHz, '001' - 128MHz, '010' - 32MHz, 656 | '100' - 16MHz, '110' - 8MHz 657 | 6. 3 bit: Enable ('1' - on, '0' - off) channel mapping (channel 658 | sequence 0,1...510,511) for 512 channels mode 659 | 2 bit: Enable ('1' - on, '0' - off) ASIC configuration update every 660 | 1ms with values for bias voltage from LUT 661 | 1-0 bits: Number of channels from ASIC: '00' - 128ch, 662 | '01'-256ch, '10' - 512ch 663 | """ 664 | 665 | waveform_flag = self["Device"]["AsicCommands"][cmd_index]["command"].tostring()[5] 666 | # if cmd is not a bytestring, convert waveform flag to an integer. 
Needed for python2.x compatibility 667 | if not isinstance(waveform_flag, int): 668 | waveform_flag = ord(waveform_flag) 669 | waveform_enabled = waveform_flag & 4 != 0 670 | return waveform_enabled 671 | 672 | 673 | def get_voltage(self, times=None, raw_indices=(None, None), use_scaling=True): 674 | """Extracts raw common electrode trace 675 | 676 | :raw_indices: tuple of ints to limit section of voltage data loaded. 677 | :use_scaling: bool, whether to scale voltage data. If no scaling meta is found, 678 | scale by -5 (as appropriate for MinION). 679 | :return: voltage as array (including 5x multiplyer for MinKnow) 680 | """ 681 | if times is not None: 682 | raw_indices = self._time_interval_to_index(self.channels[0], times) 683 | 684 | voltages = self[self.__voltage_data__ 685 | ][raw_indices[0]:raw_indices[1]]['bias_voltage'] 686 | if use_scaling: 687 | # fast5 converted from ABF files have a voltage meta section 688 | # containing scaling parameters 689 | if self.__voltage_meta__ in self: 690 | voltage_meta = _clean_attrs(self[self.__voltage_meta__].attrs) 691 | unit = voltage_meta['range'] / voltage_meta['digitisation'] 692 | offset = voltage_meta['offset'] 693 | else: 694 | # Assume MinION scaling of 5 695 | unit = -5 696 | offset = 0 697 | voltages = (voltages + offset) * unit 698 | 699 | return voltages 700 | 701 | 702 | def get_bias_voltage_changes(self): 703 | """Get changes in the bias voltage. 704 | 705 | .. note:: 706 | For a long (-long-long) time the only logging of the common 707 | electrode voltage was the experimental history (accurate to one 708 | second). The addition of the voltage trace changed this, but this 709 | dataset is cumbersome. MinKnow 1.x(.3?) added the asic command 710 | history which is typically much shorter and therefore quicker to 711 | query. The bias voltage is numerously record. For MinION asics 712 | there is typically a -5X multiplier to convert the data into 713 | correct units with the sign people are used to. 714 | """ 715 | if hasattr(self, '_cached_voltage_changes'): 716 | return self._cached_voltage_changes 717 | 718 | # First try the asic command, fallback to the experimental history, 719 | # and finally the voltage trace. 
720 | try: 721 | self._cached_voltage_changes = self._bias_from_asic_commands() 722 | except: 723 | try: 724 | self._cached_voltage_changes = self._bias_from_exp_hist() 725 | except: 726 | try: 727 | self._cached_voltage_changes = self._bias_from_voltages() 728 | except: 729 | raise RuntimeError('Cannot parse voltage changes.') 730 | 731 | return self._cached_voltage_changes 732 | 733 | 734 | def _bias_from_voltages(self): 735 | """Extract voltage changes from the voltage trace data.""" 736 | 737 | voltages = self.get_voltage() 738 | changes = np.where(voltages[:-1] != voltages[1:])[0] 739 | 740 | voltage_changes = np.empty( 741 | len(changes) + 1, 742 | dtype=[('time', float), ('set_bias_voltage', int)] 743 | ) 744 | voltage_changes['time'][0] = voltages[0] 745 | voltage_changes['time'][1:] = changes 746 | voltage_changes['time'] /= self.sample_rate 747 | voltage_changes['set_bias_voltage'] = voltages[0] 748 | voltage_changes['set_bias_voltage'][1:] = voltages[changes] 749 | return voltage_changes 750 | 751 | 752 | def _bias_from_asic_commands(self): 753 | """Extract voltages in Asic commands, filtering to only changes.""" 754 | 755 | all_voltages = [AsicBCommand(cmd).configuration.bias_voltage 756 | for cmd in self['/Device/AsicCommands']['command'] 757 | ] 758 | all_frames = self['/Device/AsicCommands']['frame_number'] 759 | 760 | prev_voltage = all_voltages[0] 761 | changes = [(all_frames[0], prev_voltage)] 762 | for frame, voltage in itertools.izip(all_frames[1:], all_voltages[1:]): 763 | if voltage != prev_voltage: 764 | changes.append((frame, voltage)) 765 | 766 | voltage_changes = np.array( 767 | changes, 768 | dtype=[('time', float), ('set_bias_voltage', int)] 769 | ) 770 | voltage_changes['time'] /= self.sample_rate 771 | voltage_changes['set_bias_voltage'] *= -5 772 | return voltage_changes 773 | 774 | 775 | def _bias_from_exp_hist(self): 776 | """Extract voltage changes from experimental history. 777 | 778 | ..note:: The experimental history is deprecated in MinKnow 1.3 779 | """ 780 | if self.parsed_exp_history is None: 781 | self.parse_history() 782 | voltage_changes = self.parsed_exp_history['set_bias_voltage'] 783 | voltage_changes['set_bias_voltage'] *= -1 784 | return voltage_changes 785 | 786 | 787 | def get_bias_voltage_changes_in_window(self, times=None, raw_indices=None): 788 | """Find all mux voltage changes within a time window. 789 | 790 | :param times: tuple of floats (start_second, end_second) 791 | :param raw_indices: tuple of ints (start_index, end_index) 792 | 793 | .. note:: 794 | This is the bias voltage from the expt history (accurate to 1 795 | second), and will not include any changes in voltage related to 796 | waveforms. For the full voltage trace, use get_voltage. 797 | 798 | Exactly one of the slice keyword arguments needs to be specified, 799 | as the method will override them in the order of times 800 | > raw_indices. 
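
        Example (a sketch; ``fh`` is an open BulkFast5 and the window is
        illustrative)::

            changes = fh.get_bias_voltage_changes_in_window(times=(0, 300))
            # structured array with 'time' (seconds) and 'set_bias_voltage' fields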
801 | """ 802 | 803 | assert (times is not None) or (raw_indices is not None), 'Need either a time or a raw_index argument' 804 | if times is None: 805 | times = float(raw_indices[0]) / self.sample_rate, float(raw_indices[1]) / self.sample_rate 806 | bias_voltage_changes = self.get_bias_voltage_changes() 807 | first_index, last_index = np.searchsorted(bias_voltage_changes['time'], times, side='right') 808 | return bias_voltage_changes[first_index:last_index] 809 | 810 | 811 | __engine_states__ = { 812 | 'minion_asic_temperature': float, 813 | 'minion_heatsink_temperature': float, 814 | 'set_bias_voltage': float, 815 | 'fan_speed': int 816 | } 817 | __temp_fields__ = ('heatsink', 'asic') 818 | 819 | 820 | def parse_history(self): 821 | """Parse the experimental history to pull out various environmental factors. 822 | The functions below are quite nasty, don't enquire too hard. 823 | """ 824 | try: 825 | exph_fh = StringIO(str(self['Meta/User']['experimental_history'][:].tostring().decode())) 826 | except Exception: 827 | raise RuntimeError('Cannot read experimental_history from fast5') 828 | 829 | data = defaultdict(list) 830 | for item in self._iter_records(exph_fh): 831 | #item should contain 'time' and something else 832 | time = item['time'] 833 | field, value = next((k, v) for k, v in item.items() if k != 'time') 834 | data[field].append((time, value)) 835 | 836 | self.parsed_exp_history = { 837 | k:np.array(data[k], dtype=[('time', float), (k, self.__engine_states__[k])]) 838 | for k in data.keys() 839 | } 840 | return self 841 | 842 | 843 | def get_engine_state(self, state, time=None): 844 | """Get changes in an engine state or the value of an engine 845 | state at a given time. 846 | 847 | :param state: the engine state to retrieve. 848 | :param time: the time at which to grab engine state. 849 | """ 850 | if state not in self.__engine_states__: 851 | raise RuntimeError("'field' argument must be one of {}.".format(self.__engine_states__.keys())) 852 | 853 | if self.parsed_exp_history is None: 854 | self.parse_history() 855 | 856 | states = self.parsed_exp_history[state] 857 | if time is None: 858 | return states 859 | else: 860 | i = np.searchsorted(states['time'], time) - 1 861 | return states[state][i] 862 | 863 | 864 | def get_temperature(self, time=None, field=__temp_fields__[0]): 865 | if field not in self.__temp_fields__: 866 | raise RuntimeError("'field' argument must be one of {}.".format(self.__temp_fields__)) 867 | 868 | return self.get_engine_state('minion_{}_temperature'.format(field), time) 869 | 870 | 871 | def _iter_records(self, exph_fh): 872 | """Parse an iterator over file-like object representing 873 | an experimental history. 874 | """ 875 | for line in exph_fh: 876 | mo = re.match(r'.*:\s+Expt time: (\d+)s:? (.*)', line) 877 | if mo: 878 | time, msg = mo.groups() 879 | rec = self._parse_line(msg) 880 | if rec: 881 | key, value = rec 882 | yield {'time': int(time), key:value} 883 | 884 | 885 | def _parse_line(self, msg): 886 | """Check if a line of experimental history records 887 | a change in the engine state. 888 | """ 889 | mo = re.match(r'Experimental EngineState: (.*)', msg) 890 | if mo: 891 | msg2 = mo.group(1) 892 | return self._parse_engine_state(msg2) 893 | 894 | 895 | def _parse_engine_state(self, msg): 896 | """Extract engine state and value from a line of 897 | experimental history. 
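
        For example, the message ``set_bias_voltage is now -180`` (an
        illustrative value) would be returned as ``('set_bias_voltage', '-180')``.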
898 | """ 899 | mo = re.match(r'(\w+) is now (.*)', msg) 900 | if mo: 901 | key, value = mo.group(1), mo.group(2) 902 | if key in self.__engine_states__: 903 | return key, value 904 | 905 | 906 | def _add_attrs(self, data, location, convert=None): 907 | """Convenience method for adding attrs to a possibly new group. 908 | :param data: dict of attrs to add 909 | :param location: hdf path 910 | :param convert: function to apply to all dictionary values 911 | """ 912 | self.__add_attrs(self, data, location, convert=None) 913 | 914 | 915 | @staticmethod 916 | def __add_attrs(self, data, location, convert=None): 917 | """Implementation of _add_attrs as staticmethod. This allows 918 | functionality to be used in .New() constructor but is otherwise nasty! 919 | """ 920 | if location not in self: 921 | self.create_group(location) 922 | attrs = self[location].attrs 923 | for k, v in data.items(): 924 | if convert is not None: 925 | attrs[_sanitize_data_for_writing(k)] = _sanitize_data_for_writing(convert(v)) 926 | else: 927 | attrs[_sanitize_data_for_writing(k)] = _sanitize_data_for_writing(v) 928 | 929 | 930 | def _add_numpy_table(self, data, location): 931 | data = _sanitize_data_for_writing(data) 932 | self.create_dataset(location, data=data, compression=True) 933 | 934 | 935 | @classmethod 936 | def New(cls, fname, read='a', tracking_id={}, context_tags={}, channel_id={}): 937 | """Construct a fresh bulk file, with meta data written to 938 | standard locations. There is currently no checking this meta data. 939 | TODO: Add meta data checking. 940 | 941 | """ 942 | 943 | # Start a new file, populate it with meta 944 | with h5py.File(fname, 'w') as h: 945 | h.attrs[_sanitize_data_for_writing('file_version')] = _sanitize_data_for_writing(1.0) 946 | for data, location in zip( 947 | [tracking_id, context_tags], 948 | [cls.__tracking_path__, cls.__context_path__] 949 | ): 950 | # see cjw's comment in fast5.py: 951 | # 'no idea why these must be str, just following ossetra' 952 | cls.__add_attrs(h, data, location, convert=str) 953 | 954 | # return instance from new file 955 | return cls(fname, read) 956 | 957 | 958 | def set_raw(self, raw, channel, meta=None): 959 | """Set the raw data in file. 
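
        Example (a sketch; ``fh`` is a writable BulkFast5 and the metadata
        values are illustrative)::

            meta = {'description': 'channel 1', 'digitisation': 8192.0, 'offset': 10.0,
                    'range': 2048.0, 'sample_rate': 4000.0}
            fh.set_raw(np.zeros(1000, dtype=np.int16), channel=1, meta=meta)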
960 | 961 | :param raw: raw data to add 962 | :param channel: channel number 963 | """ 964 | req_keys = ['description', 'digitisation', 'offset', 'range', 965 | 'sample_rate'] 966 | 967 | meta = {k:v for k,v in meta.items() if k in req_keys} 968 | if len(meta.keys()) != len(req_keys): 969 | raise KeyError( 970 | 'Raw meta data must contain keys: {}.'.format(req_keys) 971 | ) 972 | 973 | raw_folder = '/'.join(self.__raw_data__.format(channel).split('/')[:-1]) 974 | raw_data_path = self.__raw_data__.format(channel) 975 | self._add_attrs(meta, raw_folder) 976 | self[raw_data_path] = raw 977 | 978 | 979 | def set_events(self, data, meta, channel): 980 | """Write event data to file 981 | 982 | :param data: event data 983 | :param meta: meta data to attach to read 984 | :param read_number: per-channel read counter 985 | """ 986 | req_meta_keys = ['description', 'digitisation', 'offset', 'range', 987 | 'sample_rate'] 988 | if not set(req_meta_keys).issubset(meta.keys()): 989 | raise KeyError( 990 | 'Read meta does not contain required fields: {}, got {}'.format( 991 | req_fields, meta.keys() 992 | ) 993 | ) 994 | req_event_fields = [ 995 | 'start', 'length', 'mean', 'variance' 996 | ] 997 | if not isinstance(data, np.ndarray): 998 | raise TypeError('Data is not ndarray.') 999 | 1000 | # if data contains 'stdv', square this to get the variance 1001 | # seemingly bulk fast5 files contain variance and not stdv, as 1002 | # taking the sqrt would be slow on minknow. 1003 | names = list(data.dtype.names) 1004 | for i, name in enumerate(names): 1005 | if name == 'stdv': 1006 | names[i] = 'variance' 1007 | data['stdv'] = np.square(data['stdv']) 1008 | data.dtype.names = names 1009 | 1010 | if not set(req_event_fields).issubset(data.dtype.names): 1011 | raise KeyError( 1012 | 'Read data does not contain required fields: {}, got {}.'.format( 1013 | req_event_fields, data.dtype.names 1014 | ) 1015 | ) 1016 | 1017 | event_meta_path = self.__channel_meta__.format(channel) 1018 | self._add_attrs(meta, event_meta_path) 1019 | 1020 | uint_fields = ('start', 'length') 1021 | dtype = np.dtype([( 1022 | d[0], 'uint32') if d[0] in uint_fields else d 1023 | for d in dtype_descr(data) 1024 | ]) 1025 | 1026 | # If the data is not an int or uint we assume it is in seconds and scale 1027 | # appropriately 1028 | if data['start'].dtype.kind not in ['i', 'u']: 1029 | data['start'] *= meta['sample_rate'] 1030 | data['length'] *= meta['sample_rate'] 1031 | 1032 | events_path = self.__event_data__.format(channel) 1033 | self._add_numpy_table( 1034 | data.astype(dtype), events_path 1035 | ) 1036 | 1037 | 1038 | def set_voltage(self, data, meta): 1039 | req_keys = ['description', 'digitisation', 'offset', 'range', 1040 | 'sample_rate'] 1041 | meta = {k:v for k,v in meta.items() if k in req_keys} 1042 | if len(meta.keys()) != len(req_keys): 1043 | raise KeyError( 1044 | 'Raw meta data must contain keys: {}.'.format(req_keys) 1045 | ) 1046 | 1047 | self._add_attrs(meta, self.__voltage_meta__) 1048 | dtype = np.dtype([('bias_voltage', np.int16)]) 1049 | self._add_numpy_table( 1050 | data.astype(dtype, copy=False), self.__voltage_data__ 1051 | 1052 | ) 1053 | 1054 | 1055 | # 1056 | # Taken from minknow/asicb_command/__init__.py 1057 | # 1058 | class AsicBConfiguration(object): 1059 | """Wrapper around the asicb configuration struct passed to the asicb over usb""" 1060 | def __init__(self, config): 1061 | self.data = str(config) 1062 | # Interpret as bytes... 
1063 |         self.bytes = np.frombuffer(self.data, dtype="u1")
1064 |         # ...with reverse bit order
1065 |         self.bits = np.unpackbits(self.bytes[::-1])[::-1].copy()
1066 | 
1067 | 
1068 |     @property
1069 |     def bias_voltage(self):
1070 |         val = self.int_at(129, 121)
1071 |         if val > 256:
1072 |             return 256 - val
1073 |         return val
1074 | 
1075 | 
1076 |     def active_mux(self, channel):
1077 |         """
1078 |         Gets the active mux for the specified channel
1079 |         :param channel: 0 based
1080 |         """
1081 |         first_bit_channel_0 = 211  # bit of mux state for channel 0
1082 |         mux_state_size = 4
1083 |         requested_channel_first_bit = first_bit_channel_0 + mux_state_size * channel
1084 |         return self.int_at(requested_channel_first_bit + mux_state_size - 1, requested_channel_first_bit)
1085 | 
1086 | 
1087 |     def int_at(self, start, end):
1088 |         bits = self.bits_at(start, end)
1089 |         num = 0
1090 |         for on in reversed(bits):
1091 |             num = num << 1
1092 |             if on:
1093 |                 num |= 1
1094 |         return num
1095 | 
1096 | 
1097 |     def bits_at(self, start, end):
1098 |         return self.bits[end:start+1]
1099 | 
1100 | 
1101 | class AsicBCommand(object):
1102 |     """Wrapper around the asicb command structure"""
1103 |     def __init__(self, command):
1104 |         self.data = str(command)
1105 |         self._configuration = AsicBConfiguration(self.data[10:])
1106 |         self.bytes = np.frombuffer(self.data, dtype="u1")
1107 | 
1108 |         if self.bytes[0] != 17:
1109 |             raise Exception("Invalid command - magic byte was '{}', expected '17'"
1110 |                             .format(self.bytes[0]))
1111 | 
1112 | 
1113 |     @property
1114 |     def min_temperature(self):
1115 |         return self.bytes[7]
1116 | 
1117 | 
1118 |     @property
1119 |     def max_temperature(self):
1120 |         return self.bytes[8]
1121 | 
1122 | 
1123 |     @property
1124 |     def configuration(self):
1125 |         return self._configuration
1126 | 
--------------------------------------------------------------------------------
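A minimal usage sketch for the bulk-file reading API above (the filename and time window are hypothetical, and the file is assumed to contain raw data for its first channel; ``BulkFast5`` subclasses ``h5py.File``, so it can be used as a context manager):

    from fast5_research import BulkFast5

    with BulkFast5('example_bulk.fast5') as fh:
        channel = fh.channels[0]
        raw = fh.get_raw(channel, times=(0.0, 10.0))        # scaled current trace
        events = fh.get_events(channel, times=(0.0, 10.0))  # scaled event table
        for read in fh.get_reads(channel):
            print('{} {} {}'.format(read['read_id'], read['classification'], read['read_length']))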