├── MANIFEST.in ├── dev_requirements.txt ├── fast5_research ├── test │ ├── data │ │ ├── recursive │ │ │ ├── 1 │ │ │ │ └── fake.fast5 │ │ │ ├── 2 │ │ │ │ └── fake.fast5 │ │ │ ├── fake1.fast5 │ │ │ ├── fake2.fast5 │ │ │ └── fake3.fast5 │ │ ├── example_template.bc_scale │ │ ├── abf2bulkfast5.fast5 │ │ ├── elec3_example.fast5 │ │ ├── example_template.map_scale │ │ ├── additional_test_file.fast5 │ │ ├── example_basecall_squiggle_mapping.fast5 │ │ ├── example_template.bc_path │ │ ├── example_template.map_path │ │ ├── example_template.events │ │ └── example_template.map_post │ ├── test_iterate.py │ ├── test_fast5_basecalling_and_mapping.py │ ├── test_fast5.py │ └── test_fast5_bulk.py ├── __init__.py ├── util.py ├── extract.py └── fast5_bulk.py ├── .gitignore ├── requirements.txt ├── .travis.yml ├── docs ├── index.rst ├── cmdline.rst ├── examples.rst └── conf.py ├── Makefile ├── setup.py ├── CHANGELOG.md ├── README.md └── LICENSE.md /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | -------------------------------------------------------------------------------- /dev_requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools 2 | nose 3 | -------------------------------------------------------------------------------- /fast5_research/test/data/recursive/1/fake.fast5: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fast5_research/test/data/recursive/2/fake.fast5: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fast5_research/test/data/recursive/fake1.fast5: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fast5_research/test/data/recursive/fake2.fast5: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fast5_research/test/data/recursive/fake3.fast5: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *~ 3 | *.swp 4 | *.swo 5 | *.so 6 | *.egg-info 7 | .eggs 8 | build 9 | dist 10 | docs/_build/ 11 | venv* 12 | -------------------------------------------------------------------------------- /fast5_research/test/data/example_template.bc_scale: -------------------------------------------------------------------------------- 1 | -0.00211363485711 1.00484079515 -0.00629128442239 0.647664177638 0.933711803503 28.3709450329 2 | -------------------------------------------------------------------------------- /fast5_research/test/data/abf2bulkfast5.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/fast5_research/HEAD/fast5_research/test/data/abf2bulkfast5.fast5 -------------------------------------------------------------------------------- /fast5_research/test/data/elec3_example.fast5: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nanoporetech/fast5_research/HEAD/fast5_research/test/data/elec3_example.fast5 -------------------------------------------------------------------------------- /fast5_research/test/data/example_template.map_scale: -------------------------------------------------------------------------------- 1 | -0.000425261825492 1.00000660841 2.07396170589e-05 0.0288695991923 0.926918438703 579.501041091 2 | -------------------------------------------------------------------------------- /fast5_research/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.2.23' 2 | 3 | from fast5_research.fast5 import Fast5, iterate_fast5 4 | from fast5_research.fast5_bulk import BulkFast5 5 | -------------------------------------------------------------------------------- /fast5_research/test/data/additional_test_file.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/fast5_research/HEAD/fast5_research/test/data/additional_test_file.fast5 -------------------------------------------------------------------------------- /fast5_research/test/data/example_basecall_squiggle_mapping.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/fast5_research/HEAD/fast5_research/test/data/example_basecall_squiggle_mapping.fast5 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # fast5_research requirements. 2 | # Use comment lines or inline comments to document why we are using particular versions 3 | futures 4 | h5py<2.9.0 # causes some tests to fail 5 | numpy>=1.14.0 # 1.14 made some relatively big changes 6 | progressbar2 7 | pysam 8 | 9 | -------------------------------------------------------------------------------- /fast5_research/test/data/example_template.bc_path: -------------------------------------------------------------------------------- 1 | 8.280000000000000000e+02 2 | 2.410000000000000000e+02 3 | 9.660000000000000000e+02 4 | 7.930000000000000000e+02 5 | 1.000000000000000000e+02 6 | 4.030000000000000000e+02 7 | 5.910000000000000000e+02 8 | 3.190000000000000000e+02 9 | 2.520000000000000000e+02 10 | 1.009000000000000000e+03 11 | 9.650000000000000000e+02 12 | 7.900000000000000000e+02 13 | 8.800000000000000000e+01 14 | 3.520000000000000000e+02 15 | 3.840000000000000000e+02 16 | 5.130000000000000000e+02 17 | 5.000000000000000000e+00 18 | 2.300000000000000000e+01 19 | 9.400000000000000000e+01 20 | 3.780000000000000000e+02 21 | 4.900000000000000000e+02 22 | 9.370000000000000000e+02 23 | 6.760000000000000000e+02 24 | 6.560000000000000000e+02 25 | 5.790000000000000000e+02 26 | -------------------------------------------------------------------------------- /fast5_research/test/data/example_template.map_path: -------------------------------------------------------------------------------- 1 | 0.000000000000000000e+00 2 | 1.000000000000000000e+00 3 | 2.000000000000000000e+00 4 | 3.000000000000000000e+00 5 | 4.000000000000000000e+00 6 | 5.000000000000000000e+00 7 | 6.000000000000000000e+00 8 | 7.000000000000000000e+00 9 | 8.000000000000000000e+00 10 | 9.000000000000000000e+00 11 | 1.000000000000000000e+01 12 | 1.100000000000000000e+01 13 | 1.200000000000000000e+01 14 | 1.300000000000000000e+01 15 | 1.400000000000000000e+01 16 | 
1.500000000000000000e+01 17 | 1.600000000000000000e+01 18 | 1.700000000000000000e+01 19 | 1.800000000000000000e+01 20 | 1.900000000000000000e+01 21 | 2.000000000000000000e+01 22 | 2.100000000000000000e+01 23 | 2.200000000000000000e+01 24 | 2.300000000000000000e+01 25 | 2.400000000000000000e+01 26 | -------------------------------------------------------------------------------- /fast5_research/test/data/example_template.events: -------------------------------------------------------------------------------- 1 | mean stdv length start 2 | 57.307656 1.352701 1.000000 0.000000 3 | 58.422004 1.185755 1.000000 1.000000 4 | 65.627289 1.191993 1.000000 2.000000 5 | 68.845879 1.104547 1.000000 3.000000 6 | 71.449841 1.184341 1.000000 4.000000 7 | 76.549487 1.253127 1.000000 5.000000 8 | 69.556924 1.515694 1.000000 6.000000 9 | 54.685719 1.612181 1.000000 7.000000 10 | 52.595864 1.167940 1.000000 8.000000 11 | 57.696494 1.216995 1.000000 9.000000 12 | 63.905448 1.110766 1.000000 10.000000 13 | 67.273390 1.100466 1.000000 11.000000 14 | 73.397066 1.182740 1.000000 12.000000 15 | 76.326802 1.216704 1.000000 13.000000 16 | 77.069438 1.321304 1.000000 14.000000 17 | 69.062284 1.417715 1.000000 15.000000 18 | 60.748061 1.162331 1.000000 16.000000 19 | 64.499068 1.123995 1.000000 17.000000 20 | 69.120248 1.115507 1.000000 18.000000 21 | 63.973174 1.202806 1.000000 19.000000 22 | 64.451044 1.195180 1.000000 20.000000 23 | 62.669594 1.379852 1.000000 21.000000 24 | 68.471935 1.513985 1.000000 22.000000 25 | 77.861964 1.349168 1.000000 23.000000 26 | 73.011127 1.299344 1.000000 24.000000 27 | -------------------------------------------------------------------------------- /fast5_research/test/test_iterate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import unittest 4 | from uuid import uuid4 5 | 6 | from fast5_research import Fast5, iterate_fast5 7 | 8 | 9 | class IterateFiles(unittest.TestCase): 10 | def setUp(self): 11 | self.path = (os.path.join( 12 | os.path.dirname(__file__), 'data', 'recursive' 13 | )) 14 | 15 | def test_000_single_layer(self): 16 | fnames = list(iterate_fast5(self.path, paths=True)) 17 | self.assertEqual(len(fnames), 3) 18 | 19 | def test_001_recursive(self): 20 | fnames = list(iterate_fast5(self.path, paths=True, recursive=True)) 21 | self.assertEqual(len(fnames), 5) 22 | 23 | def test_002_from_file(self): 24 | tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4())) 25 | with open(tmp_file, 'w') as fh: 26 | fh.write('filename\tjunk\n') 27 | for i, fname in enumerate(iterate_fast5(self.path, paths=True)): 28 | fh.write('{}\t{}\n'.format(os.path.basename(fname), i)) 29 | fnames = list(iterate_fast5(self.path, paths=True, strand_list=tmp_file)) 30 | self.assertEqual(len(fnames), 3) 31 | 32 | 33 | if __name__ == "__main__": 34 | unittest.main() 35 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.5" 5 | dist: xenial 6 | 7 | before_install: 8 | - pip install --upgrade pip 9 | 10 | install: "pip install -r dev_requirements.txt" 11 | 12 | 13 | script: 14 | - pip install . 
15 | - nosetests 16 | 17 | 18 | before_deploy: 19 | - make docs 20 | 21 | 22 | deploy: 23 | - provider: pypi 24 | user: ontresearch 25 | password: 26 | secure: "vrNEDv4dw6FuVBRId3dC5F3FFFIgFP1AJ2PFpBNCQ2q3Qn5iUpTOUQDRHDFIcxvWMfunzOORUD/7M1f2l7x1fHSQ28L/peXVdTuVmiwHnkcPqtug+UOKysgd8K8X0cxcO2/0MOQoev+AFiOXf815Za4/GnA822NcXcyzhugTzTfyqWTyDoGXJ7b3i4Upkty+d2j+nBRpKl4N3mX040gKbDszuTUAqsjO433qJf8SXPH8SJdW2TcM0KsPWf5kvOflqMKb4CLHI5m4NpNLBjd56PnPdVA9czazlR8ZW584+zaYyW6yTtfgg+44WuxNDfXv48cnsCloqiQusCfsl3bQ4LKGk1gg8tTaVOGfD9TI7OBXpLR6dG9SPOaER9flq0gUR9AOjI6zNw2B98RzpOlm82nJIbOYrRUdVy66uZaOt7se3OeYG182k487lrfGHYw217Z3x/Ua0CFMmmp0+WXDBOkozEywEw1ScPi17oLi25nyUHOOyBRKo3Wa6pgGaOdK7SQOEGtmQxmKmB18KOhoKQWsS6sLSKLEv316YhbxDNevwFILZzQ86t3qhDYQkl6y+oYkl6Ha5DRl6jnCEQDAIMG++kmUZbGeqkfPJBy/XpsPePY3HiK0WQWwu37J1nvDWbayfdGtciuQ82bkX/XdymrCgcFSSFoMVB+z/HFqLMk=" 27 | on: 28 | tags: true 29 | python: "3.5" 30 | - provider: pages 31 | skip_cleanup: true 32 | github_token: $GHPAGES_TOKEN 33 | local_dir: docs/_build/html 34 | target_branch: gh-pages 35 | on: 36 | tags: true 37 | python: "3.5" 38 | 39 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to Fast5's documentation! 2 | ================================= 3 | 4 | This package comprises an API to HDF containers used by the research groups 5 | within Oxford Nanopore Technologies. It complements the 6 | `official API `_. Reading and 7 | writing of read files can be accomplished as well as reading of bulk .fast5 8 | files. 9 | 10 | .. admonition:: Research Release 11 | 12 | Research releases are provided as technology demonstrators to provide early 13 | access to features or stimulate Community development of tools. Support for 14 | this software will be minimal and is only provided directly by the developers. 15 | Feature requests, improvements, and discussions are welcome and can be 16 | implemented by forking and pull requests. However much as we would 17 | like to rectify every issue and piece of feedback users may have, the 18 | developers may have limited resource for support of this software. Research 19 | releases may be unstable and subject to rapid iteration by Oxford Nanopore 20 | Technologies. 21 | 22 | 23 | Installation 24 | ------------ 25 | 26 | The package can been installed via pip with: 27 | 28 | .. code-block:: bash 29 | 30 | pip install fast5_research 31 | 32 | See _`examples` for details of basic use. 33 | 34 | 35 | Contents 36 | -------- 37 | 38 | .. toctree:: 39 | :maxdepth: 2 40 | 41 | examples 42 | cmdline 43 | 44 | Full API reference 45 | ------------------ 46 | 47 | .. toctree:: 48 | :maxdepth: 3 49 | 50 | fast5_research 51 | 52 | 53 | Indices and tables 54 | ------------------ 55 | 56 | * :ref:`genindex` 57 | * :ref:`modindex` 58 | * :ref:`search` 59 | 60 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ### 2 | # This Makefile is simply for testing and making docs, to install 3 | # the project it should be sufficient to use python setup.py 4 | 5 | 6 | .PHONY: docs clean test test_py2 test_py3 7 | 8 | 9 | venv_py2/bin/activate: 10 | test -d venv_py2 || virtualenv venv_py2 --prompt '(fast5_py2) ' --python=python2 11 | . $@ && pip install pip --upgrade 12 | . $@ && pip install "setuptools<45" 13 | . $@ && pip install -r dev_requirements.txt 14 | . 
$@ && pip install -r requirements.txt; 15 | 16 | test_py2: venv_py2/bin/activate 17 | . $< && python setup.py nosetests 18 | 19 | 20 | venv_py3/bin/activate: 21 | test -d venv_py3 || virtualenv venv_py3 --prompt '(fast5_py3) ' --python=python3 22 | . $@ && pip install pip --upgrade 23 | . $@ && pip install -r dev_requirements.txt 24 | . $@ && pip install -r requirements.txt; 25 | 26 | test_py3: venv_py3/bin/activate 27 | . $< && python setup.py nosetests 28 | 29 | 30 | test: test_py2 test_py3 31 | 32 | clean: 33 | rm -rf build dist *.egg-info venv_* 34 | 35 | # You can set these variables from the command line. 36 | SPHINXOPTS = 37 | SPHINXBUILD = sphinx-build 38 | PAPER = 39 | BUILDDIR = _build 40 | 41 | # Internal variables. 42 | PAPEROPT_a4 = -D latex_paper_size=a4 43 | PAPEROPT_letter = -D latex_paper_size=letter 44 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 45 | 46 | DOCSRC = docs 47 | 48 | docs: venv_py3/bin/activate 49 | . $< && pip install sphinx sphinx_rtd_theme sphinx-argparse 50 | . $< && cd $(DOCSRC) && $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 51 | rm -rf docs/modules.rst docs/fast5_research.rst 52 | @echo 53 | @echo "Build finished. The HTML pages are in $(DOCSRC)/$(BUILDDIR)/html." 54 | touch $(DOCSRC)/$(BUILDDIR)/html/.nojekyll 55 | -------------------------------------------------------------------------------- /docs/cmdline.rst: -------------------------------------------------------------------------------- 1 | Command line Programs 2 | ===================== 3 | 4 | `fast5_research` comes with two commandline programs for conversion of sequencing 5 | read data. 6 | 7 | **extract_reads** - extracts reads from a bulk ``.fast5`` to either single- or multi-read 8 | ``.fast5``: 9 | 10 | .. code-block:: bash 11 | 12 | usage: extract_reads [-h] [--multi | --single] [--flat] [--by_id] 13 | [--prefix PREFIX] 14 | [--channel_range CHANNEL_RANGE CHANNEL_RANGE] 15 | [--workers WORKERS] [--limit LIMIT] 16 | input output 17 | 18 | Bulk .fast5 to read .fast5 conversion. 19 | 20 | positional arguments: 21 | input Bulk .fast5 file for input. 22 | output Output folder. 23 | 24 | optional arguments: 25 | -h, --help show this help message and exit 26 | --multi Output multi-read files. 27 | --single Output single-read files. 28 | --flat Create all .fast5 files in one directory 29 | --by_id Name single-read .fast5 files by read_id. 30 | --prefix PREFIX Read file prefix. 31 | --channel_range CHANNEL_RANGE CHANNEL_RANGE 32 | Channel range (inclusive). 33 | --workers WORKERS Number of worker processes. 34 | --limit LIMIT Limit reads per channel. 35 | 36 | 37 | **filter_reads** - extracts a subset of reads from a set of multi-read ``.fast5`` files. 38 | 39 | .. code-block:: bash 40 | 41 | usage: filter_reads [-h] [--tsv_field TSV_FIELD] [--multi | --single] 42 | [--prefix PREFIX] [--recursive] [--workers WORKERS] 43 | input output filter 44 | 45 | Extract reads from multi-read .fast5 files. 46 | 47 | positional arguments: 48 | input Path to input multi-read .fast5 files. 49 | output Output folder. 50 | filter A .tsv file with column `read_id` defining required 51 | reads. 52 | 53 | optional arguments: 54 | -h, --help show this help message and exit 55 | --tsv_field TSV_FIELD 56 | Field name from `filter` file to obtain read IDs. 57 | --multi Output multi-read files. 58 | --single Output single-read files. 59 | --prefix PREFIX Read file prefix. 60 | --recursive Search recursively under `input` for source files. 
61 | --workers WORKERS Number of worker processes. 62 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | Fast5 Examples 2 | ============== 3 | 4 | The following code snippets demonstrate basic IO using key features of the API. 5 | 6 | Read Files 7 | ---------- 8 | 9 | The library provides the `Fast5` class which extends `h5py.File` with methods 10 | for acquiring common datasets and attributes from files without requiring 11 | knowledge of the file structure. To read a file and obtain a useful summary: 12 | 13 | .. code-block:: python 14 | 15 | from fast5_research import Fast5 16 | 17 | filename='my.fast5' 18 | 19 | with Fast5(filename) as fh: 20 | raw = fh.get_read(raw=True) 21 | summary = fh.summary() 22 | print('Raw is {} samples long.'.format(len(raw))) 23 | print('Summary {}.'.format(summary)) 24 | 25 | Note that in this example the raw data will be provided in pA. 26 | 27 | The library also allows writing of files which are conformant with Oxford 28 | Nanopore Technologies' software. Certain metadata are required, and the 29 | library will enforce that they are present: 30 | 31 | .. code-block:: python 32 | 33 | from uuid import uuid4 34 | import numpy as np 35 | from fast5_research import Fast5 36 | filename='my_new.fast5' 37 | mean, stdv, n = 40.0, 2.0, 10000 38 | raw_data = np.random.laplace(mean, stdv/np.sqrt(2), int(n)) 39 | 40 | # example of how to digitize data 41 | start, stop = int(min(raw_data - 1)), int(max(raw_data + 1)) 42 | rng = stop - start 43 | digitisation = 8192.0 44 | bins = np.arange(start, stop, rng / digitisation) 45 | # np.int16 is required, the library will refuse to write anything else 46 | raw_data = np.digitize(raw_data, bins).astype(np.int16) 47 | 48 | # The following are required meta data 49 | channel_id = { 50 | 'digitisation': digitisation, 51 | 'offset': 0, 52 | 'range': rng, 53 | 'sampling_rate': 4000, 54 | 'channel_number': 1, 55 | } 56 | read_id = { 57 | 'start_time': 0, 58 | 'duration': len(raw_data), 59 | 'read_number': 1, 60 | 'start_mux': 1, 61 | 'read_id': str(uuid4()), 62 | 'scaling_used': 1, 63 | 'median_before': 0, 64 | } 65 | tracking_id = { 66 | 'exp_start_time': '1970-01-01T00:00:00Z', 67 | 'run_id': str(uuid4()).replace('-',''), 68 | 'flow_cell_id': 'FAH00000', 69 | } 70 | context_tags = {} 71 | 72 | with Fast5.New(filename, 'w', tracking_id=tracking_id, context_tags=context_tags, channel_id=channel_id) as h: 73 | h.set_raw(raw_data, meta=read_id, read_number=1) 74 | 75 | 76 | Bulk Files 77 | ---------- 78 | 79 | The library exposes data within bulk `.fast5` files through the `BulkFast5` class: 80 | 81 | .. code-block:: python 82 | 83 | from fast5_research import BulkFast5 84 | 85 | filename = 'my_bulk.fast5' 86 | channel = 100 87 | samples = [1000, 100000] 88 | 89 | with BulkFast5(filename) as fh: 90 | raw = fh.get_raw(channel, raw_indices=samples) 91 | multiplexer_changes = fh.get_mux_changes_in_window( 92 | channel, raw_indices=samples) 93 | 94 | The `BulkFast5` class provides in-memory caching of many intermediate results, 95 | to optimize repeated calls to the same methods.
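As a further illustration, the following is a minimal sketch combining several of the accessors exercised by the package's own test suite (`channels`, `sample_rate`, `get_states_in_window` and `get_raw`) to summarise channel states within a time window. The file name and the window values are placeholders; exact signatures can be confirmed in the full API reference:

.. code-block:: python

    from fast5_research import BulkFast5

    filename = 'my_bulk.fast5'
    window = (10.0, 20.0)  # window of interest, in seconds

    with BulkFast5(filename) as fh:
        # convert the time window to raw sample indices once
        indices = tuple(int(t * fh.sample_rate) for t in window)
        for channel in fh.channels:
            # channel states recorded in the bulk file for this window
            states = fh.get_states_in_window(channel, raw_indices=indices)
            # raw current samples for the same window
            raw = fh.get_raw(channel, raw_indices=indices)
            print('Channel {}: {} samples, states: {}'.format(
                channel, len(raw), ', '.join(states)))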
96 | 97 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | from setuptools import setup, find_packages 5 | 6 | 7 | __pkg_name__ = 'fast5_research' 8 | __author__ = 'cwright' 9 | __description__ = 'ONT Research .fast5 read/write API.' 10 | 11 | # Use readme as long description and say its github-flavour markdown 12 | from os import path 13 | this_directory = path.abspath(path.dirname(__file__)) 14 | kwargs = {'encoding':'utf-8'} if sys.version_info.major == 3 else {} 15 | with open(path.join(this_directory, 'README.md'), **kwargs) as f: 16 | __long_description__ = f.read() 17 | __long_description_content_type__ = 'text/markdown' 18 | 19 | 20 | # Get the version number from __init__.py 21 | verstrline = open(os.path.join(__pkg_name__, '__init__.py'), 'r').read() 22 | vsre = r"^__version__ = ['\"]([^'\"]*)['\"]" 23 | mo = re.search(vsre, verstrline, re.M) 24 | if mo: 25 | __version__ = mo.group(1) 26 | else: 27 | raise RuntimeError('Unable to find version string in "{}/__init__.py".'.format(__pkg_name__)) 28 | 29 | dir_path = os.path.dirname(__file__) 30 | with open(os.path.join(dir_path, 'requirements.txt')) as fh: 31 | install_requires = [ 32 | r.split('#')[0].strip() 33 | for r in fh.read().splitlines() if not r.strip().startswith('#') 34 | ] 35 | 36 | extra_requires={} 37 | 38 | py2only_requirements = ['futures'] 39 | if len(py2only_requirements) > 0: 40 | extra_requires[':python_version == "2.7"'] = [] 41 | 42 | for requirement in py2only_requirements: 43 | install_requires.remove(requirement) 44 | extra_requires[':python_version == "2.7"'].append(requirement) 45 | 46 | 47 | setup( 48 | name=__pkg_name__, 49 | version=__version__, 50 | url='https://github.com/nanoporetech/{}'.format(__pkg_name__), 51 | author=__author__, 52 | author_email='{}@nanoporetech.com'.format(__author__), 53 | description=__description__, 54 | long_description=__long_description__, 55 | long_description_content_type=__long_description_content_type__, 56 | entry_points={ 57 | 'console_scripts': [ 58 | 'index_reads = {}.extract:build_read_index'.format(__pkg_name__), 59 | 'extract_reads = {}.extract:extract_reads'.format(__pkg_name__), 60 | 'read_summary = {}.extract:extract_read_summary'.format(__pkg_name__), 61 | 'filter_reads = {}.extract:filter_multi_reads'.format(__pkg_name__), 62 | 'filter_from_bam = {}.extract:filter_file_from_bam'.format(__pkg_name__), 63 | ] 64 | }, 65 | license='Mozilla Public License 2.0', 66 | dependency_links=[], 67 | install_requires=install_requires, 68 | tests_require=['nose>=1.3.7'].extend(install_requires), 69 | extras_require=extra_requires, 70 | packages=find_packages(exclude=['*.test', '*.test.*', 'test.*', 'test']), 71 | package_data={}, 72 | zip_safe=True, 73 | classifiers=[ 74 | 'Development Status :: 4 - Beta', 75 | 'Intended Audience :: Science/Research', 76 | 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)', 77 | 'Natural Language :: English', 78 | 'Programming Language :: Python', 79 | 'Topic :: Software Development :: Libraries :: Python Modules', 80 | 'Topic :: Scientific/Engineering :: Bio-Informatics' 81 | ], 82 | keywords='ONT Research fast5 API', 83 | ) 84 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | v1.2.23 2 | ------- 3 | * Build using 
python3.5 4 | * Fix bug with shuffle option within fast5.py 5 | * Allow `None` as an input to BulkFast5.get_raw 6 | * Allow summary file to use `strand_duration` instead of `strand` as column name 7 | 8 | v1.2.22 9 | ------- 10 | * Update way reads classifications are parsed from the bulk to use modal classification as Minknow now does. 11 | 12 | v1.2.21 13 | ------- 14 | * Add filter_from_bam command to create filter tsv file from BAM and sequencing summary file. 15 | 16 | v1.2.20 17 | ------- 18 | * Add `index_reads` program to build read_id->file index tsv file. 19 | 20 | v1.2.19 21 | ------- 22 | * Add program to produce read summary text file from Bulk .fast5. 23 | 24 | v1.2.18 25 | ------- 26 | * Allow `extract_reads` to extract only reads present in a given read_summary.txt 27 | 28 | v1.2.17 29 | ------- 30 | * Allow `filter_reads` to be given full filename/read_id information 31 | 32 | v1.2.16 33 | ------- 34 | * Fix bug in `filter_reads` resulting in the last worker's reads not being written. 35 | 36 | v1.2.15 37 | ------- 38 | * Calculate drift from read table as it will no longer be present in the read table. 39 | 40 | v1.2.14 41 | ------- 42 | * Fix issue affecting conversion of |u1 dtypes 43 | 44 | v1.2.13 45 | ------- 46 | * Use filename if possible when extracting reads 47 | 48 | v1.2.12 49 | ------- 50 | * Clear up a deprecation warning: https://github.com/nanoporetech/fast5_research/issues/30 51 | 52 | v1.2.11 53 | ------- 54 | * Add `filter_reads` program to extract a subset of reads from multi-reads. 55 | 56 | v1.2.10 57 | ------- 58 | * Minor syntax fix in `extract.py` for python2 59 | 60 | v1.2.9 61 | ------ 62 | * Add basic support for creation of multi-read files from bulk files. 63 | 64 | v1.2.8 65 | ------ 66 | * Small refactor of writing of mapping data. 67 | 68 | v1.2.6 69 | ------ 70 | * Fix slow creation of mapping table 71 | 72 | v1.2.5 73 | ------ 74 | * Ensure event structures containing text data are returned as strings rather than bytes under python3. 75 | 76 | v1.2.3 77 | ------ 78 | * Fixes issue with numpy 1.15 on reading type of views of structured data. 79 | * Updated documentation (https://nanoporetech.github.io/fast5_research/) 80 | 81 | v1.2.2 82 | ------ 83 | * Conversion from bulk to reads. 84 | * Require numpy >= 1.14. 85 | * A bit more python3 bytes cleaning. 86 | * Enforce types in raw, and required tracking_id attributes. 87 | 88 | v1.1.0 89 | ------ 90 | * Python3 compatibility changes 91 | * Add data cleaning steps for stringly types 92 | * Unpin numpy version 93 | 94 | v1.0.12 95 | ------- 96 | * Enforce some typing constraints on meta data for compatibility with some basecallers. 97 | 98 | v1.0.11 99 | ------- 100 | * Ignore h5py warnings on import 101 | 102 | v1.0.10 103 | ------- 104 | * Fix bug finding attributes when EventDetection not present 105 | 106 | v1.0.8 107 | ------ 108 | * Easy import of core classes and functions: 109 | `from fast5_research import Fast5, BulkFast5, iterate_fast5` 110 | * Enable recursive (lazy) search in `iterate_fast5`. 
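  A minimal sketch of the recursive search (keyword arguments as exercised in `test_iterate.py`; the input directory is a placeholder):

      from fast5_research import iterate_fast5

      # lazily yield paths to every .fast5 file found under 'reads/',
      # descending into subdirectories
      for fname in iterate_fast5('reads/', paths=True, recursive=True):
          print(fname)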
111 | 112 | v1.0.9 113 | ------ 114 | * Fix itertools import 115 | 116 | v1.0.6 117 | ------ 118 | * Ensure returned events have same dtype 119 | * fast5.py: all returned event arrays same dtype by passing them through self._get_read_data() 120 | * requirements: use any version of numpy 121 | * bump version to 1.0.6 122 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Oxford Nanopore Technologies logo](https://github.com/nanoporetech/medaka/raw/master/images/ONT_logo_590x106.png) 2 | 3 | 4 | fast5_research 5 | ============== 6 | 7 | [![Build Status](https://travis-ci.org/nanoporetech/fast5_research.svg?branch=master)](https://travis-ci.org/nanoporetech/fast5_research) 8 | 9 | Python fast5 reading and writing functionality provided by ONT Research. 10 | 11 | © 2018 Oxford Nanopore Technologies Ltd. 12 | 13 | Features 14 | -------- 15 | 16 | * Read interface to bulk `.fast5` files for extracting reads, channel states, voltage, ... 17 | * Read/Write interface to single read files guaranteeing conformity. 18 | * Works on Linux, MacOS, and Windows. 19 | * Open source (Mozilla Public License 2.0). 20 | 21 | Documentation can be found at https://nanoporetech.github.io/fast5_research/. 22 | 23 | Installation 24 | ------------ 25 | 26 | `fast5_research` is available from PyPI and can be installed with pip: 27 | 28 | pip install fast5_research 29 | 30 | 31 | Usage 32 | ----- 33 | 34 | Full documentation can be found at the link above; below are two simple examples. 35 | 36 | To read a file: 37 | 38 | from fast5_research import Fast5 39 | 40 | filename='my.fast5' 41 | 42 | with Fast5(filename) as fh: 43 | raw = fh.get_read(raw=True) 44 | summary = fh.summary() 45 | print('Raw is {} samples long.'.format(len(raw))) 46 | print('Summary {}.'.format(summary)) 47 | 48 | When writing a file, the library will check the given metadata, ensure that all required 49 | values are present, and convert all values to their defined types. 50 | 51 | from uuid import uuid4 52 | import numpy as np 53 | from fast5_research import Fast5 54 | 55 | filename='my_new.fast5' 56 | mean, stdv, n = 40.0, 2.0, 10000 57 | raw_data = np.random.laplace(mean, stdv/np.sqrt(2), int(n)) 58 | 59 | # example of how to digitize data 60 | start, stop = int(min(raw_data - 1)), int(max(raw_data + 1)) 61 | rng = stop - start 62 | digitisation = 8192.0 63 | bins = np.arange(start, stop, rng / digitisation) 64 | # np.int16 is required, the library will refuse to write anything else 65 | raw_data = np.digitize(raw_data, bins).astype(np.int16) 66 | 67 | # The following are required meta data 68 | channel_id = { 69 | 'digitisation': digitisation, 70 | 'offset': 0, 71 | 'range': rng, 72 | 'sampling_rate': 4000, 73 | 'channel_number': 1, 74 | } 75 | read_id = { 76 | 'start_time': 0, 77 | 'duration': len(raw_data), 78 | 'read_number': 1, 79 | 'start_mux': 1, 80 | 'read_id': str(uuid4()), 81 | 'scaling_used': 1, 82 | 'median_before': 0, 83 | } 84 | tracking_id = { 85 | 'exp_start_time': '1970-01-01T00:00:00Z', 86 | 'run_id': str(uuid4()).replace('-',''), 87 | 'flow_cell_id': 'FAH00000', 88 | } 89 | context_tags = {} 90 | 91 | with Fast5.New(filename, 'w', tracking_id=tracking_id, context_tags=context_tags, channel_id=channel_id) as h: 92 | h.set_raw(raw_data, meta=read_id, read_number=1) 93 | 94 | 95 | Help 96 | ---- 97 | 98 | **Licence and Copyright** 99 | 100 | © 2018 Oxford Nanopore Technologies Ltd.
101 | 102 | `fast5_research` is distributed under the terms of the Mozilla Public License 2.0. 103 | 104 | **Research Release** 105 | 106 | Research releases are provided as technology demonstrators to provide early 107 | access to features or stimulate Community development of tools. Support for 108 | this software will be minimal and is only provided directly by the developers. 109 | Feature requests, improvements, and discussions are welcome and can be 110 | implemented by forking and pull requests. However much as we would 111 | like to rectify every issue and piece of feedback users may have, the 112 | developers may have limited resource for support of this software. Research 113 | releases may be unstable and subject to rapid iteration by Oxford Nanopore 114 | Technologies. 115 | -------------------------------------------------------------------------------- /fast5_research/test/test_fast5_basecalling_and_mapping.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import os 3 | import sys 4 | import tempfile 5 | import unittest 6 | 7 | import numpy as np 8 | import numpy.testing as nptest 9 | 10 | from fast5_research import Fast5 11 | from fast5_research.util import _sanitize_data_for_reading 12 | 13 | class Fast5BasecallerAndMapper(unittest.TestCase): 14 | 15 | @classmethod 16 | def get_file_path(self,filename): 17 | file_path = os.path.join(os.path.dirname(__file__), 'data', filename) 18 | return file_path 19 | 20 | @classmethod 21 | def setUpClass(self): 22 | """Create a read fast5 from scratch with previously simulated mapping and basecall 1D data""" 23 | print('* Fast5 Basecaller and Mapper') 24 | 25 | self.seq = 'CATTACGCATTTACCGAAACCTGGGCAAA' 26 | self.qstring = '!'*len(self.seq) 27 | self.model_file = 'example_template.model' 28 | self.events_file = 'example_template.events' 29 | self.model_file = 'example_template.model' 30 | self.bc_scale_file = 'example_template.bc_scale' 31 | self.bc_path_file = 'example_template.bc_path' 32 | self.map_scale_file = 'example_template.map_scale' 33 | self.map_path_file = 'example_template.map_path' 34 | self.map_post_file = 'example_template.map_post' 35 | self.ref_name = 'test_seq' 36 | self.npstr_dtype = 'U' 37 | if sys.version_info[0] < 3: 38 | self.npstr_dtype = 'S' 39 | 40 | # Open new file 41 | header = ['channel_number', 'offset', 'range', 'digitisation', 'sampling_rate'] 42 | channel_id = {x:0 for x in header} 43 | tracking_id = { 44 | 'exp_start_time': '1970-01-00T00:00:00Z', 45 | 'run_id': 'a'*32, 46 | 'flow_cell_id': 'FAH00000', 47 | } 48 | fakefile = tempfile.NamedTemporaryFile() 49 | self.fh = Fast5.New(fakefile.name, channel_id=channel_id, tracking_id=tracking_id, read='a') 50 | 51 | # load data to set within fast5 file 52 | self.model = np.genfromtxt(self.get_file_path(self.model_file), dtype=None, delimiter='\t', names=True, encoding='utf8') 53 | 54 | self.model['kmer'] = self.model['kmer'].astype(str) 55 | 56 | self.events = np.genfromtxt(self.get_file_path(self.events_file), dtype=None, delimiter='\t', names=True) 57 | 58 | # use namedtuple to imitate a Scale object 59 | Scale = namedtuple('Scale', ['shift', 'scale', 'drift', 'var', 'scale_sd', 'var_sd']) 60 | 61 | bc_scale = Scale(*np.genfromtxt(self.get_file_path(self.bc_scale_file), dtype=None, delimiter='\t')) 62 | bc_path = np.genfromtxt(self.get_file_path(self.bc_path_file), dtype=np.int32, delimiter='\t') 63 | 64 | self.fh.set_basecall_data(self.events, bc_scale, bc_path, self.model,
self.seq) 65 | 66 | map_scale = Scale(*np.genfromtxt(self.get_file_path(self.map_scale_file), dtype=None, delimiter='\t')) 67 | map_path = np.genfromtxt(self.get_file_path(self.map_path_file), dtype=np.int32, delimiter='\t') 68 | map_post = np.genfromtxt(self.get_file_path(self.map_post_file), delimiter='\t') 69 | 70 | n_states = len(self.seq) - len(self.model['kmer'][0]) + 1 71 | self.fh.set_mapping_data(self.events, map_scale, map_path, self.model, self.seq, self.ref_name) 72 | self.fh.set_mapping_data(self.events, map_scale, map_path, self.model, self.seq, self.ref_name, post=map_post) 73 | 74 | @classmethod 75 | def tearDownClass(self): 76 | self.fh.close() 77 | 78 | def test_000_basic_folder_structure(self): 79 | """Test root folder structure creation""" 80 | 81 | self.assertEqual(list(self.fh.keys()), ['Analyses', 'UniqueGlobalKey']) 82 | self.assertEqual(list(self.fh['/Analyses'].keys()), ['Basecall_1D_000', 'Squiggle_Map_000', 'Squiggle_Map_001']) 83 | 84 | def test_005_basecall_1d_folder_structure(self): 85 | """Test basecall 1d folder structure creation""" 86 | 87 | self.assertEqual(list(self.fh['/Analyses/Basecall_1D_000'].keys()), ['BaseCalled_template', 'Summary']) 88 | self.assertEqual(list(self.fh['/Analyses/Basecall_1D_000/BaseCalled_template'].keys()), ['Events', 'Fastq', 'Model']) 89 | 90 | def test_010_mapping_folder_structure(self): 91 | """Test mapping structure creation""" 92 | 93 | self.assertEqual(list(self.fh['/Analyses/Squiggle_Map_000'].keys()), ['SquiggleMapped_template', 'Summary']) 94 | self.assertEqual(list(self.fh['/Analyses/Squiggle_Map_000/SquiggleMapped_template'].keys()), ['Events', 'Model']) 95 | self.assertEqual(list(self.fh['/Analyses/Squiggle_Map_000/Summary'].keys()), ['squiggle_map_template']) 96 | 97 | self.assertEqual(list(self.fh['/Analyses/Squiggle_Map_001'].keys()), ['SquiggleMapped_template', 'Summary']) 98 | self.assertEqual(list(self.fh['/Analyses/Squiggle_Map_001/SquiggleMapped_template'].keys()), ['Events', 'Model']) 99 | self.assertEqual(list(self.fh['/Analyses/Squiggle_Map_001/Summary'].keys()), ['squiggle_map_template']) 100 | 101 | def test_015_fastq(self): 102 | """ Test fastq assembly and writing """ 103 | 104 | fastq = '@unknown\n{}\n+\n{}\n'.format(self.seq, self.qstring) 105 | self.assertEqual(_sanitize_data_for_reading(self.fh['/Analyses/Basecall_1D_000/BaseCalled_template/Fastq'][()]), fastq) 106 | 107 | def test_020_basecall_1d_event_writing(self): 108 | """Test basecall event writing""" 109 | 110 | input_events = self.events['mean'] 111 | output_events = self.fh['/Analyses/Basecall_1D_000/BaseCalled_template/Events']['mean'][()] 112 | nptest.assert_array_equal(input_events, output_events) 113 | 114 | def test_025_basecall_1d_event_reading(self): 115 | """Test basecall event reading with the getter function""" 116 | 117 | input_events = self.events['mean'] 118 | output_events = self.fh.get_basecall_data()['mean'] 119 | nptest.assert_array_equal(input_events, output_events) 120 | 121 | def test_030_mapping_event_writing(self): 122 | """Test mapping event writing""" 123 | 124 | input_events = self.events['mean'] 125 | output_events = self.fh['/Analyses/Squiggle_Map_000/SquiggleMapped_template/Events']['mean'][()] 126 | output_events_with_post = self.fh['/Analyses/Squiggle_Map_001/SquiggleMapped_template/Events']['mean'][()] 127 | 128 | nptest.assert_array_equal(input_events, output_events) 129 | nptest.assert_array_equal(input_events, output_events_with_post) 130 | 131 | def test_035_mapping_event_reading(self): 132 | """Test 
mapping event reading with the getter function""" 133 | 134 | input_means = self.events['mean'] 135 | events = self.fh.get_mapping_data() 136 | nptest.assert_array_equal(input_means, events['mean']) 137 | self.assertEqual(events['kmer'].dtype, np.dtype('|{}5'.format(self.npstr_dtype))) 138 | 139 | def test_036_mapping_event_reading_any(self): 140 | """Test mapping event reading with the I don't care function""" 141 | 142 | input_means = self.events['mean'] 143 | events = self.fh.get_mapping_data() 144 | nptest.assert_array_equal(input_means, events['mean']) 145 | self.assertEqual(events['kmer'].dtype, np.dtype('|{}5'.format(self.npstr_dtype))) 146 | 147 | 148 | if __name__ == '__main__': 149 | unittest.main() 150 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is execfile()d with the current directory set to its containing dir. 4 | # 5 | # Note that not all possible configuration values are present in this 6 | # autogenerated file. 7 | # 8 | # All configuration values have a default; values that are commented out 9 | # serve to show the default. 10 | 11 | import sys, os, re, subprocess 12 | import sphinx_rtd_theme 13 | 14 | # If extensions (or modules to document with autodoc) are in another directory, 15 | # add these directories to sys.path here. If the directory is relative to the 16 | # documentation root, use os.path.abspath to make it absolute, like shown here. 17 | sys.path.insert(0, os.path.abspath('..')) 18 | 19 | # -- General configuration ----------------------------------------------------- 20 | 21 | # If your documentation needs a minimal Sphinx version, state it here. 22 | #needs_sphinx = '1.0' 23 | 24 | # Add any Sphinx extension module names here, as strings. They can be extensions 25 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 26 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.intersphinx', 27 | 'sphinx.ext.mathjax'] 28 | mathjax_path = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | __pkg_name__ = u'fast5_research' 44 | project = __pkg_name__.capitalize() 45 | copyright = u'2017, Oxford Nanopore Technologies' 46 | 47 | # Generate API documentation: 48 | if subprocess.call(['sphinx-apidoc', '-o', './', "../{}".format(__pkg_name__)]) != 0: 49 | sys.stderr.write('Failed to generate API documentation!\n') 50 | 51 | # The version info for the project you're documenting, acts as replacement for 52 | # |version| and |release|, also used in various other places throughout the 53 | # built documents. 54 | # 55 | 56 | # Get the version number from __init__.py 57 | verstrline = open(os.path.join('..', __pkg_name__, '__init__.py'), 'r').read() 58 | vsre = r"^__version__ = ['\"]([^'\"]*)['\"]" 59 | mo = re.search(vsre, verstrline, re.M) 60 | if mo: 61 | __version__ = mo.group(1) 62 | else: 63 | raise RuntimeError('Unable to find version string in "{}/__init__.py".'.format(__pkg_name__)) 64 | 65 | # The short X.Y version. 
66 | version = __version__ 67 | # The full version, including alpha/beta/rc tags. 68 | release = __version__ 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | #language = None 73 | 74 | # There are two options for replacing |today|: either, you set today to some 75 | # non-false value, then it is used: 76 | #today = '' 77 | # Else, today_fmt is used as the format for a strftime call. 78 | #today_fmt = '%B %d, %Y' 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | exclude_patterns = ['_build','*test*'] 83 | 84 | # The reST default role (used for this markup: `text`) to use for all documents. 85 | #default_role = None 86 | 87 | # If true, '()' will be appended to :func: etc. cross-reference text. 88 | #add_function_parentheses = True 89 | 90 | # If true, the current module name will be prepended to all description 91 | # unit titles (such as .. function::). 92 | #add_module_names = True 93 | 94 | # If true, sectionauthor and moduleauthor directives will be shown in the 95 | # output. They are ignored by default. 96 | #show_authors = False 97 | 98 | # The name of the Pygments (syntax highlighting) style to use. 99 | pygments_style = 'sphinx' 100 | 101 | # A list of ignored prefixes for module index sorting. 102 | #modindex_common_prefix = [] 103 | 104 | 105 | # -- Options for HTML output --------------------------------------------------- 106 | 107 | # The theme to use for HTML and HTML Help pages. See the documentation for 108 | # a list of builtin themes. 109 | html_theme = 'sphinx_rtd_theme' 110 | 111 | # Theme options are theme-specific and customize the look and feel of a theme 112 | # further. For a list of options available for each theme, see the 113 | # documentation. 114 | #html_theme_options = {} 115 | 116 | # Add any paths that contain custom themes here, relative to this directory. 117 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 118 | 119 | # The name for this set of Sphinx documents. If None, it defaults to 120 | # " v documentation". 121 | #html_title = None 122 | 123 | # A shorter title for the navigation bar. Default is the same as html_title. 124 | #html_short_title = None 125 | 126 | # The name of an image file (relative to this directory) to place at the top 127 | # of the sidebar. 128 | #html_logo = None 129 | 130 | # The name of an image file (within the static path) to use as favicon of the 131 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 132 | # pixels large. 133 | #html_favicon = None 134 | 135 | # Add any paths that contain custom static files (such as style sheets) here, 136 | # relative to this directory. They are copied after the builtin static files, 137 | # so a file named "default.css" will overwrite the builtin "default.css". 138 | #html_static_path = ['_static'] 139 | 140 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 141 | # using the given strftime format. 142 | #html_last_updated_fmt = '%b %d, %Y' 143 | 144 | # If true, SmartyPants will be used to convert quotes and dashes to 145 | # typographically correct entities. 146 | #html_use_smartypants = True 147 | 148 | # Custom sidebar templates, maps document names to template names. 149 | #html_sidebars = {} 150 | 151 | # Additional templates that should be rendered to pages, maps page names to 152 | # template names. 
153 | #html_additional_pages = {} 154 | 155 | # If false, no module index is generated. 156 | #html_domain_indices = True 157 | 158 | # If false, no index is generated. 159 | #html_use_index = True 160 | 161 | # If true, the index is split into individual pages for each letter. 162 | #html_split_index = False 163 | 164 | # If true, links to the reST sources are added to the pages. 165 | #html_show_sourcelink = True 166 | 167 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 168 | #html_show_sphinx = True 169 | 170 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 171 | #html_show_copyright = True 172 | 173 | # If true, an OpenSearch description file will be output, and all pages will 174 | # contain a tag referring to it. The value of this option must be the 175 | # base URL from which the finished HTML is served. 176 | #html_use_opensearch = '' 177 | 178 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 179 | #html_file_suffix = None 180 | 181 | # Output file base name for HTML help builder. 182 | htmlhelp_basename = '{}doc'.format(project) 183 | 184 | 185 | # -- Options for LaTeX output -------------------------------------------------- 186 | 187 | latex_elements = { 188 | # The paper size ('letterpaper' or 'a4paper'). 189 | #'papersize': 'letterpaper', 190 | 191 | # The font size ('10pt', '11pt' or '12pt'). 192 | #'pointsize': '10pt', 193 | 194 | # Additional stuff for the LaTeX preamble. 195 | #'preamble': '', 196 | } 197 | 198 | # Grouping the document tree into LaTeX files. List of tuples 199 | # (source start file, target name, title, author, documentclass [howto/manual]). 200 | latex_documents = [ 201 | ('index', '{}.tex'.format(project), u'{} Documentation'.format(project), 202 | u'Oxford Nanopore Technologies', 'manual'), 203 | ] 204 | 205 | # The name of an image file (relative to this directory) to place at the top of 206 | # the title page. 207 | #latex_logo = None 208 | 209 | # For "manual" documents, if this is true, then toplevel headings are parts, 210 | # not chapters. 211 | #latex_use_parts = False 212 | 213 | # If true, show page references after internal links. 214 | #latex_show_pagerefs = False 215 | 216 | # If true, show URL addresses after external links. 217 | #latex_show_urls = False 218 | 219 | # Documents to append as an appendix to all manuals. 220 | #latex_appendices = [] 221 | 222 | # If false, no module index is generated. 223 | #latex_domain_indices = True 224 | 225 | 226 | # -- Options for manual page output -------------------------------------------- 227 | 228 | # One entry per manual page. List of tuples 229 | # (source start file, name, description, authors, manual section). 230 | man_pages = [ 231 | ('index', project, u'{} Documentation'.format(project), 232 | [u'Oxford Nanopore Technologies'], 1) 233 | ] 234 | 235 | # If true, show URL addresses after external links. 236 | #man_show_urls = False 237 | 238 | 239 | # -- Options for Texinfo output ------------------------------------------------ 240 | 241 | # Grouping the document tree into Texinfo files. List of tuples 242 | # (source start file, target name, title, author, 243 | # dir menu entry, description, category) 244 | texinfo_documents = [ 245 | ('index', project, u'{} Documentation'.format(project), 246 | u'Oxford Nanopore Technologies', project, 'One line description of project.', 247 | 'Miscellaneous'), 248 | ] 249 | 250 | # Documents to append as an appendix to all manuals. 
251 | #texinfo_appendices = [] 252 | 253 | # If false, no module index is generated. 254 | #texinfo_domain_indices = True 255 | 256 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 257 | #texinfo_show_urls = 'footnote' 258 | 259 | 260 | # -- Options for Epub output --------------------------------------------------- 261 | 262 | # Bibliographic Dublin Core info. 263 | epub_title = project 264 | epub_author = u'Oxford Nanopore Technologies' 265 | epub_publisher = u'Oxford Nanopore Technologies' 266 | epub_copyright = u'2017, Oxford Nanopore Technologies' 267 | 268 | # The language of the text. It defaults to the language option 269 | # or en if the language is not set. 270 | #epub_language = '' 271 | 272 | # The scheme of the identifier. Typical schemes are ISBN or URL. 273 | #epub_scheme = '' 274 | 275 | # The unique identifier of the text. This can be a ISBN number 276 | # or the project homepage. 277 | #epub_identifier = '' 278 | 279 | # A unique identification for the text. 280 | #epub_uid = '' 281 | 282 | # A tuple containing the cover image and cover page html template filenames. 283 | #epub_cover = () 284 | 285 | # HTML files that should be inserted before the pages created by sphinx. 286 | # The format is a list of tuples containing the path and title. 287 | #epub_pre_files = [] 288 | 289 | # HTML files shat should be inserted after the pages created by sphinx. 290 | # The format is a list of tuples containing the path and title. 291 | #epub_post_files = [] 292 | 293 | # A list of files that should not be packed into the epub file. 294 | #epub_exclude_files = [] 295 | 296 | # The depth of the table of contents in toc.ncx. 297 | #epub_tocdepth = 3 298 | 299 | # Allow duplicate toc entries. 300 | #epub_tocdup = True 301 | -------------------------------------------------------------------------------- /fast5_research/test/test_fast5.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import types 4 | import unittest 5 | from uuid import uuid4 6 | 7 | import h5py 8 | import numpy as np 9 | 10 | from fast5_research import Fast5 11 | 12 | class Fast5API(unittest.TestCase): 13 | test_file = 'example_basecall_squiggle_mapping.fast5' 14 | additional_file = 'additional_test_file.fast5' 15 | 16 | def setUp(self): 17 | self.h = Fast5(os.path.join( 18 | os.path.dirname(__file__), 'data', self.test_file 19 | )) 20 | 21 | 22 | self.additional_h = Fast5(os.path.join( 23 | os.path.dirname(__file__), 'data', self.additional_file 24 | )) 25 | 26 | # Use to create new temp files 27 | self.tmp_events_float = np.array( 28 | [(0.0, 1.0, 10.0, 2.0)], 29 | dtype=[(x, 'float') for x in ['start','length', 'mean', 'stdv']] 30 | ) 31 | self.tmp_events_int = np.array( 32 | [(0, 5000, 10.0, 2.0)], 33 | dtype=[ 34 | ('start', 'uint32'), ('length', 'uint32'), 35 | ('mean', 'float'), ('stdv', 'float') 36 | ] 37 | ) 38 | self.tmp_raw = np.ones(15, dtype=np.int16) 39 | 40 | self.tmp_channel_id = { 41 | 'channel_number': 1, 42 | 'range': 1.0, 43 | 'digitisation': 1.0, 44 | 'offset': 0.0, 45 | 'sample_rate': 5000.0, 46 | 'sampling_rate': 5000.0 47 | } 48 | self.tmp_read_id = { 49 | 'start_time': 0.0, 50 | 'duration': 1.0, 51 | 'read_number': 1, 52 | 'start_mux': 1, 53 | 'read_id': str(uuid4()), 54 | 'scaling_used': 1, 55 | 'median_before': 0 56 | } 57 | self.tmp_tracking_id = { 58 | 'exp_start_time': '1970-01-01T00:00:00Z', 59 | 'run_id': str(uuid4()).replace('-',''), 60 | 'flow_cell_id': 'FAH00000', 61 | } 62 | 63 | 64 | def 
tearDown(self): 65 | self.h.close() 66 | self.additional_h.close() 67 | 68 | @classmethod 69 | def setUpClass(self): 70 | print('* Fast5 API') 71 | 72 | 73 | def test_000_basic_functions(self): 74 | # Just test an inherited member 75 | self.assertEqual( 76 | os.path.basename(self.h.filename), self.test_file, 77 | 'Inherited member attribute not correct.' 78 | ) 79 | 80 | # We shouldn't be writable by default 81 | self.assertFalse(self.h.writable, 'File is not non-writable by default.') 82 | 83 | def test_010_get_meta(self): 84 | self.assertSetEqual( 85 | set(self.h.attributes.keys()), 86 | { 87 | 'scaling_used', 'median_before', 88 | 'start_time', 'read_number', 89 | 'abasic_found', 'duration', 'start_mux' 90 | }, 91 | '.attributes does not contain expected fields.' 92 | ) 93 | 94 | self.assertSetEqual( 95 | set(self.h.channel_meta.keys()), 96 | { 97 | 'channel_number', 'range', 'offset', 98 | 'digitisation', 'sampling_rate', 99 | }, 100 | '.channel_meta does not contain expected fields.' 101 | ) 102 | 103 | self.assertTrue( 104 | { 105 | 'strand_duration', 'pore_before', 'abasic', 106 | 'start_time', 'mux', 'channel', 'filename' 107 | }.issubset(self.h.summary().keys()), 108 | '.summary does not contain expected fields.' 109 | ) 110 | 111 | # Duration and start_time should be int, not float (samples, not times) 112 | for key in ['duration', 'start_time']: 113 | self.assertIsInstance( 114 | self.h.attributes[key], int 115 | ) 116 | 117 | def test_020_get_reads_et_al(self): 118 | reads = self.h.get_reads() 119 | try: 120 | read = reads.next() 121 | except AttributeError: 122 | read = next(reads) 123 | self.assertIsInstance( 124 | reads, types.GeneratorType, 125 | '.get_reads() does not give generator.' 126 | ) 127 | self.assertIsInstance( 128 | read, np.ndarray, 129 | '.get_reads().next() does not give numpy array by default.' 130 | ) 131 | self.assertSequenceEqual( 132 | read.dtype.names, ['start', 'length', 'mean', 'stdv'], 133 | '.get_reads().next() does not give "event data".' 134 | ) 135 | reads = self.h.get_reads(group=True) 136 | try: 137 | read = reads.next() 138 | except AttributeError: 139 | read = next(reads) 140 | self.assertIsInstance( 141 | read, h5py._hl.group.Group, 142 | '.get_reads().next() does not give h5py group when asked.' 143 | ) 144 | 145 | def test_030_analysis_locations(self): 146 | path = self.h.get_analysis_latest('Basecall_1D') 147 | self.assertEqual( 148 | '/Analyses/Basecall_1D_000', path, 149 | '.get_analysis_latest() does not return correct.' 150 | ) 151 | 152 | path = self.h.get_analysis_new('Basecall_1D') 153 | self.assertEqual( 154 | '/Analyses/Basecall_1D_001', path, 155 | '.get_analysis_new() does not return correct.' 
156 | ) 157 | 158 | def test_040_split_data_legacy(self): 159 | indices = self.h.get_section_indices() 160 | self.assertIsInstance( 161 | indices, tuple, 162 | '.get_section_indices() does not give tuple' 163 | ) 164 | 165 | for i in range(2): 166 | self.assertIsInstance( 167 | indices[i], tuple, 168 | '.get_section_indices() does not give tuple of tuple, item {}'.format(i) 169 | ) 170 | 171 | def test_042_split_data_linear(self): 172 | indices = self.additional_h.get_section_indices() 173 | self.assertIsInstance( 174 | indices, tuple, 175 | '.get_section_indices() does not give tuple' 176 | ) 177 | 178 | for i in range(2): 179 | self.assertIsInstance( 180 | indices[i], tuple, 181 | '.get_section_indices() does not give tuple of tuple, item {}'.format(i) 182 | ) 183 | 184 | def test_045_split_data_events(self): 185 | for section in ('template', 'complement'): 186 | read = self.h.get_section_events(section) 187 | self.assertIsInstance( 188 | read, np.ndarray, 189 | '.get_section_events({}) does not give numpy array by default.'.format(section) 190 | ) 191 | 192 | 193 | def test_050_sequence_data(self): 194 | for section in ('template', 'complement'): 195 | call = self.h.get_fastq( 196 | 'Basecall_1D', section 197 | ) 198 | self.assertIsInstance(call, str, '{} call is not str.'.format(section)) 199 | 200 | # Check ValueError raised when requesting absent data 201 | self.assertRaises( 202 | ValueError, self.h.get_fastq, 'Basecall_1D', '2D' 203 | ) 204 | 205 | 206 | def test_060_construct_new_file_checks(self): 207 | tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4())) 208 | 209 | with self.assertRaises(IOError): 210 | fh = Fast5.New(tmp_file, 'r') 211 | fh = Fast5.New(tmp_file, 'a', channel_id = self.tmp_channel_id) 212 | fh = Fast5.New(tmp_file, 'a', tracking_id=self.tmp_tracking_id) 213 | 214 | # This should be fine 215 | with Fast5.New(tmp_file, 'a', channel_id = self.tmp_channel_id, tracking_id=self.tmp_tracking_id) as h: 216 | h.set_read(self.tmp_events_float, self.tmp_read_id) 217 | 218 | 219 | def test_061_write_read_float_data(self): 220 | tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4())) 221 | 222 | with Fast5.New(tmp_file, 'a', channel_id = self.tmp_channel_id, tracking_id=self.tmp_tracking_id) as h: 223 | h.set_read(self.tmp_events_float, self.tmp_read_id) 224 | 225 | # Metadata duration and start_time should be integers, not floats 226 | with Fast5(tmp_file, 'r') as h: 227 | for key in ['duration', 'start_time']: 228 | self.assertIsInstance( 229 | h.attributes[key], int 230 | ) 231 | 232 | 233 | with Fast5(tmp_file) as h: 234 | events = h.get_read() 235 | self.assertEqual(events['start'].dtype.descr[0][1], ' 0)[0][0] 237 | unscaled_voltage = self.fh.get_voltage(use_scaling=False) 238 | self.assertNotEqual(voltage[index], unscaled_voltage[index]) 239 | 240 | def test_parse_state_data(self): 241 | """Test parsing of state data""" 242 | states = self.fh.get_state_changes(self.fh.channels[0]) 243 | self.assertEqual(len(states), 43) 244 | 245 | def test_get_state_by_raw_index(self): 246 | """Test channel state at a give raw index""" 247 | 248 | state = self.fh.get_state(self.fh.channels[0], raw_index=100) 249 | self.assertEqual(state, 'unclassified') 250 | 251 | state = self.fh.get_state(self.fh.channels[0], raw_index=61000) 252 | self.assertEqual(state, 'inrange') 253 | 254 | # now test another channel - this might fail if caching has gone wrong 255 | state = self.fh.get_state(self.fh.channels[1], raw_index=61000) 256 | self.assertEqual(state, 'saturated') 257 | 258 
| def test_get_state_by_time(self): 259 | """Test channel state at a given time""" 260 | state = self.fh.get_state(self.fh.channels[0], time=100/self.fh.sample_rate) 261 | self.assertEqual(state, 'unclassified') 262 | 263 | state = self.fh.get_state(self.fh.channels[0], time=61000/self.fh.sample_rate) 264 | self.assertEqual(state, 'inrange') 265 | 266 | state = self.fh.get_state(self.fh.channels[1], time=61000/self.fh.sample_rate) 267 | self.assertEqual(state, 'saturated') 268 | 269 | def test_get_states_in_window_by_raw_index(self): 270 | """Test get_states_in_window using a window specified in raw indices""" 271 | inds = (3045000, 3930001) 272 | states = self.fh.get_states_in_window(self.fh.channels[0], raw_indices=inds) 273 | expected = np.array(['above', 'inrange', 'unclassified_following_reset', 'unusable_pore'], dtype='U28') 274 | 275 | assert np.all(states == expected) 276 | states = self.fh.get_states_in_window(self.fh.channels[1], raw_indices=inds) 277 | expected = np.array(['above', 'inrange', 'unclassified_following_reset'], dtype='U28') 278 | assert np.all(states == expected) 279 | 280 | def test_get_states_in_window_by_times(self): 281 | """Test get_states_in_window using a window specified in times""" 282 | times = (3045000.0 / self.fh.sample_rate, 3930001.0 / self.fh.sample_rate) 283 | states = self.fh.get_states_in_window(self.fh.channels[0], times=times) 284 | expected = np.array(['above', 'inrange', 'unclassified_following_reset', 'unusable_pore'], dtype='U28') 285 | assert np.all(states == expected) 286 | states = self.fh.get_states_in_window(self.fh.channels[1], times=times) 287 | expected = np.array(['above', 'inrange', 'unclassified_following_reset'], dtype='U28') 288 | assert np.all(states == expected) 289 | 290 | 291 | class BulkABFFast5Test(BulkFast5Test): 292 | 293 | example_file = 'abf2bulkfast5.fast5' 294 | 295 | def setUp(self): 296 | self.filepath = os.path.join( 297 | os.path.dirname(__file__), 'data', self.example_file 298 | ) 299 | self.fh = BulkFast5(self.filepath) 300 | 301 | def tearDown(self): 302 | self.fh.close() 303 | 304 | @classmethod 305 | def setUpClass(self): 306 | print('\n* Bulk ABF Fast5') 307 | 308 | # tests to skip 309 | @unittest.skip("Skipping test_parse_experimental_metadata") 310 | def test_parse_experimental_metadata(self): 311 | pass 312 | 313 | @unittest.skip("Skipping test_parse_temperature") 314 | def test_parse_temperature(self): 315 | pass 316 | 317 | @unittest.skip("Skipping test_parse_waveform_timings") 318 | def test_parse_waveform_timings(self): 319 | pass 320 | 321 | @unittest.skip("Skipping test_parse_read_data") 322 | def test_parse_read_data(self): 323 | pass 324 | 325 | @unittest.skip("Skipping test_parse_state_data") 326 | def test_parse_state_data(self): 327 | """Test parsing of state data""" 328 | pass 329 | 330 | @unittest.skip("Skipping test_get_state_by_raw_index") 331 | def test_get_state_by_raw_index(self): 332 | """Test channel state at a given raw index""" 333 | pass 334 | 335 | @unittest.skip("Skipping test_get_state_by_time") 336 | def test_get_state_by_time(self): 337 | """Test channel state at a given time""" 338 | pass 339 | 340 | @unittest.skip("Skipping test_get_states_in_window_by_raw_index") 341 | def test_get_states_in_window_by_raw_index(self): 342 | """Test get_states_in_window using a window specified in raw indices""" 343 | pass 344 | 345 | @unittest.skip("Skipping test_get_states_in_window_by_times") 346 | def test_get_states_in_window_by_times(self): 347 | """Test get_states_in_window
using a window specified in times""" 348 | pass 349 | 350 | def test_raw_data_raises_exception_if_absent(self): 351 | """Test parsing raw data from a channel without raw data raises an exception.""" 352 | with self.assertRaises(KeyError): 353 | self.fh.get_raw(2) 354 | 355 | def test_parse_raw_data(self): 356 | """Test parsing the whole raw dataset""" 357 | raw = self.fh.get_raw(self.fh.channels[0]) 358 | self.assertEqual(len(raw), 10000) 359 | 360 | def test_parse_event_data_len(self): 361 | """Test parsing the whole event dataset""" 362 | events = self.fh.get_events(self.fh.channels[0]) 363 | self.assertEqual(len(events), 5) 364 | 365 | def test_get_mux_changes(self): 366 | """Test parsing of mux changes""" 367 | mux_changes = list(self.fh.get_mux_changes(self.fh.channels[0])) 368 | self.assertEqual(len(mux_changes), 1) 369 | self.assertTupleEqual((0, 1), tuple(mux_changes[0])) 370 | 371 | 372 | if __name__ == "__main__": 373 | unittest.main() 374 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | This Source Code Form is subject to the terms of the Mozilla Public 2 | License, v. 2.0. If a copy of the MPL was not distributed with this 3 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | (c) 2016 Oxford Nanopore Technologies Ltd. 6 | 7 | 8 | Mozilla Public License Version 2.0 9 | ================================== 10 | 11 | ### 1. Definitions 12 | 13 | **1.1. “Contributor”** 14 | means each individual or legal entity that creates, contributes to 15 | the creation of, or owns Covered Software. 16 | 17 | **1.2. “Contributor Version”** 18 | means the combination of the Contributions of others (if any) used 19 | by a Contributor and that particular Contributor's Contribution. 20 | 21 | **1.3. “Contribution”** 22 | means Covered Software of a particular Contributor. 23 | 24 | **1.4. “Covered Software”** 25 | means Source Code Form to which the initial Contributor has attached 26 | the notice in Exhibit A, the Executable Form of such Source Code 27 | Form, and Modifications of such Source Code Form, in each case 28 | including portions thereof. 29 | 30 | **1.5. “Incompatible With Secondary Licenses”** 31 | means 32 | 33 | * **(a)** that the initial Contributor has attached the notice described 34 | in Exhibit B to the Covered Software; or 35 | * **(b)** that the Covered Software was made available under the terms of 36 | version 1.1 or earlier of the License, but not also under the 37 | terms of a Secondary License. 38 | 39 | **1.6. “Executable Form”** 40 | means any form of the work other than Source Code Form. 41 | 42 | **1.7. “Larger Work”** 43 | means a work that combines Covered Software with other material, in 44 | a separate file or files, that is not Covered Software. 45 | 46 | **1.8. “License”** 47 | means this document. 48 | 49 | **1.9. “Licensable”** 50 | means having the right to grant, to the maximum extent possible, 51 | whether at the time of the initial grant or subsequently, any and 52 | all of the rights conveyed by this License. 53 | 54 | **1.10. “Modifications”** 55 | means any of the following: 56 | 57 | * **(a)** any file in Source Code Form that results from an addition to, 58 | deletion from, or modification of the contents of Covered 59 | Software; or 60 | * **(b)** any new file in Source Code Form that contains any Covered 61 | Software. 62 | 63 | **1.11. 
“Patent Claims” of a Contributor** 64 | means any patent claim(s), including without limitation, method, 65 | process, and apparatus claims, in any patent Licensable by such 66 | Contributor that would be infringed, but for the grant of the 67 | License, by the making, using, selling, offering for sale, having 68 | made, import, or transfer of either its Contributions or its 69 | Contributor Version. 70 | 71 | **1.12. “Secondary License”** 72 | means either the GNU General Public License, Version 2.0, the GNU 73 | Lesser General Public License, Version 2.1, the GNU Affero General 74 | Public License, Version 3.0, or any later versions of those 75 | licenses. 76 | 77 | **1.13. “Source Code Form”** 78 | means the form of the work preferred for making modifications. 79 | 80 | **1.14. “You” (or “Your”)** 81 | means an individual or a legal entity exercising rights under this 82 | License. For legal entities, “You” includes any entity that 83 | controls, is controlled by, or is under common control with You. For 84 | purposes of this definition, “control” means **(a)** the power, direct 85 | or indirect, to cause the direction or management of such entity, 86 | whether by contract or otherwise, or **(b)** ownership of more than 87 | fifty percent (50%) of the outstanding shares or beneficial 88 | ownership of such entity. 89 | 90 | 91 | ### 2. License Grants and Conditions 92 | 93 | #### 2.1. Grants 94 | 95 | Each Contributor hereby grants You a world-wide, royalty-free, 96 | non-exclusive license: 97 | 98 | * **(a)** under intellectual property rights (other than patent or trademark) 99 | Licensable by such Contributor to use, reproduce, make available, 100 | modify, display, perform, distribute, and otherwise exploit its 101 | Contributions, either on an unmodified basis, with Modifications, or 102 | as part of a Larger Work; and 103 | * **(b)** under Patent Claims of such Contributor to make, use, sell, offer 104 | for sale, have made, import, and otherwise transfer either its 105 | Contributions or its Contributor Version. 106 | 107 | #### 2.2. Effective Date 108 | 109 | The licenses granted in Section 2.1 with respect to any Contribution 110 | become effective for each Contribution on the date the Contributor first 111 | distributes such Contribution. 112 | 113 | #### 2.3. Limitations on Grant Scope 114 | 115 | The licenses granted in this Section 2 are the only rights granted under 116 | this License. No additional rights or licenses will be implied from the 117 | distribution or licensing of Covered Software under this License. 118 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 119 | Contributor: 120 | 121 | * **(a)** for any code that a Contributor has removed from Covered Software; 122 | or 123 | * **(b)** for infringements caused by: **(i)** Your and any other third party's 124 | modifications of Covered Software, or **(ii)** the combination of its 125 | Contributions with other software (except as part of its Contributor 126 | Version); or 127 | * **(c)** under Patent Claims infringed by Covered Software in the absence of 128 | its Contributions. 129 | 130 | This License does not grant any rights in the trademarks, service marks, 131 | or logos of any Contributor (except as may be necessary to comply with 132 | the notice requirements in Section 3.4). 133 | 134 | #### 2.4. 
Subsequent Licenses 135 | 136 | No Contributor makes additional grants as a result of Your choice to 137 | distribute the Covered Software under a subsequent version of this 138 | License (see Section 10.2) or under the terms of a Secondary License (if 139 | permitted under the terms of Section 3.3). 140 | 141 | #### 2.5. Representation 142 | 143 | Each Contributor represents that the Contributor believes its 144 | Contributions are its original creation(s) or it has sufficient rights 145 | to grant the rights to its Contributions conveyed by this License. 146 | 147 | #### 2.6. Fair Use 148 | 149 | This License is not intended to limit any rights You have under 150 | applicable copyright doctrines of fair use, fair dealing, or other 151 | equivalents. 152 | 153 | #### 2.7. Conditions 154 | 155 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 156 | in Section 2.1. 157 | 158 | 159 | ### 3. Responsibilities 160 | 161 | #### 3.1. Distribution of Source Form 162 | 163 | All distribution of Covered Software in Source Code Form, including any 164 | Modifications that You create or to which You contribute, must be under 165 | the terms of this License. You must inform recipients that the Source 166 | Code Form of the Covered Software is governed by the terms of this 167 | License, and how they can obtain a copy of this License. You may not 168 | attempt to alter or restrict the recipients' rights in the Source Code 169 | Form. 170 | 171 | #### 3.2. Distribution of Executable Form 172 | 173 | If You distribute Covered Software in Executable Form then: 174 | 175 | * **(a)** such Covered Software must also be made available in Source Code 176 | Form, as described in Section 3.1, and You must inform recipients of 177 | the Executable Form how they can obtain a copy of such Source Code 178 | Form by reasonable means in a timely manner, at a charge no more 179 | than the cost of distribution to the recipient; and 180 | 181 | * **(b)** You may distribute such Executable Form under the terms of this 182 | License, or sublicense it under different terms, provided that the 183 | license for the Executable Form does not attempt to limit or alter 184 | the recipients' rights in the Source Code Form under this License. 185 | 186 | #### 3.3. Distribution of a Larger Work 187 | 188 | You may create and distribute a Larger Work under terms of Your choice, 189 | provided that You also comply with the requirements of this License for 190 | the Covered Software. If the Larger Work is a combination of Covered 191 | Software with a work governed by one or more Secondary Licenses, and the 192 | Covered Software is not Incompatible With Secondary Licenses, this 193 | License permits You to additionally distribute such Covered Software 194 | under the terms of such Secondary License(s), so that the recipient of 195 | the Larger Work may, at their option, further distribute the Covered 196 | Software under the terms of either this License or such Secondary 197 | License(s). 198 | 199 | #### 3.4. Notices 200 | 201 | You may not remove or alter the substance of any license notices 202 | (including copyright notices, patent notices, disclaimers of warranty, 203 | or limitations of liability) contained within the Source Code Form of 204 | the Covered Software, except that You may alter any license notices to 205 | the extent required to remedy known factual inaccuracies. 206 | 207 | #### 3.5. 
Application of Additional Terms 208 | 209 | You may choose to offer, and to charge a fee for, warranty, support, 210 | indemnity or liability obligations to one or more recipients of Covered 211 | Software. However, You may do so only on Your own behalf, and not on 212 | behalf of any Contributor. You must make it absolutely clear that any 213 | such warranty, support, indemnity, or liability obligation is offered by 214 | You alone, and You hereby agree to indemnify every Contributor for any 215 | liability incurred by such Contributor as a result of warranty, support, 216 | indemnity or liability terms You offer. You may include additional 217 | disclaimers of warranty and limitations of liability specific to any 218 | jurisdiction. 219 | 220 | 221 | ### 4. Inability to Comply Due to Statute or Regulation 222 | 223 | If it is impossible for You to comply with any of the terms of this 224 | License with respect to some or all of the Covered Software due to 225 | statute, judicial order, or regulation then You must: **(a)** comply with 226 | the terms of this License to the maximum extent possible; and **(b)** 227 | describe the limitations and the code they affect. Such description must 228 | be placed in a text file included with all distributions of the Covered 229 | Software under this License. Except to the extent prohibited by statute 230 | or regulation, such description must be sufficiently detailed for a 231 | recipient of ordinary skill to be able to understand it. 232 | 233 | 234 | ### 5. Termination 235 | 236 | **5.1.** The rights granted under this License will terminate automatically 237 | if You fail to comply with any of its terms. However, if You become 238 | compliant, then the rights granted under this License from a particular 239 | Contributor are reinstated **(a)** provisionally, unless and until such 240 | Contributor explicitly and finally terminates Your grants, and **(b)** on an 241 | ongoing basis, if such Contributor fails to notify You of the 242 | non-compliance by some reasonable means prior to 60 days after You have 243 | come back into compliance. Moreover, Your grants from a particular 244 | Contributor are reinstated on an ongoing basis if such Contributor 245 | notifies You of the non-compliance by some reasonable means, this is the 246 | first time You have received notice of non-compliance with this License 247 | from such Contributor, and You become compliant prior to 30 days after 248 | Your receipt of the notice. 249 | 250 | **5.2.** If You initiate litigation against any entity by asserting a patent 251 | infringement claim (excluding declaratory judgment actions, 252 | counter-claims, and cross-claims) alleging that a Contributor Version 253 | directly or indirectly infringes any patent, then the rights granted to 254 | You by any and all Contributors for the Covered Software under Section 255 | 2.1 of this License shall terminate. 256 | 257 | **5.3.** In the event of termination under Sections 5.1 or 5.2 above, all 258 | end user license agreements (excluding distributors and resellers) which 259 | have been validly granted by You or Your distributors under this License 260 | prior to termination shall survive termination. 261 | 262 | 263 | ### 6. 
Disclaimer of Warranty 264 | 265 | > Covered Software is provided under this License on an “as is” 266 | > basis, without warranty of any kind, either expressed, implied, or 267 | > statutory, including, without limitation, warranties that the 268 | > Covered Software is free of defects, merchantable, fit for a 269 | > particular purpose or non-infringing. The entire risk as to the 270 | > quality and performance of the Covered Software is with You. 271 | > Should any Covered Software prove defective in any respect, You 272 | > (not any Contributor) assume the cost of any necessary servicing, 273 | > repair, or correction. This disclaimer of warranty constitutes an 274 | > essential part of this License. No use of any Covered Software is 275 | > authorized under this License except under this disclaimer. 276 | 277 | ### 7. Limitation of Liability 278 | 279 | > Under no circumstances and under no legal theory, whether tort 280 | > (including negligence), contract, or otherwise, shall any 281 | > Contributor, or anyone who distributes Covered Software as 282 | > permitted above, be liable to You for any direct, indirect, 283 | > special, incidental, or consequential damages of any character 284 | > including, without limitation, damages for lost profits, loss of 285 | > goodwill, work stoppage, computer failure or malfunction, or any 286 | > and all other commercial damages or losses, even if such party 287 | > shall have been informed of the possibility of such damages. This 288 | > limitation of liability shall not apply to liability for death or 289 | > personal injury resulting from such party's negligence to the 290 | > extent applicable law prohibits such limitation. Some 291 | > jurisdictions do not allow the exclusion or limitation of 292 | > incidental or consequential damages, so this exclusion and 293 | > limitation may not apply to You. 294 | 295 | 296 | ### 8. Litigation 297 | 298 | Any litigation relating to this License may be brought only in the 299 | courts of a jurisdiction where the defendant maintains its principal 300 | place of business and such litigation shall be governed by laws of that 301 | jurisdiction, without reference to its conflict-of-law provisions. 302 | Nothing in this Section shall prevent a party's ability to bring 303 | cross-claims or counter-claims. 304 | 305 | 306 | ### 9. Miscellaneous 307 | 308 | This License represents the complete agreement concerning the subject 309 | matter hereof. If any provision of this License is held to be 310 | unenforceable, such provision shall be reformed only to the extent 311 | necessary to make it enforceable. Any law or regulation which provides 312 | that the language of a contract shall be construed against the drafter 313 | shall not be used to construe this License against a Contributor. 314 | 315 | 316 | ### 10. Versions of the License 317 | 318 | #### 10.1. New Versions 319 | 320 | Mozilla Foundation is the license steward. Except as provided in Section 321 | 10.3, no one other than the license steward has the right to modify or 322 | publish new versions of this License. Each version will be given a 323 | distinguishing version number. 324 | 325 | #### 10.2. Effect of New Versions 326 | 327 | You may distribute the Covered Software under the terms of the version 328 | of the License under which You originally received the Covered Software, 329 | or under the terms of any subsequent version published by the license 330 | steward. 331 | 332 | #### 10.3. 
Modified Versions 333 | 334 | If you create software not governed by this License, and you want to 335 | create a new license for such software, you may create and use a 336 | modified version of this License if you rename the license and remove 337 | any references to the name of the license steward (except to note that 338 | such modified license differs from this License). 339 | 340 | #### 10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses 341 | 342 | If You choose to distribute Source Code Form that is Incompatible With 343 | Secondary Licenses under the terms of this version of the License, the 344 | notice described in Exhibit B of this License must be attached. 345 | 346 | ## Exhibit A - Source Code Form License Notice 347 | 348 | This Source Code Form is subject to the terms of the Mozilla Public 349 | License, v. 2.0. If a copy of the MPL was not distributed with this 350 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 351 | 352 | If it is not possible or desirable to put the notice in a particular 353 | file, then You may include the notice in a location (such as a LICENSE 354 | file in a relevant directory) where a recipient would be likely to look 355 | for such a notice. 356 | 357 | You may add additional accurate notices of copyright ownership. 358 | 359 | ## Exhibit B - “Incompatible With Secondary Licenses” Notice 360 | 361 | This Source Code Form is "Incompatible With Secondary Licenses", as 362 | defined by the Mozilla Public License, v. 2.0. 363 | 364 | 365 | -------------------------------------------------------------------------------- /fast5_research/util.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from itertools import tee 3 | from math import pow, log10 4 | import os 5 | import sys 6 | 7 | import numpy as np 8 | import numpy.lib.recfunctions as nprf 9 | 10 | 11 | def qstring_to_phred(quality): 12 | """Compute standard phred scores from a quality string.""" 13 | qscores = [ord(q) - 33 for q in quality] 14 | return qscores 15 | 16 | 17 | def mean_qscore(scores): 18 | """Returns the phred score corresponding to the mean of the probabilities 19 | associated with the phred scores provided. Taken from chimaera.common.utilities. 20 | 21 | 22 | :param scores: Iterable of phred scores. 23 | 24 | :returns: Phred score corresponding to the average error rate, as 25 | estimated from the input phred scores. 26 | """ 27 | if len(scores) == 0: 28 | return 0.0 29 | sum_prob = 0.0 30 | for val in scores: 31 | sum_prob += pow(10, -0.1 * val) 32 | mean_prob = sum_prob / len(scores) 33 | return -10.0 * log10(mean_prob) 34 | 35 | 36 | def kmer_overlap_gen(kmers, moves=None): 37 | """From a list of kmers return the character shifts between them. 38 | (Movement from i to i+1 entry, e.g. [AATC,ATCG] returns [0,1]). 39 | Allowed moves may be specified in moves argument in order of preference. 40 | Taken from dragonet.bio.seq_tools 41 | 42 | :param kmers: sequence of kmer strings. 43 | :param moves: allowed movements, if None all movements to length of kmer 44 | are allowed. 
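    A transition that cannot be explained by any allowed move is reported as a move of the full kmer length (see the for/else clause in the body below).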
45 | """ 46 | 47 | first = True 48 | yield 0 49 | for last_kmer, this_kmer in window(kmers, 2): 50 | if first: 51 | if moves is None: 52 | l = len(this_kmer) 53 | moves = range(l + 1) 54 | first = False 55 | 56 | l = len(this_kmer) 57 | for j in moves: 58 | if j < 0: 59 | if last_kmer[:j] == this_kmer[-j:]: 60 | yield j 61 | break 62 | elif j > 0 and j < l: 63 | if last_kmer[j:l] == this_kmer[0:-j]: 64 | yield j 65 | break 66 | elif j == 0: 67 | if last_kmer == this_kmer: 68 | yield 0 69 | break 70 | else: 71 | yield l 72 | break 73 | 74 | 75 | def build_mapping_table(events, ref_seq, post, scale, path, model): 76 | """Build a mapping table based on output of a dragonet.mapper style object. 77 | Taken from chimaera.common.utilities. 78 | 79 | :param events: Numpy record array of events. Must contain the mean, 80 | stdv, start and length fields. 81 | :param ref_seq: String representation of the reference sequence. 82 | :param post: Numpy 2D array containing the posteriors (event, state). 83 | :param scale: Scaling object. 84 | :param path: Numpy 1D array containing position in reference. May contain 85 | negative values, which will be interpreted as "bad emissions". 86 | :param model: Model object to use. 87 | 88 | :returns: numpy record array containing summary fields. One record per event. 89 | 90 | ==================== ===================================================== 91 | Output Field Description 92 | ==================== ===================================================== 93 | *mean* mean value of event samples (level) 94 | *scaled_mean* *mean* scaled to the bare level emission (mean/mode) 95 | *stdv* standard deviation of event samples (noise) 96 | *scaled_stdv* *stdv* scaled to the bare stdv emission (mode) 97 | *start* start time of event /s 98 | *length* length of event /s 99 | *model_level* modelled event level, i.e. the level emission 100 | associated with the kmer *kmer*, scaled to the data 101 | *model_scaled_level* bare level emission 102 | *model_sd* modelled event noise, i.e. 
the sd emission associated 103 | with the kmer *kmer*, scaled to the data 104 | *model_scaled_sd* bare noise emission 105 | *seq_pos* aligned sequence position, position on Viterbi path 106 | *p_seq_pos* posterior probability of states on Viterbi path 107 | *kmer* kmer identity of *seq_pos* 108 | *mp_pos* aligned sequence position, position with highest 109 | posterior 110 | *p_mp_pos* posterior probability of most probable states 111 | *mp_kmer* kmer identity of *mp_pos* 112 | *good_emission* whether or not the HMM has tagged event as fitting 113 | the model 114 | ==================== ===================================================== 115 | 116 | """ 117 | kmer_len = len(model['kmer'][0]) 118 | 119 | kmer_index = seq_to_kmers(ref_seq, kmer_len) 120 | label_index = dict((j,i) for i,j in enumerate(model['kmer'])) 121 | kmer_dtype = '|S{}'.format(kmer_len) 122 | 123 | column_names = ['mean', 'scaled_mean', 'stdv', 'scaled_stdv', 'start', 'length', 124 | 'model_level', 'model_scaled_level', 'model_sd', 'model_scaled_sd', 125 | 'p_seq_pos', 'p_mp_pos', 'seq_pos', 'mp_pos', 'move', 'good_emission', 126 | 'kmer', 'mp_kmer'] 127 | column_types = [float] * 12 + [int] * 3 + [bool] + [kmer_dtype] * 2 128 | table = np.zeros(events.size, dtype=list(zip(column_names, column_types))) 129 | 130 | zero_start = events['start'] - events['start'][0] 131 | 132 | # Sequence position 133 | seq_pos = np.where(path >= 0, path, np.abs(path) - 1) 134 | seq_kmer = [kmer_index[x] for x in seq_pos] 135 | seq_kmer_i = [label_index[i] for i in seq_kmer] 136 | 137 | table['seq_pos'] = seq_pos 138 | table['kmer'] = seq_kmer 139 | table['p_seq_pos'] = post[range(post.shape[0]), seq_pos] 140 | table['move'] = np.ediff1d(seq_pos, to_begin=[0]) 141 | # Highest posterior positions 142 | mp_pos = np.argmax(post, axis=1) 143 | table['mp_pos'] = mp_pos 144 | table['mp_kmer'] = [kmer_index[x] for x in mp_pos] 145 | table['p_mp_pos'] = post[range(post.shape[0]), table['mp_pos']] 146 | # The data 147 | for x in ('mean', 'start','length', 'stdv'): 148 | table[x] = events[x] 149 | # scaling data to model 150 | table['scaled_mean'] = (table['mean'] - scale.shift - scale.drift * zero_start) / scale.scale 151 | table['scaled_stdv'] = table['stdv'] / scale.scale_sd 152 | # The model 153 | table['model_scaled_level'] = model['level_mean'][seq_kmer_i] 154 | table['model_scaled_sd'] = model['sd_mean'][seq_kmer_i] 155 | # The model scaled to the data 156 | table['model_level'] = scale.shift + scale.drift * zero_start + scale.scale * table['model_scaled_level'] 157 | table['model_sd'] = scale.scale_sd * table['model_scaled_sd'] 158 | # Tag ignore and outlier states 159 | table['good_emission'] = [x >= 0 for x in path] 160 | return table 161 | 162 | 163 | def build_mapping_summary_table(mapping_summary): 164 | """Build a mapping summary table 165 | 166 | :param mapping_summary: List of curr_map dictionaries 167 | 168 | :returns: Numpy record array containing summary contents.
One record per array element of mapping_summary 169 | 170 | """ 171 | # Set memory allocation for variable length strings 172 | # This works, but there must be a better way 173 | max_len_name = 1 174 | max_len_direction = 1 175 | max_len_seq = 1 176 | for summary_line in mapping_summary: 177 | len_name = len(summary_line['name']) 178 | if len_name > max_len_name: 179 | max_len_name = len_name 180 | 181 | len_direction = len(summary_line['direction']) 182 | if len_direction > max_len_direction: 183 | max_len_direction = len_direction 184 | 185 | len_seq = len(summary_line['seq']) 186 | if len_seq > max_len_seq: 187 | max_len_seq = len_seq 188 | 189 | column_names = ['name', 'direction', 'is_best', 'score', 'scale', 'shift', 'drift', 'scale_sd', 'var_sd', 'var', 'seq'] 190 | column_types = ['|S{}'.format(max_len_name)] + ['|S{}'.format(max_len_direction)] + [bool] + [float] * 7 + ['|S{}'.format(max_len_seq)] 191 | 192 | table = np.zeros(len(mapping_summary), dtype=list(zip(column_names, column_types))) 193 | for table_line, summary_line, in zip(table,mapping_summary): 194 | table_line['name'] = summary_line['name'] 195 | table_line['direction'] = summary_line['direction'] 196 | table_line['score'] = summary_line['score'] 197 | table_line['scale'] = summary_line['scale'].scale 198 | table_line['shift'] = summary_line['scale'].shift 199 | table_line['drift'] = summary_line['scale'].drift 200 | table_line['scale_sd'] = summary_line['scale'].scale_sd 201 | table_line['var_sd'] = summary_line['scale'].var_sd 202 | table_line['var'] = summary_line['scale'].var 203 | table_line['seq'] = summary_line['seq'] 204 | 205 | table['is_best'] = False 206 | is_best = np.argmin([line['score'] for line in mapping_summary]) 207 | table[is_best]['is_best'] = True 208 | 209 | return table 210 | 211 | 212 | def create_basecall_1d_output(raw_events, scale, path, model, post=None): 213 | """Create the annotated event table and basecalling summaries similar to chimaera. 214 | 215 | :param raw_events: :class:`np.ndarray` with fields mean, stdv, start and 216 | length. 217 | :param scale: :class:`dragonet.basecall.scaling.Scaler` object (or object 218 | with attributes `shift`, `scale`, `drift`, `var`, `scale_sd`, 219 | and `var_sd`). 220 | :param path: list containing state indices with respect to `model`. 221 | :param model: `:class:dragonet.util.model.Model` object. 222 | :param post: Two-dimensional :class:`np.ndarray` containing posteriors (event, state). 223 | :param quality_data: :class:np.ndarray Array containing quality_data, used to annotate events.
224 | 225 | :returns: A tuple of: 226 | 227 | * the annotated input event table 228 | * a dict of result 229 | """ 230 | 231 | events = raw_events.copy() 232 | model_state = np.array([model[x]['kmer'] for x in path]) 233 | raw_model_level = np.array([model[x]['level_mean'] for x in path]) 234 | move = np.array(list(kmer_overlap_gen(model_state))) 235 | counts = np.bincount(move) 236 | stays = counts[0] 237 | skips = counts[2] if len(counts) > 2 else 0 238 | 239 | # Extend the event table 240 | read_start = events[0]['start'] 241 | model_level = scale.shift + scale.scale * raw_model_level +\ 242 | scale.drift * (events['start'] - read_start) 243 | new_columns = ['model_state', 'model_level', 'move'] 244 | column_data = [model_state, model_level, move] 245 | 246 | if post is not None: 247 | weights = np.sum(post, axis=1) 248 | new_columns.append('weights') 249 | column_data.append(weights) 250 | 251 | drop_first = set(new_columns) & set(events.dtype.names) 252 | events = nprf.drop_fields(events, drop_first) 253 | table = nprf.append_fields(events, new_columns, data=column_data, asrecarray=True) 254 | 255 | # Compile the results 256 | results = { 257 | 'num_events': events.size, 258 | 'called_events': events.size, 259 | 'shift': scale.shift, 260 | 'scale': scale.scale, 261 | 'drift': scale.drift, 262 | 'var': scale.var, 263 | 'scale_sd': scale.scale_sd, 264 | 'var_sd': scale.var_sd, 265 | 'num_stays': stays, 266 | 'num_skips': skips 267 | } 268 | 269 | return table, results 270 | 271 | 272 | def create_mapping_output(raw_events, scale, path, model, seq, post=None, n_states=None, is_reverse=False, substates=False): 273 | """Create the annotated event table and summaries similar to chimaera 274 | 275 | :param raw_events: :class:`np.ndarray` with fields `mean`, `stdv`, `start`, 276 | and `length`. 277 | :param scale: :class:`dragonet.basecall.scaling.Scaler` object (or object 278 | with attributes `shift`, `scale`, `drift`, `var`, `scale_sd`, 279 | and `var_sd`). 280 | :param path: list containing state indices with respect to `model`. 281 | :param model: `:class:dragonet.util.model.Model` object. 282 | :param seq: String representation of the reference sequence. 283 | :param post: Two-dimensional :class:`np.ndarray` containing posteriors (event, state). 284 | :param is_reverse: Mapping refers to '-' strand (bool). 285 | :param substates: Mapping contains substates? 286 | 287 | :returns: A tuple of: 288 | * the annotated input event table, 289 | * a dict of result. 290 | 291 | """ 292 | 293 | events = raw_events.copy() 294 | direction = '+' if not is_reverse else '-' 295 | has_post = True 296 | 297 | # If we don't have a posterior, pass a mock object 298 | if post is None: 299 | if n_states is None: 300 | raise ValueError('n_states is required if post is None.') 301 | has_post = False 302 | post = MockZeroArray((len(events), n_states)) 303 | table = build_mapping_table(events, seq, post, scale, path, model) 304 | 305 | # Delete mocked out columns 306 | if not has_post: 307 | to_delete = ['p_seq_pos', 'mp_pos', 'mp_kmer', 'p_mp_pos'] 308 | table = nprf.drop_fields(table, to_delete) 309 | 310 | if direction == '-': 311 | events['seq_pos'] = len(seq) - table['seq_pos'] 312 | ref_start = table['seq_pos'][-1] 313 | ref_stop = table['seq_pos'][0] 314 | else: 315 | ref_start = table['seq_pos'][0] 316 | ref_stop = table['seq_pos'][-1] 317 | 318 | # Compute movement stats.
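    # For example (toy illustration): a path of [0, 0, 1, 3] gives per-event
    # moves of [0, 0, 1, 2], i.e. stays=2 (moves of zero) and skips=1 (moves of two).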
318 | _, stays, skips = compute_movement_stats(path) 319 | 320 | results = { 321 | 'direction': direction, 322 | 'reference': seq, 323 | 'ref_start': ref_start, 324 | 'ref_stop': ref_stop, 325 | 'shift': scale.shift, 326 | 'scale': scale.scale, 327 | 'drift': scale.drift, 328 | 'var': scale.var, 329 | 'scale_sd': scale.scale_sd, 330 | 'var_sd': scale.var_sd, 331 | 'num_stays': stays, 332 | 'num_skips': skips 333 | } 334 | 335 | return table, results 336 | 337 | 338 | class MockZeroArray(np.ndarray): 339 | def __init__(self, shape): 340 | """Mock enough of ndarray interface to be passable as a posterior matrix 341 | to chimaera build_mapping_table 342 | 343 | :param shape: tuple, shape of array 344 | 345 | """ 346 | self.shape = shape 347 | 348 | def argmax(self, axis=0): 349 | """Fake argmax values of an array.""" 350 | return np.zeros(self.shape[1-axis], dtype=int) 351 | 352 | 353 | def validate_event_table(table): 354 | """Check if an object contains all columns of a basic event array.""" 355 | 356 | if not isinstance(table, np.ndarray): 357 | raise TypeError('Table is not a ndarray.') 358 | 359 | req_fields = ['mean', 'stdv', 'start', 'length'] 360 | if not set(req_fields).issubset(table.dtype.names): 361 | raise KeyError( 362 | 'Array does not contain fields for event array: {}, got {}.'.format( 363 | req_fields, table.dtype.names 364 | ) 365 | ) 366 | 367 | 368 | def validate_model_table(table): 369 | """Check if an object contains all columns of a dragonet Model.""" 370 | if not isinstance(table, np.ndarray): 371 | raise TypeError('Table is not a ndarray.') 372 | 373 | req_fields = ['kmer', 'level_mean', 'level_stdv', 'sd_mean', 'sd_stdv'] 374 | if not set(req_fields).issubset(table.dtype.names): 375 | raise KeyError( 376 | 'Object does not contain fields required for Model: {}, got {}.'.format( 377 | req_fields, table.dtype.names 378 | ) 379 | ) 380 | 381 | 382 | def validate_scale_object(obj): 383 | """Check if an object contains all attributes of dragonet Scaler.""" 384 | 385 | req_attributes = ['shift', 'scale', 'drift', 'var', 'scale_sd', 'var_sd'] 386 | msg = 'Object does not contain attributes required for Scaler: {}'.format(req_attributes) 387 | assert all([hasattr(obj, attr) for attr in req_attributes]), msg 388 | 389 | 390 | def compute_movement_stats(path): 391 | """Compute movement stats from a mapping state path 392 | 393 | :param path: :class:`np.ndarry` containing position in reference. 394 | Negative values are interpreted as "bad emissions". 395 | """ 396 | 397 | vitstate_indices = np.where(path >= 0, path, np.abs(path) - 1) 398 | move = np.ediff1d(vitstate_indices, to_begin=0) 399 | counts = np.bincount(move) 400 | stays = counts[0] 401 | skips = counts[2] if len(counts) > 2 else 0 402 | 403 | return move, stays, skips 404 | 405 | 406 | def seq_to_kmers(seq, length): 407 | """Turn a string into a list of (overlapping) kmers. 408 | 409 | e.g. 
perform the transformation: 410 | 411 | 'ATATGCG' => ['ATA','TAT', 'ATG', 'TGC', 'GCG'] 412 | 413 | :param seq: character string 414 | :param length: length of kmers in output 415 | 416 | :returns: A list of overlapping kmers 417 | """ 418 | return [seq[x:x + length] for x in range(0, len(seq) - length + 1)] 419 | 420 | 421 | def window(iterable, size): 422 | """Create an iterator returning a sliding window from another iterator 423 | 424 | :param iterable: Iterator 425 | :param size: Size of window 426 | 427 | :returns: an iterator returning a tuple containing the data in the window 428 | 429 | """ 430 | assert size > 0, "Window size for iterator should be strictly positive, got {0}".format(size) 431 | iters = tee(iterable, size) 432 | for i in range(1, size): 433 | for each in iters[i:]: 434 | next(each, None) 435 | return list(zip(*iters)) 436 | 437 | 438 | def readtsv(fname, fields=None, **kwargs): 439 | """Read a tsv file into a numpy array with required field checking 440 | 441 | :param fname: filename to read. If the filename extension is 442 | gz or bz2, the file is first decompressed. 443 | :param fields: list of required fields. 444 | """ 445 | 446 | if not file_has_fields(fname, fields): 447 | raise KeyError('File {} does not contain requested required fields {}'.format(fname, fields)) 448 | 449 | for k in ['names', 'delimiter', 'dtype']: 450 | kwargs.pop(k, None) 451 | table = np.genfromtxt(fname, names=True, delimiter='\t', dtype=None, encoding='utf8', **kwargs) 452 | # Numpy tricks to force single element to be array of one row 453 | return table.reshape(-1) 454 | 455 | 456 | def docstring_parameter(*sub): 457 | """Allow docstrings to contain parameters.""" 458 | def dec(obj): 459 | obj.__doc__ = obj.__doc__.format(*sub) 460 | return obj 461 | return dec 462 | 463 | 464 | def med_mad(data, factor=None, axis=None, keepdims=False): 465 | """Compute the Median Absolute Deviation, i.e., the median 466 | of the absolute deviations from the median, and the median 467 | 468 | :param data: A :class:`ndarray` object 469 | :param factor: Factor to scale MAD by. Default (None) is to be consistent 470 | with the standard deviation of a normal distribution 471 | (i.e. mad( N(0,\sigma^2) ) = \sigma). 472 | :param axis: For multidimensional arrays, which axis to calculate over 473 | :param keepdims: If True, axis is kept as dimension of length 1 474 | 475 | :returns: a tuple containing the median and MAD of the data 476 | 477 | """ 478 | if factor is None: 479 | factor = 1.4826 480 | dmed = np.median(data, axis=axis, keepdims=True) 481 | dmad = factor * np.median(abs(data - dmed), axis=axis, keepdims=True) 482 | if axis is None: 483 | dmed = dmed.flatten()[0] 484 | dmad = dmad.flatten()[0] 485 | elif not keepdims: 486 | dmed = dmed.squeeze(axis) 487 | dmad = dmad.squeeze(axis) 488 | return dmed, dmad 489 | 490 | 491 | def mad(data, factor=None, axis=None, keepdims=False): 492 | """Compute the Median Absolute Deviation, i.e., the median 493 | of the absolute deviations from the median, and (by default) 494 | adjust by a factor for asymptotically normal consistency. 495 | 496 | :param data: A :class:`ndarray` object 497 | :param factor: Factor to scale MAD by. Default (None) is to be consistent 498 | with the standard deviation of a normal distribution 499 | (i.e. mad( N(0,\sigma^2) ) = \sigma). 500 | :param axis: For multidimensional arrays, which axis to calculate the median over. 
501 | :param keepdims: If True, axis is kept as dimension of length 1 502 | 503 | :returns: the (scaled) MAD 504 | 505 | """ 506 | _ , dmad = med_mad(data, factor=factor, axis=axis, keepdims=keepdims) 507 | return dmad 508 | 509 | 510 | def file_has_fields(fname, fields=None): 511 | """Check that a tsv file has given fields 512 | 513 | :param fname: filename to read. If the filename extension is 514 | gz or bz2, the file is first decompressed. 515 | :param fields: list of required fields. 516 | 517 | :returns: boolean 518 | """ 519 | 520 | # Allow a quick return 521 | req_fields = deepcopy(fields) 522 | if isinstance(req_fields, str): 523 | req_fields = [fields] 524 | if req_fields is None or len(req_fields) == 0: 525 | return True 526 | req_fields = set(req_fields) 527 | 528 | inspector = open 529 | ext = os.path.splitext(fname)[1] 530 | if ext == '.gz': 531 | inspector = gzopen 532 | elif ext == '.bz2': 533 | inspector = bzopen 534 | 535 | has_fields = None 536 | with inspector(fname, 'r') as fh: 537 | present_fields = set(fh.readline().rstrip('\n').split('\t')) 538 | has_fields = req_fields.issubset(present_fields) 539 | return has_fields 540 | 541 | 542 | def get_changes(data, ignore_cols=None, use_cols=None): 543 | """Return only rows of a structured array which are not equal to the previous row. 544 | 545 | :param data: Numpy record array. 546 | :param ignore_cols: iterable of column names to ignore in checking for equality between rows. 547 | :param use_cols: iterable of column names to include in checking for equality between rows (only used if ignore_cols is None). 548 | 549 | :returns: Numpy record array. 550 | """ 551 | cols = list(data.dtype.names) 552 | if ignore_cols is not None: 553 | for col in ignore_cols: 554 | cols.remove(col) 555 | elif use_cols is not None: 556 | cols = list(use_cols) 557 | changed_inds = np.where(data[cols][1:] != data[cols][:-1])[0] + 1 558 | changed_inds = [0] + [i for i in changed_inds] 559 | return data[(changed_inds,)] 560 | 561 | 562 | def _clean(value): 563 | """Convert numpy numeric types to their python equivalents.""" 564 | if isinstance(value, np.ndarray): 565 | if value.dtype.kind == 'S': 566 | return np.char.decode(value).tolist() 567 | else: 568 | return value.tolist() 569 | elif type(value).__module__ == np.__name__: 570 | conversion = value.item() 571 | if sys.version_info.major == 3 and isinstance(conversion, bytes): 572 | conversion = conversion.decode() 573 | return conversion 574 | elif sys.version_info.major == 3 and isinstance(value, bytes): 575 | return value.decode() 576 | else: 577 | return value 578 | 579 | 580 | def _clean_attrs(attrs): 581 | return {_clean(k): _clean(v) for k, v in attrs.items()} 582 | 583 | 584 | def _sanitize_data_for_writing(data): 585 | if isinstance(data, str): 586 | return data.encode() 587 | elif isinstance(data, np.ndarray) and data.dtype.kind == np.dtype(np.unicode): 588 | return data.astype('S') 589 | elif isinstance(data, np.ndarray) and len(data.dtype) > 1: 590 | dtypes = dtype_descr(data) 591 | for index, entry in enumerate(dtypes): 592 | type_check = entry[1] 593 | if isinstance(type_check, tuple): 594 | # an enum? 595 | return data 596 | if type_check.startswith(' 1: 615 | dtypes = list(dtype_descr(data)) 616 | for index, entry in enumerate(dtypes): 617 | type_check = entry[1] 618 | if isinstance(type_check, tuple): 619 | # an enum? 620 | return data 621 | if entry[1].startswith('|S'): 622 | # numpy.astype can't handle empty datafields for some 623 | # reason, so we'll explicitly state that. 
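                # e.g. a dtype entry such as ('name', '|S') or ('name', '|S0')
                # describes a zero-width field and is rejected below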
624 | if len(entry[1]) <= 2 or (len(entry[1]) == 3 and 625 | entry[1][2] == '0'): 626 | raise TypeError('Empty datafield {} cannot be converted' 627 | ' by np.astype.'.format(entry[0])) 628 | dtypes[index] = (entry[0], ' 1: 151 | with ProcessPoolExecutor(args.workers) as executor: 152 | futures = [executor.submit(worker, c, summary=summary_by_ch[c]) for c in channels] 153 | for future in as_completed(futures): 154 | try: 155 | n_reads, channel = future.result() 156 | except Exception as e: 157 | logger.warning("Error processing channel.") 158 | print(e) 159 | else: 160 | logger.info("Extracted {} reads from channel {}.".format(n_reads, channel)) 161 | else: 162 | for channel in channels: 163 | worker(channel, summary=summary_by_ch[channel]) 164 | logger.info("Finished.") 165 | 166 | 167 | def time_cast(time, sample_rate): 168 | """ 169 | Convert a float time to sample index, or return time unmodified 170 | """ 171 | if isinstance(time, float): 172 | return int(time * sample_rate) 173 | else: 174 | return time 175 | 176 | 177 | def extract_channel_reads(source, output, prefix, flat, by_id, max_files, multi, channel, summary=None): 178 | if flat: 179 | out_path = output 180 | # give multi files a channel prefix else they will 181 | # conflict between channels. Singles already get 182 | # a "ch" component in their name 183 | if multi: 184 | extra = 'ch{}'.format(channel) 185 | if prefix == '': 186 | prefix = extra 187 | else: 188 | prefix = '{}_{}'.format(prefix, extra) 189 | else: 190 | out_path = os.path.join(output, str(channel)) 191 | os.makedirs(out_path) 192 | 193 | with BulkFast5(source) as src: 194 | meta = src.get_metadata(channel) 195 | tracking_id = src.get_tracking_meta() 196 | context_tags = src.get_context_meta() 197 | channel_id = { 198 | 'channel_number': channel, 199 | 'range': meta['range'], 200 | 'digitisation': meta['digitisation'], 201 | 'offset': meta['offset'], 202 | 'sampling_rate': meta['sample_rate'] 203 | } 204 | 205 | Writer = MultiWriter if multi else SingleWriter 206 | with Writer(out_path, by_id, prefix=prefix) as writer: 207 | 208 | median_before = None 209 | counter = 1 210 | raw_data = src.get_raw(channel, use_scaling=False) 211 | 212 | if summary is not None: 213 | # convert array into stream of dicts 214 | reads = ({k: row[k] for k in row.dtype.names} for row in summary) 215 | class_field = 'class' 216 | start_field = 'start_time' 217 | duration_field = 'duration' 218 | # if start_time is a float (seconds) we need to convert to 219 | # samples 220 | time_cols = ['start_time', 'duration'] 221 | else: 222 | reads = src.get_reads(channel) 223 | class_field = 'classification' 224 | start_field = 'read_start' 225 | duration_field = 'read_length' 226 | 227 | for read_number, read in enumerate(reads): 228 | 229 | if summary is not None: 230 | if 'median_current_before' in read: 231 | median_before = read['median_current_before'] 232 | else: 233 | median_before = 0.0 234 | elif median_before is None: 235 | median_before = read['median'] 236 | continue 237 | 238 | if summary is None and read[class_field] != 'strand': 239 | median_before = read['median'] 240 | else: 241 | counter += 1 242 | start = time_cast(read[start_field], meta['sample_rate']) 243 | length = time_cast(read[duration_field], meta['sample_rate']) 244 | read_id = { 245 | 'start_time': start, 246 | 'duration': length, 247 | 'read_number': read_number, 248 | 'start_mux': src.get_mux(channel, raw_index=start, wells_only=True), 249 | 'read_id': str(read['read_id']) if 'read_id' in read else str(uuid4()), 
250 | 'scaling_used': 1, 251 | 'median_before': median_before 252 | } 253 | 254 | raw_slice = raw_data[start:start+length] 255 | read = Read(read_id, read_number, tracking_id, channel_id, context_tags, raw_slice) 256 | writer.write_read(read) 257 | if counter == max_files: 258 | break 259 | return counter, channel 260 | 261 | 262 | def build_read_index(): 263 | logging.basicConfig( 264 | format='[%(asctime)s - %(name)s] %(message)s', 265 | datefmt='%H:%M:%S', level=logging.INFO 266 | ) 267 | logger = logging.getLogger('Index Reads') 268 | 269 | parser = argparse.ArgumentParser(description='Build index of reads within .fast5s. Output to stdout.') 270 | parser.add_argument('input', help='.fast5 directory') 271 | parser.add_argument('--recursive', action='store_true', 272 | help='Search recursively under `input` for source files.') 273 | parser.add_argument('--workers', type=int, default=8, 274 | help='Number of worker processes.') 275 | args = parser.parse_args() 276 | 277 | src_files = list(iterate_fast5(args.input, paths=True, recursive=args.recursive)) 278 | logger.info("Found {} files.".format(len(src_files))) 279 | 280 | with ProcessPoolExecutor(args.workers) as executor: 281 | n_reads = 0 282 | for i, (src, read_ids) in enumerate( 283 | zip(src_files, executor.map(reads_in_multi, src_files, chunksize=10))): 284 | n_reads += len(read_ids) 285 | for read in read_ids: 286 | print('\t'.join((read, os.path.abspath(src)))) 287 | if i % 10 == 0: 288 | logger.info("Indexed {}/{} files. {} reads".format(i, len(src_files), n_reads)) 289 | 290 | def filter_file_from_bam(): 291 | logging.basicConfig( 292 | format='[%(asctime)s - %(name)s] %(message)s', 293 | datefmt='%H:%M:%S', level=logging.INFO 294 | ) 295 | logger = logging.getLogger('Filter') 296 | parser = argparse.ArgumentParser( 297 | description='Create filter file from BAM and sequencing summary') 298 | parser.add_argument('--seperator', 299 | dest="SEP", 300 | default='\t', 301 | help="Separator in sequencing summary files") 302 | parser.add_argument('--id-col', 303 | dest="READID_COL", 304 | default='read_id', 305 | help="Column name for read_id in sequencing summary files") 306 | parser.add_argument('--fname-col', 307 | dest="FNAME_COL", 308 | default='filename', 309 | help="Column name for fast5 filename in sequencing summary files") 310 | parser.add_argument('-r', '--region', 311 | dest="REGION", 312 | default=None, 313 | help="Print reads only from this region") 314 | parser.add_argument('--workers', type=int, default=4, 315 | help='Number of worker processes.') 316 | parser.add_argument('-p', '--primary-only', 317 | dest="PRIMARY", 318 | action='store_true', 319 | help="Ignore secondary and supplementary alignments") 320 | 321 | parser.add_argument('BAM', help='Path to BAM file') 322 | parser.add_argument("SUMMARY", 323 | type=str, 324 | nargs='+', 325 | help="Sequencing summary files") 326 | 327 | args = parser.parse_args() 328 | 329 | region = args.REGION 330 | primary_only = args.PRIMARY 331 | bam_in = args.BAM 332 | summary_files = args.SUMMARY 333 | threads = args.workers 334 | readid_col = args.READID_COL 335 | fast5_col = args.FNAME_COL 336 | sep = args.SEP 337 | 338 | if not region: 339 | logger.info("No region specified.
Extracting all reads from BAM file") 340 | else: 341 | logger.info("Extracting read ids from {}".format(region)) 342 | 343 | read_ids = {} 344 | with pysam.AlignmentFile(bam_in, "rb", threads=threads) as infile: 345 | for read in infile.fetch(region=region): 346 | if read.is_unmapped or (primary_only and (read.is_secondary or read.is_supplementary)): 347 | continue 348 | read_ids[read.query_name] = None 349 | 350 | n = len(read_ids) 351 | logger.info("Reads found in BAM file: {}".format(n)) 352 | if n == 0: 353 | return 354 | 355 | # Print header 356 | print("read_id", "filename", sep='\t') 357 | 358 | n_print = 0 359 | for summary_file in summary_files: 360 | logging.info("Opening: {}".format(summary_file)) 361 | with gzip.open(summary_file) as fh: 362 | header = fh.readline().decode().strip() 363 | header_cols = header.split(sep) 364 | readid_idx = header_cols.index(readid_col) 365 | path_idx = header_cols.index(fast5_col) 366 | 367 | for line in fh: 368 | line = line.decode().strip() 369 | if not line: 370 | continue 371 | cols = line.split(sep) 372 | readid = cols[readid_idx] 373 | f5_path = cols[path_idx] 374 | if readid not in read_ids: 375 | continue 376 | 377 | if read_ids[readid]: 378 | logging.error("Two entries found for {} ({} and {})".format(readid, read_ids[readid], f5_path)) 379 | continue 380 | 381 | n_print += 1 382 | read_ids[readid] = f5_path 383 | print(readid, read_ids[readid], sep='\t') 384 | logging.info("Filename found for {} reads ({}%)".format(n_print, round(n_print * 100.0 / n))) 385 | 386 | def filter_multi_reads(): 387 | logging.basicConfig( 388 | format='[%(asctime)s - %(name)s] %(message)s', 389 | datefmt='%H:%M:%S', level=logging.INFO 390 | ) 391 | logger = logging.getLogger('Filter') 392 | parser = argparse.ArgumentParser( 393 | description='Extract reads from multi-read .fast5 files.') 394 | parser.add_argument('input', 395 | help='Path to input multi-read .fast5 files (or list of files).') 396 | parser.add_argument('output', 397 | help='Output folder.') 398 | parser.add_argument('filter', 399 | help='A .tsv file with column `read_id` defining required reads. 
' 400 | 'If a `filename` column is present, this will be used as the ' 401 | 'location of the read.') 402 | parser.add_argument('--tsv_field', default='read_id', 403 | help='Field name from `filter` file to obtain read IDs.') 404 | parser.add_argument('--prefix', default="", 405 | help='Read file prefix.') 406 | parser.add_argument('--recursive', action='store_true', 407 | help='Search recursively under `input` for source files.') 408 | parser.add_argument('--workers', type=int, default=4, 409 | help='Number of worker processes.') 410 | 411 | out_format = parser.add_mutually_exclusive_group() 412 | out_format.add_argument('--multi', action='store_true', default=True, 413 | help='Output multi-read files.') 414 | out_format.add_argument('--single', action='store_false', dest='multi', 415 | help='Output single-read files.') 416 | 417 | #parser.add_argument('--limit', type=int, default=None, help='Limit reads per channel.') 418 | args = parser.parse_args() 419 | 420 | if not args.multi: 421 | raise NotImplementedError('Extraction of reads to single read files is on the TODO list.') 422 | 423 | if not os.path.exists(args.output): 424 | os.makedirs(args.output) 425 | else: 426 | raise IOError('The output directory must not exist.') 427 | 428 | # grab list of source files 429 | logger.info("Searching for input files.") 430 | try: 431 | src_files = list(set(readtsv(args.input)['filename'])) 432 | except Exception as e: 433 | logger.info('Failed to read `input` as filelist, assuming path to search. {}'.format(e)) 434 | src_files = list(iterate_fast5(args.input, paths=True, recursive=args.recursive)) 435 | n_files = len(src_files) 436 | logger.info("Found {} source files.".format(n_files)) 437 | 438 | logger.info("Reading filter file.") 439 | read_table = readtsv(args.filter, fields=[args.tsv_field]) 440 | logger.info("Found {} reads in filter.".format(len(read_table))) 441 | 442 | try: 443 | # try to build index from the filter file with 'filename' column 444 | if 'filename' not in read_table.dtype.names: 445 | raise ValueError("'filename' column not present in filter.") 446 | logger.info("Attempting to build read index from input filter.") 447 | src_path_files = { 448 | os.path.basename(x):x for x in src_files 449 | } 450 | if len(src_path_files) != len(src_files): 451 | raise ValueError('Found non-uniquely named source files') 452 | read_index = dict() 453 | for fname, indices in group_vector(read_table['filename']).items(): 454 | fpath = src_path_files[os.path.basename(fname)] 455 | read_index[fpath] = read_table[args.tsv_field][indices] 456 | logger.info("Successfully built read index from input filter.") 457 | except Exception as e: 458 | logger.info("Failed to build read index from summary: {}".format(e)) 459 | read_index = None 460 | required_reads = set(read_table[args.tsv_field]) 461 | logger.info("Finding reads within {} source files.".format(n_files)) 462 | index_worker = functools.partial(reads_in_multi, filt=required_reads) 463 | read_index = dict() 464 | n_reads = 0 465 | with ProcessPoolExecutor(args.workers) as executor: 466 | i = 0 467 | for src_file, read_ids in zip(src_files, executor.map(index_worker, src_files, chunksize=10)): 468 | i += 1 469 | n_reads += len(read_ids) 470 | read_index[src_file] = read_ids 471 | if i % 10 == 0: 472 | logger.info("Indexed {}/{} files.
{}/{} reads".format(i, n_files, n_reads, len(required_reads))) 473 | 474 | n_reads = sum(len(x) for x in read_index.values()) 475 | # We don't go via creating Read objects, copying the data verbatim 476 | # likely quicker and nothing should need the verification that the APIs 477 | # provide (garbage in, garbage out). 478 | logger.info("Extracting {} reads.".format(n_reads)) 479 | if args.prefix != '': 480 | args.prefix = '{}_'.format(args.prefix) 481 | 482 | with ProcessPoolExecutor(args.workers) as executor: 483 | reads_per_process = np.ceil(n_reads / args.workers) 484 | proc_n_reads = 0 485 | proc_reads = dict() 486 | job = 0 487 | futures = list() 488 | for src in read_index.keys(): 489 | proc_reads[src] = read_index[src] 490 | proc_n_reads += len(proc_reads[src]) 491 | if proc_n_reads > reads_per_process: 492 | proc_prefix = "{}{}_".format(args.prefix, job) 493 | futures.append(executor.submit(_subset_reads_to_file, proc_reads, args.output, proc_prefix, worker_id=job)) 494 | job += 1 495 | proc_n_reads = 0 496 | proc_reads = dict() 497 | if proc_n_reads > 0: # processing remaining reads 498 | proc_prefix = "{}{}_".format(args.prefix, job) 499 | futures.append(executor.submit(_subset_reads_to_file, proc_reads, args.output, proc_prefix, worker_id=job)) 500 | 501 | 502 | for fut in as_completed(futures): 503 | try: 504 | reads_written, prefix = fut.result() 505 | logger.info("Written {} reads to {}.".format(reads_written, prefix)) 506 | except Exception as e: 507 | logger.warning("Error: {}".format(e)) 508 | logger.info("Done.") 509 | 510 | 511 | def _subset_reads_to_file(read_index, output, prefix, worker_id=0): 512 | logger = logging.getLogger('Worker-{}'.format(worker_id)) 513 | n_reads = sum(len(x) for x in read_index.values()) 514 | reads_written = 0 515 | t0 = now() 516 | with MultiWriter(output, None, prefix=prefix) as writer: 517 | for src_file, read_ids in read_index.items(): 518 | reads_written += len(read_ids) 519 | t1 = now() 520 | if t1 - t0 > 30: # log update every 30 seconds 521 | logger.info("Written {}/{} reads ({:.0f}% done)".format( 522 | reads_written, n_reads, 100 * reads_written / n_reads 523 | )) 524 | t0 = t1 525 | with h5py.File(src_file, 'r') as src_fh: 526 | for read_id in read_ids: 527 | try: 528 | read_grp = src_fh["read_{}".format(read_id)] 529 | except: 530 | logger.warning("Did not find {} in {}.".format(read_id, src_fh.filename)) 531 | else: 532 | writer.write_read(read_grp) 533 | return reads_written, prefix 534 | 535 | 536 | def reads_in_multi(src, filt=None): 537 | """Get list of read IDs contained within a multi-read file. 538 | 539 | :param src: source file. 540 | :param filt: perform filtering by given set. 541 | :returns: set of read UUIDs (as string and recorded in hdf group name). 
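
    Example (an illustrative sketch; the filename is hypothetical and the
    read id is abbreviated)::

        all_ids = reads_in_multi('batch_0.fast5')
        wanted = reads_in_multi('batch_0.fast5', filt={'0a1b2c3d-...'})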
542 | """ 543 | logger = logging.getLogger(os.path.splitext(os.path.basename(src))[0]) 544 | logger.debug("Finding reads.") 545 | prefix = 'read_' 546 | with h5py.File(src, 'r') as fh: 547 | read_ids = set(grp[len(prefix):] for grp in fh if grp.startswith(prefix)) 548 | logger.debug("Found {} reads.".format(len(read_ids))) 549 | if filt is not None: 550 | read_ids = read_ids.intersection(filt) 551 | logger.debug("Filtered to {} reads.".format(len(read_ids))) 552 | return read_ids 553 | 554 | 555 | class Read(object): 556 | # Just a sketch to help interchange of format 557 | def __init__(self, read_id, read_number, tracking_id, channel_id, context_tags, raw): 558 | self.read_id = read_id 559 | self.read_number = read_number 560 | self.tracking_id = tracking_id 561 | self.channel_id = channel_id 562 | self.context_tags = context_tags 563 | self.raw = raw 564 | 565 | # ensure typing and required fields 566 | self.channel_id = Fast5.convert_channel_id(self.channel_id) 567 | self.tracking_id = Fast5.convert_tracking_id(self.tracking_id) 568 | 569 | 570 | class ReadWriter(object): 571 | def __init__(self, out_path, by_id, prefix=""): 572 | self.out_path = out_path 573 | self.by_id = by_id 574 | if prefix != "": 575 | prefix = "{}_".format(prefix) 576 | self.prefix = prefix 577 | 578 | def write_read(self): 579 | raise NotImplementedError() 580 | 581 | def __enter__(self): 582 | return self 583 | 584 | def __exit__(self, exception_type, exception_value, traceback): 585 | pass 586 | 587 | 588 | class SingleWriter(ReadWriter): 589 | def write_read(self, read): 590 | if self.by_id: 591 | filename = '{}.fast5'.format(read.read_id['read_id']) 592 | else: 593 | filename = '{}read_ch{}_file{}.fast5'.format( 594 | self.prefix, read.channel_id['channel_number'], read.read_number 595 | ) 596 | filename = os.path.join(self.out_path, filename) 597 | with Fast5.New(filename, 'a', tracking_id=read.tracking_id, context_tags=read.context_tags, channel_id=read.channel_id) as h: 598 | h.set_raw(read.raw, meta=read.read_id, read_number=read.read_number) 599 | 600 | 601 | MULTI_READ_FILE_VERSION = "2.0" 602 | 603 | class MultiWriter(ReadWriter): 604 | def __init__(self, out_path, by_id, prefix="", reads_per_file=4000): 605 | super(MultiWriter, self).__init__(out_path, by_id, prefix=prefix) 606 | self.reads_per_file = reads_per_file 607 | self.current_reads = 0 # reads in open file, used to signal new file condition 608 | self.file_counter = 0 609 | self.current_file = None 610 | self.closed = False 611 | 612 | 613 | def __exit__(self, exception_type, exception_value, traceback): 614 | self.close() 615 | 616 | 617 | def close(self): 618 | if isinstance(self.current_file, h5py.File): 619 | self.current_file.close() 620 | 621 | 622 | def write_read(self, read): 623 | """Write a read. 624 | 625 | :param read: either a `Read` object or an hdf group handle from a 626 | source multi-read file. 
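
        Example (a minimal sketch; the source filename and output directory
        are hypothetical, and the output directory must already exist)::

            with MultiWriter('out_dir', None, prefix='subset') as writer:
                with h5py.File('batch_0.fast5', 'r') as src:
                    for name, grp in src.items():
                        if name.startswith('read_'):
                            writer.write_read(grp)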
627 | """ 628 | if self.closed: 629 | raise RuntimeError('Cannot write after closed.') 630 | 631 | if self.current_reads == 0: 632 | # start a new file 633 | self.close() 634 | filename = '{}mreads_file{}.fast5'.format( 635 | self.prefix, self.file_counter 636 | ) 637 | filename = os.path.join(self.out_path, filename) 638 | self.current_file = h5py.File(filename, 'w') 639 | self.current_file.attrs[_sanitize_data_for_writing('file_version')] = _sanitize_data_for_writing("2.0") 640 | self.file_counter += 1 641 | 642 | # write data 643 | if isinstance(read, Read): 644 | self._write_read(read) 645 | elif isinstance(read, h5py.Group): 646 | self._copy_read_group(read) 647 | else: 648 | raise TypeError("Cannot write type {} to output file.") 649 | self.current_reads += 1 650 | 651 | # update 652 | if self.current_reads == self.reads_per_file: 653 | self.current_reads = 0 654 | 655 | 656 | def _write_read(self, read): 657 | if read.raw.dtype != np.int16: 658 | raise TypeError('Raw data must be of type int16.') 659 | 660 | read_group = '/read_{}'.format(read.read_id['read_id']) 661 | Fast5._add_attrs_to_fh(self.current_file, {'run_id': read.tracking_id['run_id']}, read_group, convert=str) 662 | 663 | # add all attributes 664 | for grp_name in ('tracking_id', 'context_tags'): 665 | # spec has all of these as str 666 | data = getattr(read, grp_name) 667 | Fast5._add_attrs_to_fh(self.current_file, data, '{}/{}'.format(read_group, grp_name), convert=str) 668 | Fast5._add_attrs_to_fh(self.current_file, read.channel_id, '{}/channel_id'.format(read_group)) 669 | 670 | # add the data (and some more attrs) 671 | data_path = '{}/Raw'.format(read_group) 672 | read_id = Fast5._convert_meta_times(read.read_id, read.channel_id['sampling_rate']) 673 | read_id = Fast5.convert_raw_meta(read_id) 674 | Fast5._add_attrs_to_fh(self.current_file, read_id, data_path) 675 | signal_path = '{}/Signal'.format(data_path) 676 | self.current_file.create_dataset( 677 | signal_path, data=read.raw, compression='gzip', shuffle=True, dtype='i2') 678 | 679 | 680 | def _copy_read_group(self, read): 681 | self.current_file.copy(read, read.name) 682 | -------------------------------------------------------------------------------- /fast5_research/fast5_bulk.py: -------------------------------------------------------------------------------- 1 | import ast 2 | from collections import defaultdict 3 | from fast5_research.util import dtype_descr 4 | import itertools 5 | import re 6 | from sys import version_info 7 | from xml.dom import minidom 8 | import warnings 9 | 10 | with warnings.catch_warnings(): 11 | warnings.simplefilter("ignore", category=FutureWarning) 12 | import h5py 13 | 14 | import numpy as np 15 | from numpy.lib.recfunctions import append_fields 16 | 17 | 18 | from fast5_research.util import get_changes, _clean_attrs, _sanitize_data_for_writing, _sanitize_data_for_reading 19 | 20 | if version_info[0] < 3: 21 | from StringIO import StringIO 22 | else: 23 | from io import StringIO 24 | 25 | 26 | class BulkFast5(h5py.File): 27 | """Class for reading data from a bulk fast5 file""" 28 | 29 | __tracking_path__ = '/UniqueGlobalKey/tracking_id' 30 | __pore_model_old__ = 'Meta/User/pore_model' 31 | __pore_model_new__ = 'Meta/User/analysis_conf' 32 | __context_path__ = '/UniqueGlobalKey/context_tags/' 33 | __intermediate_data__ = '/IntermediateData/' 34 | __voltage_meta__ = '/Device/VoltageMeta' 35 | __voltage_data__ = '/Device/MetaData' 36 | __channel_meta__ = '/IntermediateData/Channel_{}/Meta' 37 | __multiplex_data__ = 
'/MultiplexData/Channel_{}/Multiplex' 38 | 39 | __raw_data__ = "Raw/Channel_{}/Signal" 40 | __raw_meta__ = "Raw/Channel_{}/Meta" 41 | __event_data__ = "/IntermediateData/Channel_{}/Events" 42 | __read_data__ = "/IntermediateData/Channel_{}/Reads" 43 | __state_data__ = "/StateData/Channel_{}/States" 44 | 45 | # The below refers to MinION Mk1 ASIC, may change in future 46 | __mk1_asic_mux_states__ = { 47 | 'common_voltage_1': 1, 48 | 'common_voltage_2': 2, 49 | 'common_voltage_3': 3, 50 | 'common_voltage_4': 4, 51 | 'gnd': 15, 52 | 'gnd_through_resistor': 14, 53 | 'open_pore': 0, 54 | 'test_current_1': 10, 55 | 'test_current_2': 11, 56 | 'test_current_3': 12, 57 | 'test_current_4': 13, 58 | 'test_current_open_pore': 5, 59 | 'unblock_voltage_1': 6, 60 | 'unblock_voltage_2': 7, 61 | 'unblock_voltage_3': 8, 62 | 'unblock_voltage_4': 9 63 | } 64 | 65 | def __init__(self, filename, mode='r'): 66 | """Create an BulkFast5 instance. 67 | 68 | :param filename: path to a bulk fast5 file. 69 | :param mode: h5py opening mode. 70 | """ 71 | 72 | super(BulkFast5, self).__init__(filename, mode) 73 | if mode == 'r': 74 | data = self[self.__intermediate_data__] 75 | self.channels = sorted([int(name.strip('Channel_')) for name in data.keys()]) 76 | self.parsed_exp_history = None # we parse the history lazily 77 | 78 | # Parse experimental metadata 79 | self.exp_metadata = dict() 80 | for path in (self.__tracking_path__, self.__context_path__): 81 | try: 82 | self.exp_metadata.update(_clean_attrs(self[path].attrs)) 83 | except KeyError: 84 | raise KeyError('Cannot read summary from {}'.format(path)) 85 | 86 | # This should be safe 87 | try: 88 | self.sample_rate = float(self['Meta'].attrs['sample_rate']) 89 | except: 90 | self.sample_rate = float(self.get_metadata(self.channels[0])['sample_rate']) 91 | 92 | 93 | def get_metadata(self, channel): 94 | """Get the metadata for the specified channel. 
95 | 96 | Look for first for events metadata, and fall-back on raw metadata, returning an empty dict if neither could be found.""" 97 | if hasattr(self, '_cached_metadata'): 98 | if channel in self._cached_metadata: 99 | return self._cached_metadata[channel] 100 | else: 101 | self._cached_metadata = {} 102 | 103 | if self.__channel_meta__.format(channel) in self: 104 | meta = _clean_attrs(self[self.__channel_meta__.format(channel)].attrs) 105 | elif self.has_raw(channel): # use raw meta data 106 | meta = _clean_attrs(self[self.__raw_meta__.format(channel)].attrs) 107 | else: 108 | meta = {} 109 | 110 | self._cached_metadata[channel] = meta 111 | return meta 112 | 113 | 114 | def get_event_detection_parameters(self): 115 | """Get the full set of parameters related to event detection """ 116 | if self.__pore_model_old__ in self: # Old Minknow file 117 | xmldoc = minidom.parseString("".join(self[self.__pore_model_old__].value)) 118 | return dict(xmldoc.getElementsByTagName('event_detection')[0].attributes.items()) 119 | elif self.__pore_model_new__ in self: # New Minknow file 120 | result = "".join(self[self.__pore_model_new__].value) 121 | result = result.replace('true', 'True').replace('false', 'False') 122 | return ast.literal_eval(result)['event_detection'] 123 | 124 | 125 | def get_tracking_meta(self): 126 | """Get tracking meta data""" 127 | return _clean_attrs(self[self.__tracking_path__].attrs) 128 | 129 | 130 | def get_context_meta(self): 131 | """Get context meta""" 132 | return _clean_attrs(self[self.__context_path__].attrs) 133 | 134 | 135 | def has_raw(self, channel): 136 | """Return True if there is raw data for this channel.""" 137 | raw_location = self.__raw_data__.format(channel) 138 | return self._has_data(raw_location) 139 | 140 | 141 | def has_reads(self, channel): 142 | """Return True if there is read data for this channel.""" 143 | read_location = self.__read_data__.format(channel) 144 | return self._has_data(read_location) 145 | 146 | 147 | def has_states(self, channel): 148 | """Return True if there is State data for this channel.""" 149 | state_location = self.__state_data__.format(channel) 150 | return self._has_data(state_location) 151 | 152 | 153 | def _has_data(self, location): 154 | """Return true if the given data path exists 155 | 156 | :param location: str, path with fast5. 157 | """ 158 | if hasattr(self, '_cached_paths'): 159 | if location in self._cached_paths: 160 | return self._cached_paths[location] 161 | else: 162 | self._cached_paths = {} 163 | 164 | location_split = location.split('/') 165 | folder = '/'.join(location_split[:-1]) 166 | name = location_split[-1] 167 | present = folder in self and name in self[folder].keys() 168 | self._cached_paths[location] = present 169 | return present 170 | 171 | 172 | def _time_interval_to_index(self, channel, times): 173 | """Translate a tuple of (start_sec, end_sec) to an index.""" 174 | start_sec, end_sec = times 175 | start = self._seconds_to_index(channel, start_sec) 176 | end = self._seconds_to_index(channel, end_sec) 177 | return (start, end) 178 | 179 | 180 | def _seconds_to_index(self, channel, time): 181 | """Translate a point in time to an index.""" 182 | if time is None: 183 | return None 184 | 185 | return int(time * float(self.sample_rate)) 186 | 187 | 188 | def _scale(self, channel, data): 189 | """Scale event data if necessary, else return unchanged. 
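
        The conversion applied when scaling is required is
        ``pA = (raw + offset) * range / digitisation``; for example, with
        offset=10, range=2048 and digitisation=8192 (illustrative values),
        a stored mean of 390 becomes (390 + 10) * 0.25 = 100 pA.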
190 | 191 | If event metadata can't be found, assume events don't need scaling.""" 192 | 193 | meta_data = self.get_metadata(channel) 194 | 195 | if 'scaling_used' not in meta_data or meta_data.get('scaling_used'): 196 | return data 197 | else: 198 | channel_scale = meta_data['range'] / meta_data['digitisation'] 199 | channel_offset = meta_data['offset'] 200 | data['mean'] = (data['mean'] + channel_offset) * channel_scale 201 | return data 202 | 203 | 204 | def get_raw(self, channel, times=None, raw_indices=(None, None), use_scaling=True): 205 | """If available, parse channel raw data. 206 | 207 | :param channel: channel number int 208 | :param times: tuple of floats (start_second, end_second) 209 | :param raw_indices: tuple of ints (start_index, end_index) 210 | :param use_scaling: if True, scale the current level 211 | 212 | .. note:: 213 | Exactly one of the slice keyword arguments needs to be specified, 214 | as the method will override them in the order of times 215 | > raw_indices. 216 | """ 217 | 218 | if not self.has_raw(channel): 219 | raise KeyError('Channel {} does not contain raw data.'.format(channel)) 220 | 221 | if times is not None: 222 | raw_indices = self._time_interval_to_index(channel, times) 223 | 224 | raw_data = self.__raw_data__.format(channel) 225 | data = self[raw_data][raw_indices[0]:raw_indices[1]] 226 | 227 | if use_scaling: 228 | meta_data = self.get_metadata(channel) 229 | raw_unit = meta_data['range'] / meta_data['digitisation'] 230 | data = (data + meta_data['offset']) * raw_unit 231 | 232 | return data 233 | 234 | 235 | def get_events(self, channel, times=None, raw_indices=None, event_indices=(None, None), 236 | use_scaling=True): 237 | """Parse channel event data. 238 | 239 | :param channel: channel number int 240 | :param times: tuple of floats (start_second, end_second) 241 | :param raw_indices: tuple of ints (start_index, end_index) 242 | :param event_indices: tuple of ints (start_index, end_index) 243 | :param use_scaling: if True, scale the current level 244 | 245 | .. note:: 246 | Exactly one of the slice keyword arguments needs to be specified, 247 | as the method will override them in the order of times 248 | > raw_indices > event_indices. 249 | """ 250 | 251 | event_data = self.__event_data__.format(channel) 252 | ev = self[event_data] 253 | 254 | if times is not None: 255 | raw_indices = self._time_interval_to_index(channel, times) 256 | if raw_indices is not None: 257 | event_indices = np.searchsorted(ev['start'], raw_indices) 258 | data = _sanitize_data_for_reading(ev[event_indices[0]:event_indices[1]]) 259 | 260 | # Change variance to stdv column 261 | data['variance'] = np.sqrt(data['variance']) 262 | data.dtype.names = ['stdv' if n == 'variance' else n for n in data.dtype.names] 263 | 264 | if use_scaling: 265 | return self._scale(channel, data) 266 | else: 267 | return data 268 | 269 | 270 | def _get_reads_data(self, channel): 271 | """Parse channel read data exactly as it is in the bulk fast5 file. 272 | 273 | :param channel: channel number int 274 | 275 | .. note:: 276 | No processing is done - reads might span several rows. 277 | """ 278 | if not self.has_reads(channel): 279 | raise KeyError('Channel {} does not contain read data.'.format(channel)) 280 | 281 | return self[self.__read_data__.format(channel)] 282 | 283 | 284 | def get_reads(self, channel, transitions=False, multi_row_class='auto'): 285 | """Parse channel read data to yield details of reads. 
286 | 
287 |         :param channel: channel number int
288 |         :param transitions: if True, include transition reads
289 |         :param multi_row_class: options: 'auto', 'modal', 'penultimate', 'final'.
290 |             For reads which span multiple rows, use the classification from:
291 |             'auto': modal class if present, penultimate row if not
292 |             'modal': modal class if present
293 |             'penultimate': penultimate row
294 |             'final': final row.
295 |             Modal classification is not supported by very old versions of MinKNOW.
296 |         """
297 | 
298 |         multi_row_choices = {'auto', 'modal', 'penultimate', 'final'}
299 |         if multi_row_class not in multi_row_choices:
300 |             raise ValueError('''{} is not one of the permitted choices for
301 |                 multi_row_class. Permitted choices: {}.'''.format(multi_row_class, multi_row_choices))
302 | 
303 |         read_data = self._get_reads_data(channel)
304 | 
305 |         return_keys = {
306 |             'read_start', 'read_length',
307 |             'event_index_start', 'event_index_end', 'classification', 'read_id',
308 |             'median', 'median_sd', 'median_dwell', 'range', 'drift'
309 |         }
310 |         additional_keys = {'flags'}
311 |         computed_keys = {'drift'}
312 |         required_keys = return_keys.union(additional_keys).difference(computed_keys)
313 |         for key in required_keys:
314 |             if key not in read_data.dtype.names:
315 |                 raise KeyError('The read data did not contain the required key {}.'.format(key))
316 | 
317 |         if multi_row_class == 'modal':
318 |             if 'modal_classification' not in read_data.dtype.names:
319 |                 raise KeyError("The read data did not contain the key 'modal_classification'.")
320 | 
321 |         # classification is enumerated
322 |         enum_map = h5py.check_dtype(enum=read_data.dtype['classification'])
323 |         classes = _clean_attrs({v:k for k, v in enum_map.items()})
324 |         # read dataset into memory, lest we return h5py objects
325 |         read_data = read_data[()]
326 | 
327 |         # We need to combine 'event_index_start' and 'read_start' from the first
328 |         # row of the read with the sum of 'read_length' over all rows, and all
329 |         # other columns from the final row; the classification is chosen according
330 |         # to multi_row_class. We also calculate drift, the absolute difference
331 |         # between the 'local_median' of the first and last rows of a read.
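        # A rough illustration ('auto' mode, hypothetical values, no
        # 'modal_classification' column, classifications shown already decoded):
        #
        #   row  read_start  read_length  local_median  classification  flags & 0x1
        #   0    1000        50           80.1          'unknown'       1
        #   1    1050        30           81.0          'unknown'       1
        #   2    1080        20           79.5          'strand'        0  (read ends)
        #
        # yields a single read with read_start=1000, read_length=100,
        # classification 'unknown' (taken from the penultimate row) and
        # drift = |79.5 - 80.1| = 0.6.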
332 |         accum_stats = None
333 |         accum_names = ('event_index_start', 'read_start', 'read_length', 'classification')
334 |         for n, row in enumerate(read_data):
335 |             if accum_stats is None:
336 |                 accum_stats = {k:row[k] for k in accum_names}
337 |                 accum_stats['drift'] = 0
338 |                 first_local_median = row['local_median']
339 |             else:
340 |                 accum_stats['read_length'] += row['read_length']
341 | 
342 |             if multi_row_class == 'auto':  # use modal classification if column present, use penultimate if not
343 |                 if 'modal_classification' in read_data.dtype.names:
344 |                     accum_stats['classification'] = row['modal_classification']
345 |                 else:
346 |                     accum_stats['classification'] = read_data[n - 1]['classification']
347 | 
348 |             if multi_row_class == 'modal':  # use modal classification if column present
349 |                 accum_stats['classification'] = row['modal_classification']
350 | 
351 |             if multi_row_class == 'penultimate':  # use classification from previous row
352 |                 accum_stats['classification'] = read_data[n - 1]['classification']
353 | 
354 |             if multi_row_class == 'final':  # use classification from current row
355 |                 accum_stats['classification'] = row['classification']
356 | 
357 |             accum_stats['drift'] = abs(row['local_median'] - first_local_median)
358 | 
359 |             # pick out only the columns we want
360 |             row_details = {k:row[k] for k in return_keys - computed_keys}
361 | 
362 |             if row['flags'] & 0x1 == 0:
363 |                 # read has ended
364 |                 if classes[row['classification']] == 'transition' and not transitions:
365 |                     accum_stats = None  # prepare for next read
366 |                 else:
367 |                     for k in accum_stats:  # replace / add computed keys
368 |                         row_details[k] = accum_stats[k]
369 |                     row_details['classification'] = classes[row_details['classification']]
370 |                     yield _clean_attrs(row_details)
371 |                     accum_stats = None
372 | 
373 | 
374 |     def get_state_changes(self, channel):
375 |         """Parse channel state changes.
376 | 
377 |         :param channel: channel number int
378 |         """
379 |         if not self.has_states(channel):
380 |             raise KeyError('Channel {} does not contain state data.'.format(channel))
381 | 
382 |         if hasattr(self, '_cached_state_changes'):
383 |             if channel in self._cached_state_changes:
384 |                 return self._cached_state_changes[channel]
385 |         else:
386 |             self._cached_state_changes = {}
387 | 
388 |         # state data is enumerated
389 |         col = 'summary_state'
390 |         data = self[self.__state_data__.format(channel)]
391 |         enum_map = h5py.check_dtype(enum=data.dtype[col])
392 |         enum_to_state = _clean_attrs({v:k for k, v in enum_map.items()})
393 | 
394 |         # translate ints into strings
395 |         states = np.array([enum_to_state[key] for key in data[col]])
396 | 
397 |         try:
398 |             data = np.array(data['approx_raw_index'])
399 |         except ValueError:  # not a KeyError: see h5py/_hl/dataset.pyc in readtime_dtype(basetype, names)
400 |             data = np.array(data['acquisition_raw_index'])
401 |         if len(data) > 0:
402 |             data = data.astype([('approx_raw_index', data.dtype)], copy=False)
403 |             data = append_fields(data,
404 |                 ['approx_raw_index_end','summary_state'],
405 |                 [np.roll(data['approx_raw_index'], -1), states], usemask=False)
406 |             # set end of last state to something approximating infinity (the largest u64 int).
407 |             data['approx_raw_index_end'][-1] = -1
408 |         else:  # some channels don't contain channel state data, just create a dummy array
409 |             data = np.array([], dtype=[('approx_raw_index', '<u8'), ('approx_raw_index_end', '<u8'), ('summary_state', 'O')])
410 | 
411 |         self._cached_state_changes[channel] = data
412 |         return data
413 | 
414 | 
415 |     def get_state(self, channel, raw_index=None, time=None):
416 |         """Find the channel state at a given time or raw index.
417 | 
418 |         :param channel: channel number int
419 |         :param raw_index: sample index
420 |         :param time: time in seconds
421 |         :returns: the summary state name.
422 | 
423 |         .. note::
424 |             Exactly one of the slice keyword arguments needs to be specified,
425 |             as the method will override them in the order of times
426 |             > raw_indices.
427 | """ 428 | assert (time is not None) or (raw_index is not None), 'Need either a time or a raw_index argument' 429 | if time is not None: 430 | raw_index = self._seconds_to_index(channel, time) 431 | 432 | data = self.get_state_changes(channel) 433 | 434 | # Check if the requested index is before the first state entry 435 | if raw_index < data['approx_raw_index'][0]: 436 | msg = 'No state data at index {}, which is before first state at {}' 437 | raise RuntimeError(msg.format(raw_index, data['approx_raw_index'][0])) 438 | 439 | # Now get last record before requested sample, handling the special case 440 | # where there is no last record (i.e. if raw_index == 0) 441 | if raw_index == 0: 442 | i = 0 443 | else: 444 | i = np.searchsorted(data['approx_raw_index'], raw_index) - 1 445 | 446 | state = data['summary_state'][i] 447 | 448 | return state 449 | 450 | 451 | def get_states_in_window(self, channel, times=None, raw_indices=None): 452 | """Find all channel states within a time window. 453 | 454 | :param channel: channel number int 455 | :param times: tuple of floats (start_second, end_second) 456 | :param raw_indices: tuple of ints (start_index, end_index) 457 | 458 | .. note:: 459 | Exactly one of the slice keyword arguments needs to be specified, 460 | as the method will override them in the order of times 461 | > raw_indices. 462 | """ 463 | 464 | assert (times is not None) or (raw_indices is not None), 'Need either a time or a raw_index argument' 465 | if times is not None: 466 | raw_indices = self._seconds_to_index(channel, times[0]), self._seconds_to_index(channel, times[1]) 467 | states = self.get_state_changes(channel) 468 | first_state, last_state = np.searchsorted(states['approx_raw_index'], raw_indices, side='right') 469 | return np.unique(states['summary_state'][first_state-1:last_state]) 470 | 471 | 472 | def get_mux(self, channel, raw_index=None, time=None, wells_only=False, return_raw_index=False): 473 | """Find the multiplex well_id ("the mux") at a given time 474 | 475 | :param channel: channel number int 476 | :param raw_index: sample index 477 | :param time: time in seconds 478 | :wells_only: bool, if True, ignore changes to mux states not in [1,2,3,4] 479 | and hence return the last well mux. 480 | :return_raw_index: bool, if True, return tuple (mux, raw_index), raw_index being 481 | raw index when the mux was set. 482 | 483 | .. note:: 484 | There are multiple mux states associated with each well (e.g. common_voltage_1 and unblock_volage_1). 485 | Here, we return the well_id associated with the mux state (using self.enum_to_mux), i.e. 1 in both these cases. 486 | 487 | Exactly one of the slice keyword arguments needs to be specified, 488 | as the method will override them in the order of times 489 | > raw_indices. 490 | """ 491 | assert (time is not None) or (raw_index is not None), 'Need either a time or a raw_index argument' 492 | if time is not None: 493 | raw_index = self._seconds_to_index(channel, time) 494 | 495 | data = self.get_mux_changes(channel, wells_only=wells_only) 496 | 497 | # Check if the requested index is before the first mux entry 498 | if raw_index < data['approx_raw_index'][0]: 499 | msg = 'No mux data at index {}, which is before first mux at {}' 500 | raise RuntimeError(msg.format(raw_index, data['approx_raw_index'][0])) 501 | 502 | # Now get last record before requested sample, handling the special case 503 | # where there is no last record (i.e. 
if raw_index == 0) 504 | if raw_index == 0: 505 | i = 0 506 | else: 507 | i = np.searchsorted(data['approx_raw_index'], raw_index) - 1 508 | 509 | mux = self.enum_to_mux[data[i]['well_id']] 510 | 511 | if return_raw_index: 512 | raw_index = data[i]['approx_raw_index'] # when the mux was set 513 | return mux, raw_index 514 | else: 515 | return mux 516 | 517 | 518 | @staticmethod 519 | def _strip_metadata(data): 520 | """Strip dtype.metadata dicts from enumerated arrays. 521 | 522 | :param data: structured np.array 523 | :returns: view of the same data with the metadata removed. 524 | 525 | .. note:: 526 | since h5py v 2.3, enumerated dtypes come with a dtype.metadata dict 527 | see https://github.com/numpy/numpy/issues/6771 and 528 | https://github.com/h5py/h5py/pull/355/commits/5da2e96942218ffb1c9b614be9be8409bea219f8 529 | This can stop functions like recfunctions.append_fields working on 530 | these arrays, so strip out this dict. as it's not writeable, just 531 | create a view with the appropriate data type 532 | """ 533 | d = [] 534 | for col, str_type in dtype_descr(data): 535 | if not isinstance(str_type, str) and isinstance(str_type[1], dict) and 'enum' in str_type[1]: 536 | str_type = str_type[0] 537 | d.append((col, str_type)) 538 | return data.view(np.dtype(d)) 539 | 540 | 541 | def get_mux_changes(self, channel, wells_only=False): 542 | """Get changes in multiplex settings for given channel. 543 | 544 | :param channel: channel for which to fetch data 545 | :wells_only: bool, if True, ignore changes to mux states not in [1,2,3,4] 546 | 547 | .. note:: 548 | There are multiple mux states associated with each well (e.g. 1:common_voltage_1 and 6:unblock_voltage_1). 549 | Here, we return mux state numbers, e.g. 1 and 6, which can be linked to the well_id using self.enum_to_mux 550 | """ 551 | if hasattr(self, '_cached_mux_changes'): 552 | if channel in self._cached_mux_changes[wells_only]: 553 | return self._cached_mux_changes[wells_only][channel] 554 | else: 555 | # cache mux changes separately for well_only True and False 556 | self._cached_mux_changes = {True: {}, False: {}} 557 | 558 | enum_col = 'well_id' 559 | multiplex_data = self.__multiplex_data__.format(channel) 560 | data = self[multiplex_data] 561 | enum = _clean_attrs(h5py.check_dtype(enum=data.dtype[enum_col])) 562 | assert enum == self.__mk1_asic_mux_states__, 'Got unexpected multiplex states' 563 | 564 | if not hasattr(self, "enum_to_mux"): 565 | # Build a dict which relates enum values to mux. 566 | self.enum_to_mux = {} 567 | for k, v in enum.items(): 568 | mux = 0 569 | mo = re.search(r'(\d)$', k) 570 | if mo is not None: 571 | mux = int(mo.group(0)) 572 | self.enum_to_mux[v] = mux 573 | data = data[()] # load into memory 574 | data = self._strip_metadata(data) # remove dtype.metadata dict present with h5py>=2.3.0 575 | 576 | # remove any rows where the mux state has not changed 577 | data = get_changes(data, ignore_cols=('approx_raw_index',)) 578 | 579 | if wells_only: # only consider changes to wells in [1,2,3,4] 580 | wells = [1, 2, 3, 4] 581 | mask = np.in1d(data['well_id'], wells) 582 | mask[0] = True # keep first mux, whatever it is 583 | data = data[mask] 584 | self._cached_mux_changes[wells_only][channel] = data 585 | return data 586 | 587 | 588 | def get_mux_changes_in_window(self, channel, times=None, raw_indices=None): 589 | """Find all mux changes within a time window. 
590 | 591 | :param channel: channel number int 592 | :param times: tuple of floats (start_second, end_second) 593 | :param raw_indices: tuple of ints (start_index, end_index) 594 | 595 | .. note:: 596 | There are multiple mux values associated with each well (e.g. 1:common_voltage_1 and 6:unblock_voltage_1). 597 | Here, we return mux values, e.g. 1 and 6, which can be linked to the well_id using self.enum_to_mux. 598 | 599 | Exactly one of the slice keyword arguments needs to be specified, 600 | as the method will override them in the order of times 601 | > raw_indices. 602 | """ 603 | 604 | assert (times is not None) or (raw_indices is not None), 'Need either a time or a raw_index argument' 605 | if times is not None: 606 | raw_indices = self._seconds_to_index(channel, times[0]), self._seconds_to_index(channel, times[1]) 607 | muxes = self.get_mux_changes(channel) 608 | first_mux, last_mux = np.searchsorted(muxes['approx_raw_index'], raw_indices, side='right') 609 | return muxes[first_mux-1:last_mux] 610 | 611 | 612 | def get_waveform_timings(self): 613 | """Extract the timings of the waveforms (if any). 614 | 615 | :returns: list of tuples of start and end times 616 | """ 617 | mux_timings = [] 618 | on_index = None 619 | for i in range(0, len(self["Device"]["AsicCommands"])): 620 | if self._waveform_enabled(i): 621 | on_index = self["Device"]["AsicCommands"][i]["frame_number"] 622 | elif on_index is not None: 623 | # when _waveform_enabled(i) returns to False, save on and off 624 | # timings 625 | off_index = self["Device"]["AsicCommands"][i]["frame_number"] 626 | on_time = on_index / self.sample_rate 627 | off_time = off_index / self.sample_rate 628 | mux_timings.append((on_time, off_time)) 629 | on_index = None 630 | return mux_timings 631 | 632 | 633 | def _waveform_enabled(self, cmd_index): 634 | """Checks AsicCommand history to see if the waveform command was issued. 635 | 636 | .. note:: 637 | Here is the relevant section of the engineering documentation. 638 | engineering documentation (July 2015 version) 639 | 640 | Settings from PC: 512 bytes 641 | 1. Equals 17 otherwise FPGA drops the parcel 642 | 2. Command for FPGA: 643 | =1 load configuration data in ASIC 644 | =2 begin reading data from ASIC 645 | =3 reset ASIC chip 646 | =5 load configuration and begin/continue reading - used for 647 | real-time re-loading ASIC configuration 648 | 3. 4bit: enable zero supply voltage for Fan ('1'- Fan can be switched 649 | off completely, '0'- Fan is always On) 650 | 3bit: temperature control On/Off ('1' - On, '0' - Off) 651 | 2-1 bits: Fan speed control ('00' - Off, '11' - On 652 | (only when temperature control is off)) 653 | 0 bit: soft temperature control ('1' - On, '0' - Off) 654 | 4. 0bit: On/Off ASIC analogue supply voltage ('0' - off, '1' - on) 655 | 5. ASIC clock: '000' - 64MHz, '001' - 128MHz, '010' - 32MHz, 656 | '100' - 16MHz, '110' - 8MHz 657 | 6. 3 bit: Enable ('1' - on, '0' - off) channel mapping (channel 658 | sequence 0,1...510,511) for 512 channels mode 659 | 2 bit: Enable ('1' - on, '0' - off) ASIC configuration update every 660 | 1ms with values for bias voltage from LUT 661 | 1-0 bits: Number of channels from ASIC: '00' - 128ch, 662 | '01'-256ch, '10' - 512ch 663 | """ 664 | 665 | waveform_flag = self["Device"]["AsicCommands"][cmd_index]["command"].tostring()[5] 666 | # if cmd is not a bytestring, convert waveform flag to an integer. 
Needed for python2.x compatibility 667 | if not isinstance(waveform_flag, int): 668 | waveform_flag = ord(waveform_flag) 669 | waveform_enabled = waveform_flag & 4 != 0 670 | return waveform_enabled 671 | 672 | 673 | def get_voltage(self, times=None, raw_indices=(None, None), use_scaling=True): 674 | """Extracts raw common electrode trace 675 | 676 | :raw_indices: tuple of ints to limit section of voltage data loaded. 677 | :use_scaling: bool, whether to scale voltage data. If no scaling meta is found, 678 | scale by -5 (as appropriate for MinION). 679 | :return: voltage as array (including 5x multiplyer for MinKnow) 680 | """ 681 | if times is not None: 682 | raw_indices = self._time_interval_to_index(self.channels[0], times) 683 | 684 | voltages = self[self.__voltage_data__ 685 | ][raw_indices[0]:raw_indices[1]]['bias_voltage'] 686 | if use_scaling: 687 | # fast5 converted from ABF files have a voltage meta section 688 | # containing scaling parameters 689 | if self.__voltage_meta__ in self: 690 | voltage_meta = _clean_attrs(self[self.__voltage_meta__].attrs) 691 | unit = voltage_meta['range'] / voltage_meta['digitisation'] 692 | offset = voltage_meta['offset'] 693 | else: 694 | # Assume MinION scaling of 5 695 | unit = -5 696 | offset = 0 697 | voltages = (voltages + offset) * unit 698 | 699 | return voltages 700 | 701 | 702 | def get_bias_voltage_changes(self): 703 | """Get changes in the bias voltage. 704 | 705 | .. note:: 706 | For a long (-long-long) time the only logging of the common 707 | electrode voltage was the experimental history (accurate to one 708 | second). The addition of the voltage trace changed this, but this 709 | dataset is cumbersome. MinKnow 1.x(.3?) added the asic command 710 | history which is typically much shorter and therefore quicker to 711 | query. The bias voltage is numerously record. For MinION asics 712 | there is typically a -5X multiplier to convert the data into 713 | correct units with the sign people are used to. 714 | """ 715 | if hasattr(self, '_cached_voltage_changes'): 716 | return self._cached_voltage_changes 717 | 718 | # First try the asic command, fallback to the experimental history, 719 | # and finally the voltage trace. 
720 | try: 721 | self._cached_voltage_changes = self._bias_from_asic_commands() 722 | except: 723 | try: 724 | self._cached_voltage_changes = self._bias_from_exp_hist() 725 | except: 726 | try: 727 | self._cached_voltage_changes = self._bias_from_voltages() 728 | except: 729 | raise RuntimeError('Cannot parse voltage changes.') 730 | 731 | return self._cached_voltage_changes 732 | 733 | 734 | def _bias_from_voltages(self): 735 | """Extract voltage changes from the voltage trace data.""" 736 | 737 | voltages = self.get_voltage() 738 | changes = np.where(voltages[:-1] != voltages[1:])[0] 739 | 740 | voltage_changes = np.empty( 741 | len(changes) + 1, 742 | dtype=[('time', float), ('set_bias_voltage', int)] 743 | ) 744 | voltage_changes['time'][0] = voltages[0] 745 | voltage_changes['time'][1:] = changes 746 | voltage_changes['time'] /= self.sample_rate 747 | voltage_changes['set_bias_voltage'] = voltages[0] 748 | voltage_changes['set_bias_voltage'][1:] = voltages[changes] 749 | return voltage_changes 750 | 751 | 752 | def _bias_from_asic_commands(self): 753 | """Extract voltages in Asic commands, filtering to only changes.""" 754 | 755 | all_voltages = [AsicBCommand(cmd).configuration.bias_voltage 756 | for cmd in self['/Device/AsicCommands']['command'] 757 | ] 758 | all_frames = self['/Device/AsicCommands']['frame_number'] 759 | 760 | prev_voltage = all_voltages[0] 761 | changes = [(all_frames[0], prev_voltage)] 762 | for frame, voltage in itertools.izip(all_frames[1:], all_voltages[1:]): 763 | if voltage != prev_voltage: 764 | changes.append((frame, voltage)) 765 | 766 | voltage_changes = np.array( 767 | changes, 768 | dtype=[('time', float), ('set_bias_voltage', int)] 769 | ) 770 | voltage_changes['time'] /= self.sample_rate 771 | voltage_changes['set_bias_voltage'] *= -5 772 | return voltage_changes 773 | 774 | 775 | def _bias_from_exp_hist(self): 776 | """Extract voltage changes from experimental history. 777 | 778 | ..note:: The experimental history is deprecated in MinKnow 1.3 779 | """ 780 | if self.parsed_exp_history is None: 781 | self.parse_history() 782 | voltage_changes = self.parsed_exp_history['set_bias_voltage'] 783 | voltage_changes['set_bias_voltage'] *= -1 784 | return voltage_changes 785 | 786 | 787 | def get_bias_voltage_changes_in_window(self, times=None, raw_indices=None): 788 | """Find all mux voltage changes within a time window. 789 | 790 | :param times: tuple of floats (start_second, end_second) 791 | :param raw_indices: tuple of ints (start_index, end_index) 792 | 793 | .. note:: 794 | This is the bias voltage from the expt history (accurate to 1 795 | second), and will not include any changes in voltage related to 796 | waveforms. For the full voltage trace, use get_voltage. 797 | 798 | Exactly one of the slice keyword arguments needs to be specified, 799 | as the method will override them in the order of times 800 | > raw_indices. 
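
        Example (a sketch; ``fh`` is an open BulkFast5 and the window is
        illustrative)::

            changes = fh.get_bias_voltage_changes_in_window(times=(0, 300))
            # structured array with 'time' (seconds) and 'set_bias_voltage' fields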
801 | """ 802 | 803 | assert (times is not None) or (raw_indices is not None), 'Need either a time or a raw_index argument' 804 | if times is None: 805 | times = float(raw_indices[0]) / self.sample_rate, float(raw_indices[1]) / self.sample_rate 806 | bias_voltage_changes = self.get_bias_voltage_changes() 807 | first_index, last_index = np.searchsorted(bias_voltage_changes['time'], times, side='right') 808 | return bias_voltage_changes[first_index:last_index] 809 | 810 | 811 | __engine_states__ = { 812 | 'minion_asic_temperature': float, 813 | 'minion_heatsink_temperature': float, 814 | 'set_bias_voltage': float, 815 | 'fan_speed': int 816 | } 817 | __temp_fields__ = ('heatsink', 'asic') 818 | 819 | 820 | def parse_history(self): 821 | """Parse the experimental history to pull out various environmental factors. 822 | The functions below are quite nasty, don't enquire too hard. 823 | """ 824 | try: 825 | exph_fh = StringIO(str(self['Meta/User']['experimental_history'][:].tostring().decode())) 826 | except Exception: 827 | raise RuntimeError('Cannot read experimental_history from fast5') 828 | 829 | data = defaultdict(list) 830 | for item in self._iter_records(exph_fh): 831 | #item should contain 'time' and something else 832 | time = item['time'] 833 | field, value = next((k, v) for k, v in item.items() if k != 'time') 834 | data[field].append((time, value)) 835 | 836 | self.parsed_exp_history = { 837 | k:np.array(data[k], dtype=[('time', float), (k, self.__engine_states__[k])]) 838 | for k in data.keys() 839 | } 840 | return self 841 | 842 | 843 | def get_engine_state(self, state, time=None): 844 | """Get changes in an engine state or the value of an engine 845 | state at a given time. 846 | 847 | :param state: the engine state to retrieve. 848 | :param time: the time at which to grab engine state. 849 | """ 850 | if state not in self.__engine_states__: 851 | raise RuntimeError("'field' argument must be one of {}.".format(self.__engine_states__.keys())) 852 | 853 | if self.parsed_exp_history is None: 854 | self.parse_history() 855 | 856 | states = self.parsed_exp_history[state] 857 | if time is None: 858 | return states 859 | else: 860 | i = np.searchsorted(states['time'], time) - 1 861 | return states[state][i] 862 | 863 | 864 | def get_temperature(self, time=None, field=__temp_fields__[0]): 865 | if field not in self.__temp_fields__: 866 | raise RuntimeError("'field' argument must be one of {}.".format(self.__temp_fields__)) 867 | 868 | return self.get_engine_state('minion_{}_temperature'.format(field), time) 869 | 870 | 871 | def _iter_records(self, exph_fh): 872 | """Parse an iterator over file-like object representing 873 | an experimental history. 874 | """ 875 | for line in exph_fh: 876 | mo = re.match(r'.*:\s+Expt time: (\d+)s:? (.*)', line) 877 | if mo: 878 | time, msg = mo.groups() 879 | rec = self._parse_line(msg) 880 | if rec: 881 | key, value = rec 882 | yield {'time': int(time), key:value} 883 | 884 | 885 | def _parse_line(self, msg): 886 | """Check if a line of experimental history records 887 | a change in the engine state. 888 | """ 889 | mo = re.match(r'Experimental EngineState: (.*)', msg) 890 | if mo: 891 | msg2 = mo.group(1) 892 | return self._parse_engine_state(msg2) 893 | 894 | 895 | def _parse_engine_state(self, msg): 896 | """Extract engine state and value from a line of 897 | experimental history. 
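
        For example, the message ``set_bias_voltage is now -180`` (an
        illustrative value) would be returned as ``('set_bias_voltage', '-180')``.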
898 | """ 899 | mo = re.match(r'(\w+) is now (.*)', msg) 900 | if mo: 901 | key, value = mo.group(1), mo.group(2) 902 | if key in self.__engine_states__: 903 | return key, value 904 | 905 | 906 | def _add_attrs(self, data, location, convert=None): 907 | """Convenience method for adding attrs to a possibly new group. 908 | :param data: dict of attrs to add 909 | :param location: hdf path 910 | :param convert: function to apply to all dictionary values 911 | """ 912 | self.__add_attrs(self, data, location, convert=None) 913 | 914 | 915 | @staticmethod 916 | def __add_attrs(self, data, location, convert=None): 917 | """Implementation of _add_attrs as staticmethod. This allows 918 | functionality to be used in .New() constructor but is otherwise nasty! 919 | """ 920 | if location not in self: 921 | self.create_group(location) 922 | attrs = self[location].attrs 923 | for k, v in data.items(): 924 | if convert is not None: 925 | attrs[_sanitize_data_for_writing(k)] = _sanitize_data_for_writing(convert(v)) 926 | else: 927 | attrs[_sanitize_data_for_writing(k)] = _sanitize_data_for_writing(v) 928 | 929 | 930 | def _add_numpy_table(self, data, location): 931 | data = _sanitize_data_for_writing(data) 932 | self.create_dataset(location, data=data, compression=True) 933 | 934 | 935 | @classmethod 936 | def New(cls, fname, read='a', tracking_id={}, context_tags={}, channel_id={}): 937 | """Construct a fresh bulk file, with meta data written to 938 | standard locations. There is currently no checking this meta data. 939 | TODO: Add meta data checking. 940 | 941 | """ 942 | 943 | # Start a new file, populate it with meta 944 | with h5py.File(fname, 'w') as h: 945 | h.attrs[_sanitize_data_for_writing('file_version')] = _sanitize_data_for_writing(1.0) 946 | for data, location in zip( 947 | [tracking_id, context_tags], 948 | [cls.__tracking_path__, cls.__context_path__] 949 | ): 950 | # see cjw's comment in fast5.py: 951 | # 'no idea why these must be str, just following ossetra' 952 | cls.__add_attrs(h, data, location, convert=str) 953 | 954 | # return instance from new file 955 | return cls(fname, read) 956 | 957 | 958 | def set_raw(self, raw, channel, meta=None): 959 | """Set the raw data in file. 
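
        Example (a sketch; ``fh`` is a writable BulkFast5 and the metadata
        values are illustrative)::

            meta = {'description': 'channel 1', 'digitisation': 8192.0, 'offset': 10.0,
                    'range': 2048.0, 'sample_rate': 4000.0}
            fh.set_raw(np.zeros(1000, dtype=np.int16), channel=1, meta=meta)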
960 | 961 | :param raw: raw data to add 962 | :param channel: channel number 963 | """ 964 | req_keys = ['description', 'digitisation', 'offset', 'range', 965 | 'sample_rate'] 966 | 967 | meta = {k:v for k,v in meta.items() if k in req_keys} 968 | if len(meta.keys()) != len(req_keys): 969 | raise KeyError( 970 | 'Raw meta data must contain keys: {}.'.format(req_keys) 971 | ) 972 | 973 | raw_folder = '/'.join(self.__raw_data__.format(channel).split('/')[:-1]) 974 | raw_data_path = self.__raw_data__.format(channel) 975 | self._add_attrs(meta, raw_folder) 976 | self[raw_data_path] = raw 977 | 978 | 979 | def set_events(self, data, meta, channel): 980 | """Write event data to file 981 | 982 | :param data: event data 983 | :param meta: meta data to attach to read 984 | :param read_number: per-channel read counter 985 | """ 986 | req_meta_keys = ['description', 'digitisation', 'offset', 'range', 987 | 'sample_rate'] 988 | if not set(req_meta_keys).issubset(meta.keys()): 989 | raise KeyError( 990 | 'Read meta does not contain required fields: {}, got {}'.format( 991 | req_fields, meta.keys() 992 | ) 993 | ) 994 | req_event_fields = [ 995 | 'start', 'length', 'mean', 'variance' 996 | ] 997 | if not isinstance(data, np.ndarray): 998 | raise TypeError('Data is not ndarray.') 999 | 1000 | # if data contains 'stdv', square this to get the variance 1001 | # seemingly bulk fast5 files contain variance and not stdv, as 1002 | # taking the sqrt would be slow on minknow. 1003 | names = list(data.dtype.names) 1004 | for i, name in enumerate(names): 1005 | if name == 'stdv': 1006 | names[i] = 'variance' 1007 | data['stdv'] = np.square(data['stdv']) 1008 | data.dtype.names = names 1009 | 1010 | if not set(req_event_fields).issubset(data.dtype.names): 1011 | raise KeyError( 1012 | 'Read data does not contain required fields: {}, got {}.'.format( 1013 | req_event_fields, data.dtype.names 1014 | ) 1015 | ) 1016 | 1017 | event_meta_path = self.__channel_meta__.format(channel) 1018 | self._add_attrs(meta, event_meta_path) 1019 | 1020 | uint_fields = ('start', 'length') 1021 | dtype = np.dtype([( 1022 | d[0], 'uint32') if d[0] in uint_fields else d 1023 | for d in dtype_descr(data) 1024 | ]) 1025 | 1026 | # If the data is not an int or uint we assume it is in seconds and scale 1027 | # appropriately 1028 | if data['start'].dtype.kind not in ['i', 'u']: 1029 | data['start'] *= meta['sample_rate'] 1030 | data['length'] *= meta['sample_rate'] 1031 | 1032 | events_path = self.__event_data__.format(channel) 1033 | self._add_numpy_table( 1034 | data.astype(dtype), events_path 1035 | ) 1036 | 1037 | 1038 | def set_voltage(self, data, meta): 1039 | req_keys = ['description', 'digitisation', 'offset', 'range', 1040 | 'sample_rate'] 1041 | meta = {k:v for k,v in meta.items() if k in req_keys} 1042 | if len(meta.keys()) != len(req_keys): 1043 | raise KeyError( 1044 | 'Raw meta data must contain keys: {}.'.format(req_keys) 1045 | ) 1046 | 1047 | self._add_attrs(meta, self.__voltage_meta__) 1048 | dtype = np.dtype([('bias_voltage', np.int16)]) 1049 | self._add_numpy_table( 1050 | data.astype(dtype, copy=False), self.__voltage_data__ 1051 | 1052 | ) 1053 | 1054 | 1055 | # 1056 | # Taken from minknow/asicb_command/__init__.py 1057 | # 1058 | class AsicBConfiguration(object): 1059 | """Wrapper around the asicb configuration struct passed to the asicb over usb""" 1060 | def __init__(self, config): 1061 | self.data = str(config) 1062 | # Interpret as bytes... 
1063 |         self.bytes = np.frombuffer(self.data, dtype="u1")
1064 |         # ...with reverse bit order
1065 |         self.bits = np.unpackbits(self.bytes[::-1])[::-1].copy()
1066 | 
1067 | 
1068 |     @property
1069 |     def bias_voltage(self):
1070 |         val = self.int_at(129, 121)
1071 |         if val > 256:
1072 |             return 256 - val
1073 |         return val
1074 | 
1075 | 
1076 |     def active_mux(self, channel):
1077 |         """
1078 |         Gets the active mux for the specified channel
1079 |         :param channel: 0 based
1080 |         """
1081 |         first_bit_channel_0 = 211  # bit of mux state for channel 0
1082 |         mux_state_size = 4
1083 |         requested_channel_first_bit = first_bit_channel_0 + mux_state_size * channel
1084 |         return self.int_at(requested_channel_first_bit + mux_state_size - 1, requested_channel_first_bit)
1085 | 
1086 | 
1087 |     def int_at(self, start, end):
1088 |         bits = self.bits_at(start, end)
1089 |         num = 0
1090 |         for on in reversed(bits):
1091 |             num = num << 1
1092 |             if on:
1093 |                 num |= 1
1094 |         return num
1095 | 
1096 | 
1097 |     def bits_at(self, start, end):
1098 |         return self.bits[end:start+1]
1099 | 
1100 | 
1101 | class AsicBCommand(object):
1102 |     """Wrapper around the asicb command structure"""
1103 |     def __init__(self, command):
1104 |         self.data = str(command)
1105 |         self._configuration = AsicBConfiguration(self.data[10:])
1106 |         self.bytes = np.frombuffer(self.data, dtype="u1")
1107 | 
1108 |         if self.bytes[0] != 17:
1109 |             raise Exception("Invalid command - magic byte was '{}', expected '17'"
1110 |                             .format(self.bytes[0]))
1111 | 
1112 | 
1113 |     @property
1114 |     def min_temperature(self):
1115 |         return self.bytes[7]
1116 | 
1117 | 
1118 |     @property
1119 |     def max_temperature(self):
1120 |         return self.bytes[8]
1121 | 
1122 | 
1123 |     @property
1124 |     def configuration(self):
1125 |         return self._configuration
1126 | 
--------------------------------------------------------------------------------
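A minimal usage sketch for the bulk-file reading API above (the filename and time window are hypothetical, and the file is assumed to contain raw data for its first channel; ``BulkFast5`` subclasses ``h5py.File``, so it can be used as a context manager):

    from fast5_research import BulkFast5

    with BulkFast5('example_bulk.fast5') as fh:
        channel = fh.channels[0]
        raw = fh.get_raw(channel, times=(0.0, 10.0))        # scaled current trace
        events = fh.get_events(channel, times=(0.0, 10.0))  # scaled event table
        for read in fh.get_reads(channel):
            print('{} {} {}'.format(read['read_id'], read['classification'], read['read_length']))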