├── .pydevproject
├── CHANGELOG.md
├── LICENSE.md
├── MANIFEST.in
├── README.rst
├── docs
    ├── Makefile
    └── source
    │   ├── conf.py
    │   └── index.rst
├── img
    └── ONT_logo.png
├── ont_fast5_api
    ├── __init__.py
    ├── analysis_tools
    │   ├── __init__.py
    │   ├── alignment.py
    │   ├── base_tool.py
    │   ├── basecall_1d.py
    │   ├── basecall_2d.py
    │   ├── event_detection.py
    │   └── segmentation.py
    ├── compression_settings.py
    ├── conversion_tools
    │   ├── __init__.py
    │   ├── check_file_compression.py
    │   ├── compress_fast5.py
    │   ├── conversion_utils.py
    │   ├── demux_fast5.py
    │   ├── fast5_subset.py
    │   ├── multi_to_single_fast5.py
    │   └── single_to_multi_fast5.py
    ├── data_sanitisation.py
    ├── fast5_file.py
    ├── fast5_info.py
    ├── fast5_interface.py
    ├── fast5_read.py
    ├── helpers.py
    ├── multi_fast5.py
    ├── static_data.py
    └── vbz_plugin
    │   ├── libvbz_hdf_plugin.dylib
    │   ├── libvbz_hdf_plugin_aarch64.so
    │   ├── libvbz_hdf_plugin_m1.dylib
    │   ├── libvbz_hdf_plugin_x86_64.so
    │   └── vbz_hdf_plugin.dll
├── setup.py
└── test
    ├── __init__.py
    ├── data
        ├── basecall_2d_file_v1.0.fast5
        ├── hardlink
        │   ├── single_reads
        │   │   ├── 00031f3e-415c-4ab5-9c16-fb6fe45ff519.fast5
        │   │   ├── 000c0b4e-46c2-4fb5-9b17-d7031eefb975.fast5
        │   │   ├── 000ebd63-3e1a-4499-9ded-26af3225a022.fast5
        │   │   ├── 002ad0e4-c6bb-4eff-a30f-5fec01475ab8.fast5
        │   │   ├── 002b0891-03bf-4622-ae66-ae6984890ed4.fast5
        │   │   ├── 0048058c-ecb4-4a0f-b283-9a128bd598c5.fast5
        │   │   ├── 004a87b0-c9f6-4237-b4d6-466ab979aee2.fast5
        │   │   └── 0059d270-3238-4413-b38b-f588e28326df.fast5
        │   └── unlinked
        │   │   └── batch0.fast5
        ├── multi_read
        │   └── batch_0.fast5
        ├── multi_read_analyses
        │   └── batch_0.fast5
        ├── read_file_v0.6_raw.fast5
        ├── read_file_v0.6_single.fast5
        ├── read_file_v1.0_single.fast5
        ├── rle_basecall_table
        │   └── rle_example.fast5
        ├── single_read_analyses
        │   └── read.fast5
        ├── single_reads
        │   ├── fe85b517-62ee-4a33-8767-41cab5d5ab39.fast5
        │   ├── fe8a3026-d1f4-46b3-8daa-e610f27acde1.fast5
        │   ├── fe9374ee-b86a-4ca4-81dc-ac06e3297728.fast5
        │   └── read0.fast5
        ├── summaries
        │   └── two_barcode_summary.txt
        ├── telemetry_test.fast5
        └── vbz_reads
        │   └── vbz_reads.fast5
    ├── helpers.py
    ├── test_alignment_tools.py
    ├── test_basecall_1d_tools.py
    ├── test_basecall_2d_tools.py
    ├── test_check_compression.py
    ├── test_compress_fast5.py
    ├── test_compression_settings.py
    ├── test_data_sanitisation.py
    ├── test_demux_fast5.py
    ├── test_event_detection_tools.py
    ├── test_fast5_conversion_utils.py
    ├── test_fast5_converter.py
    ├── test_fast5_file.py
    ├── test_fast5_interface.py
    ├── test_fast5_subset.py
    ├── test_hardlink_metadata.py
    ├── test_multi_fast5.py
    └── test_segmentation_tools.py


/.pydevproject:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <?eclipse-pydev version="1.0"?>
3 | 
4 | <pydev_project>
5 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
6 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
7 | </pydev_project>
8 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # Changelog
  2 | All notable changes and fixes to ont_fast5_api will be documented here
  3 | 
  4 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
  5 | This project (aspires to) adhere to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
  6 | 
  7 | ## [4.1.3]
  8 | 
  9 | ### Added
 10 | - Support for python up to 3.12
 11 | 
 12 | ## [4.1.2]
 13 | 
 14 | ### Added
 15 | - Support for h5py>=3.9
 16 | ### Changed
 17 | - Renamed VBZ compression settings to make it clearer which version is used in production
 18 | ### Removed
 19 | - Support for python3.6
 20 | 
 21 | ## [4.1.1]
 22 | 
 23 | ### Fixed
 24 | - Compatibility with numpy==1.24 unicode type
 25 | 
 26 | ### Changed
 27 | - Updated Windows VBZ Plugin dll
 28 | 
 29 | ## [4.1.0]
 30 | 
 31 | ### Added
 32 | - Support for fast5_api on macOS-M1
 33 | 
 34 | ## [4.0.2]
 35 | 
 36 | ### Fixed
 37 | - Fixed Fast5Read import error
 38 | 
 39 | ## [4.0.1]
 40 | 
 41 | ### Changed
 42 | - Fixed unresolved reference in `compress_fast5.py`
 43 | - Fixed issue with `compress_fast5.py` not retaining enumeration metadata for the end_reason attribute
 44 | - Increased minimum h5py version to 2.10
 45 | 
 46 | ## [4.0.0]
 47 | 
 48 | ### Added
 49 | - Script `demux_fast5` for demultiplexing fast5 reads based on column in summary file, e.g. for barcoded experiments 
 50 | 
 51 | ### Removed
 52 | - Removed deb builds which are no longer supported
 53 | - Python3.5 support 
 54 | 
 55 | ## [3.3.0] 2021-02-17
 56 | 
 57 | ### Added
 58 | - Added `yield_fast5_reads` to conversion_tools.
 59 | 
 60 | ## [3.2.0] 2021-01-28
 61 | 
 62 | ### Changed
 63 | - Dropped support for older h5py/numpy versions, min now h5py>=2.8, numpy>=1.16
 64 | - fast5_subset now displays errors (but continues processing) when it encounters input fast5 files it can't read.
 65 | 
 66 | ### Added
 67 | - Add support for explicitly specifying file drivers when loading
 68 |   multi-read fast5 files.
 69 | 
 70 | ## [3.1.6] 2020-08-20
 71 | ### Added
 72 | - `compress_fast5` now has a `--sanitize` option to remove optional groups.
 73 | 
 74 | ### Fixed
 75 | - Correctly handle the case where h5pl can be imported but doesn't have the prepend() function available.
 76 | 
 77 | ## [3.1.5] 2020-06-15
 78 | ### Added
 79 | - Added explicit requirements and checks to prevent running on Python 2.
 80 | 
 81 | ## [3.1.4] 2020-06-12
 82 | ### Fixed
 83 | - Compression now works in `single_to_multi`.
 84 | 
 85 | ## [3.1.3] 2020-05-28
 86 | ### Fixed
 87 | - Compression argument in `fast5_subset` and `single_to_multi` failed if not set
 88 | 
 89 | ## [3.1.2] 2020-05-04
 90 | ### Fixed
 91 | - Compression argument in `fast5_subset` and `single_to_multi` was parsed incorrectly
 92 | 
 93 | ## [3.1.1] 2020-04-03
 94 | ### Fixed
 95 | - Argument list for `fast5_subset` and `single_to_multi` had a syntax error
 96 | 
 97 | ## [3.1.0] 2020-04-02
 98 | ### Added
 99 | - Hardlinking of metadata to prevent duplication and reduce filesize
100 | - Ability to enable compression when using `fast5_subset` and `single_to_multi`
101 | ### Fixed
102 | - `fast5_subset` thread pool could sometimes close before all tasks were completed
103 | - `fast5_subset` will create output directory if it doesn't exist
104 | 
105 | ## [3.0.2] 2020-03-17
106 | ### Fixed
107 | - Comparison of file_versions could throw an error
108 | 
109 | ## [3.0.1] 2020-01-29
110 | ### Fixed
111 | - Basecall1DTools could not load data from a Fast5Read
112 | 
113 | ## [3.0.0] 2020-01-20
114 | ### Removed
115 | - python2 compatibility
116 | ### Fixed
117 | - minor documentation errors: https://github.com/nanoporetech/ont_fast5_api/issues/28
118 | 
119 | ## [2.1.0] 2019-12-16 
120 | ### Added
121 | - Script to check the compression type of fast5 files in a folder
122 | - `compress_fast5` can now be used `--in_place`
123 | ### Fixed
124 | - Reading arrays with padded strings now succeeds (on h5py>2.7)
125 | - Compatibility bugs with h5py==2.6 now raises appropriate errors
126 | - Fast5File now has attribute read_id to match documentation
127 | ### Changed
128 | - Now use standard settings for gzip compression (gzip=1, shuffle=None)
129 | - Inverted dependency between `Fast5File` and `Fast5Read` so `Fast5Read` is now the primary object
130 | 
131 | ## [2.0.1] 2019-11-28
132 | ### Added
133 | - Option to `--ignore_symlinks` in fast5 conversion scripts
134 | - Explicit check to file_type for detemining single/multi-read files 
135 | ### Fixed
136 | - `fast5_subset` with single read fast5s was failing
137 | - unit test data now cleaned up properly
138 | 
139 | ## [2.0.0] 2019-11-19
140 | ### Added 
141 | - Compatibility for VBZ compressed reads
142 | - `compress_fast5` script for compressing/decompressing fast5 files
143 | - `get_reads()` helper method to more easily loop through reads in a fast5 file
144 | ### Changed
145 | - `Fast5File().get_raw_data()` updated interface to match `Fast5Read` and remove support for legacy files with multiple read numbers in a single `Fast5File`
146 | - Minimum depedency version requirements bumped. Set to Ubuntu16 `apt` python3-package defaults 
147 | ### Removed 
148 | - Legacy `Fast5Writer` object. `MultiReadFast5` or `EmptyFast5File` are preferred 
149 | 
150 | ## [1.4.9] 2019-11-01
151 | ### Added
152 | - Check for progressbar2 package and fail early if it's installed.
153 | 
154 | ## [1.4.8] 2019-10-22
155 | ### Added
156 | - Support for h5py==2.10 string data type encoding changes
157 | ### Fixed
158 | - Corrected some "for for" typos in argparse help text.
159 | 
160 | ## [1.4.7] 2019-07-29
161 | ### Fixed
162 | - Bug in read string and read_id concatenation resulted in broken output file
163 | 
164 | ## [1.4.6] 2019-07-03
165 | ### Added
166 | - Updated fast5_subset script to extract also from single-read fast5 files
167 | ### Changed
168 | - Renamed fast5_subset source script from multi_fast5_subset.py to fast5_subset.py
169 | 
170 | ## [1.4.5] 2019-07-01
171 | ### Fixed
172 | - Bug in number of processes being 0 when batch size is greater than number of reads (py2)
173 | 
174 | ## [1.4.4] 2019-06-18
175 | ### Fixed
176 | - Bug in path name output from pathlib changes
177 | 
178 | ## [1.4.3] 2019-06-12
179 | ### Fixed
180 | - Bug with apt-install and pathlib2
181 | 
182 | ## [1.4.2] 2019-06-10
183 | ### Fixed
184 | - get_raw_data() now works with scale=True when start,end are None
185 | 
186 | ## [1.4.1] 2019-06-06
187 | ### Added
188 | - Useful error message if no input files found
189 | ### Fixed
190 | - filename_mapping output gave incorrect filenames
191 | 
192 | ## [1.4.0] 2019-05-29
193 | ### Added
194 | - Script for extracting reads by id from `multi_read` files
195 | 
196 | ## [1.3.0] 2019-03-01
197 | ### Fixed
198 | - Bug in output to `filename_mapping.txt`
199 | 
200 | ## [1.2.0] 2019-01-11
201 | ### Added
202 | - Multi-threading support for multi<->single conversion for improved performance
203 | 
204 | ### Fixed
205 | - Removed incorrect license accidentally added to README
206 | 
207 | ## [1.1.1] 2019-01-10
208 | ### Changed
209 | - Minor documentation updates
210 | - Follow symlinks when finding files recursively
211 | 
212 | ## [1.1.0] 2019-01-07
213 | ### Added
214 | - Generic single- and multi- read interface via `get_fast5_file`
215 | 
216 | ### Fixed
217 | - Incorrect time estimates for single-multi conversion
218 | - Fixed path creation if not exist
219 | 
220 | ## [1.0.1] 2018-09-26
221 | ### Added
222 | - Support for multi-read fast5 files
223 | - Conversion tools for single-multi read files
224 | 
225 | ### Fixed
226 | - Support for deprecated interface to Basecall2D following 0.4.0, support will end in v1.x.x
227 | 
228 | 
229 | ## [0.4.0] 2017-07-16 (internal only)
230 | ### Fixed
231 | - Basecall1d and Basecall2d raise consistent KeyError when fastq data missing
232 | 
233 | ### Changed
234 | - Interface to Basecall1d and Basecall2d unified for add_sequence() and get_sequence()
235 | 
236 | 
237 | ## [0.3.3] 2017-06-23
238 | ### Added
239 | - Fast5 file now supports logging via 'Fast5File.add_log()'
240 | 
241 | ### Fixed
242 | - Invalid component names no longer checked against LEGACY_COMPENENTS
243 | - Raise KeyError when fastq data missing from Basecall1d
244 | - median_before and start_mux populate correctly with sensible defaults
245 | 
246 | 
247 | ## [0.3.2] 2017-03-22
248 | ### Added
249 | Major release - changes not logged before this point
250 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | # This file tells sdist which addional files to include in the distribution it builds.
 2 | # That distribution is used as the base for building the .deb with stdeb, and certain files
 3 | # (such as header files and .md files) are not included by default.
 4 | # See https://docs.python.org/2/distutils/sourcedist.html#manifest-template
 5 | 
 6 | include README.md
 7 | include LICENSE.md
 8 | prune test
 9 | prune build
10 | prune docs
11 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = build
  9 | VERSION       ?= unknown 
 10 | 
 11 | # Internal variables.
 12 | PAPEROPT_a4     = -D latex_paper_size=a4
 13 | PAPEROPT_letter = -D latex_paper_size=letter
 14 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) -D version=$(VERSION) -D release=$(VERSION) $(SPHINXOPTS) source
 15 | # the i18n builder cannot share the environment and doctrees with the others
 16 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
 17 | 
 18 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
 19 | 
 20 | help:
 21 | 	@echo "Please use \`make <target>' where <target> is one of"
 22 | 	@echo "  api        to autogenerate API documentation"
 23 | 	@echo "  html       to make standalone HTML files"
 24 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 25 | 	@echo "  singlehtml to make a single large HTML file"
 26 | 	@echo "  pickle     to make pickle files"
 27 | 	@echo "  json       to make JSON files"
 28 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 29 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 30 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 31 | 	@echo "  epub       to make an epub"
 32 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 33 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 34 | 	@echo "  text       to make text files"
 35 | 	@echo "  man        to make manual pages"
 36 | 	@echo "  texinfo    to make Texinfo files"
 37 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 38 | 	@echo "  gettext    to make PO message catalogs"
 39 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 40 | 	@echo "  linkcheck  to check all external links for integrity"
 41 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 42 | 
 43 | clean:
 44 | 	$(eval NON_INDEX_FILES := $(filter-out source/index.rst, $(wildcard source/*.rst)))
 45 | 	-rm -rf $(BUILDDIR)
 46 | 	mkdir $(BUILDDIR)
 47 | ifneq ($(NON_INDEX_FILES),)
 48 | 	rm $(NON_INDEX_FILES)
 49 | endif
 50 | 
 51 | api:
 52 | 	$(eval NON_INDEX_FILES := $(filter-out source/index.rst, $(wildcard source/*.rst)))
 53 | ifneq ($(NON_INDEX_FILES),)
 54 | 	rm $(NON_INDEX_FILES)
 55 | endif
 56 | 	sphinx-apidoc --no-toc -o source/ ..
 57 | 	rm source/test.rst
 58 | 	rm source/setup.rst
 59 | 	@echo
 60 | 	@echo "API gubbins generated in source directory for version $(VERSION)."
 61 | 
 62 | html:
 63 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 64 | 	@echo
 65 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 66 | 
 67 | dirhtml:
 68 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 69 | 	@echo
 70 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 71 | 
 72 | singlehtml:
 73 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 74 | 	@echo
 75 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 76 | 
 77 | pickle:
 78 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 79 | 	@echo
 80 | 	@echo "Build finished; now you can process the pickle files."
 81 | 
 82 | json:
 83 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 84 | 	@echo
 85 | 	@echo "Build finished; now you can process the JSON files."
 86 | 
 87 | htmlhelp:
 88 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 89 | 	@echo
 90 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 91 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 92 | 
 93 | qthelp:
 94 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 95 | 	@echo
 96 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 97 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 98 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/fast5_api.qhcp"
 99 | 	@echo "To view the help file:"
100 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/fast5_api.qhc"
101 | 
102 | devhelp:
103 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
104 | 	@echo
105 | 	@echo "Build finished."
106 | 	@echo "To view the help file:"
107 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/ont_fast5_api"
108 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/ont_fast5_api"
109 | 	@echo "# devhelp"
110 | 
111 | epub:
112 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
113 | 	@echo
114 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
115 | 
116 | latex:
117 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
118 | 	@echo
119 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
120 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
121 | 	      "(use \`make latexpdf' here to do that automatically)."
122 | 
123 | latexpdf:
124 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
125 | 	@echo "Running LaTeX files through pdflatex..."
126 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
127 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
128 | 
129 | text:
130 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
131 | 	@echo
132 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
133 | 
134 | man:
135 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
136 | 	@echo
137 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
138 | 
139 | texinfo:
140 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
141 | 	@echo
142 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
143 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
144 | 	      "(use \`make info' here to do that automatically)."
145 | 
146 | info:
147 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
148 | 	@echo "Running Texinfo files through makeinfo..."
149 | 	make -C $(BUILDDIR)/texinfo info
150 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
151 | 
152 | gettext:
153 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
154 | 	@echo
155 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
156 | 
157 | changes:
158 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
159 | 	@echo
160 | 	@echo "The overview file is in $(BUILDDIR)/changes."
161 | 
162 | linkcheck:
163 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
164 | 	@echo
165 | 	@echo "Link check complete; look for any errors in the above output " \
166 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
167 | 
168 | doctest:
169 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
170 | 	@echo "Testing of doctests in the sources finished, look at the " \
171 | 	      "results in $(BUILDDIR)/doctest/output.txt."
172 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # fast5_api documentation build configuration file, created by
  4 | # sphinx-quickstart on Fri Nov 21 09:32:46 2014.
  5 | #
  6 | # This file is execfile()d with the current directory set to its containing dir.
  7 | #
  8 | # Note that not all possible configuration values are present in this
  9 | # autogenerated file.
 10 | #
 11 | # All configuration values have a default; values that are commented out
 12 | # serve to show the default.
 13 | 
 14 | import sys, os
 15 | sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
 16 | sys.path.insert(0, os.path.abspath(os.path.join('..', '..', 'ont_fast5_api')))
 17 | 
 18 | # If extensions (or modules to document with autodoc) are in another directory,
 19 | # add these directories to sys.path here. If the directory is relative to the
 20 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 21 | #sys.path.insert(0, os.path.abspath('.'))
 22 | 
 23 | # -- General configuration -----------------------------------------------------
 24 | 
 25 | # If your documentation needs a minimal Sphinx version, state it here.
 26 | #needs_sphinx = '1.0'
 27 | 
 28 | # Add any Sphinx extension module names here, as strings. They can be extensions
 29 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 30 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode']
 31 | 
 32 | # Add any paths that contain templates here, relative to this directory.
 33 | templates_path = ['_templates']
 34 | 
 35 | # The suffix of source filenames.
 36 | source_suffix = '.rst'
 37 | 
 38 | # The encoding of source files.
 39 | #source_encoding = 'utf-8-sig'
 40 | 
 41 | # The master toctree document.
 42 | master_doc = 'index'
 43 | 
 44 | # General information about the project.
 45 | project = u'ont_fast5_api'
 46 | copyright = u'2016, Oxford Nanopore Technologies'
 47 | 
 48 | # The version info for the project you're documenting, acts as replacement for
 49 | # |version| and |release|, also used in various other places throughout the
 50 | # built documents.
 51 | #
 52 | # The short X.Y version.
 53 | version = '1.6.2'
 54 | # The full version, including alpha/beta/rc tags.
 55 | release = '1.6.2'
 56 | 
 57 | # The language for content autogenerated by Sphinx. Refer to documentation
 58 | # for a list of supported languages.
 59 | #language = None
 60 | 
 61 | # There are two options for replacing |today|: either, you set today to some
 62 | # non-false value, then it is used:
 63 | #today = ''
 64 | # Else, today_fmt is used as the format for a strftime call.
 65 | #today_fmt = '%B %d, %Y'
 66 | 
 67 | # List of patterns, relative to source directory, that match files and
 68 | # directories to ignore when looking for source files.
 69 | exclude_patterns = []
 70 | 
 71 | # The reST default role (used for this markup: `text`) to use for all documents.
 72 | #default_role = None
 73 | 
 74 | # If true, '()' will be appended to :func: etc. cross-reference text.
 75 | #add_function_parentheses = True
 76 | 
 77 | # If true, the current module name will be prepended to all description
 78 | # unit titles (such as .. function::).
 79 | #add_module_names = True
 80 | 
 81 | # If true, sectionauthor and moduleauthor directives will be shown in the
 82 | # output. They are ignored by default.
 83 | #show_authors = False
 84 | 
 85 | # The name of the Pygments (syntax highlighting) style to use.
 86 | pygments_style = 'sphinx'
 87 | 
 88 | # A list of ignored prefixes for module index sorting.
 89 | #modindex_common_prefix = []
 90 | 
 91 | 
 92 | # -- Options for HTML output ---------------------------------------------------
 93 | 
 94 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 95 | # a list of builtin themes.
 96 | html_theme = 'sphinxdoc'
 97 | 
 98 | # Theme options are theme-specific and customize the look and feel of a theme
 99 | # further.  For a list of options available for each theme, see the
100 | # documentation.
101 | #html_theme_options = {}
102 | 
103 | # Add any paths that contain custom themes here, relative to this directory.
104 | #html_theme_path = []
105 | 
106 | # The name for this set of Sphinx documents.  If None, it defaults to
107 | # "<project> v<release> documentation".
108 | #html_title = None
109 | 
110 | # A shorter title for the navigation bar.  Default is the same as html_title.
111 | #html_short_title = None
112 | 
113 | # The name of an image file (relative to this directory) to place at the top
114 | # of the sidebar.
115 | #html_logo = None
116 | 
117 | # The name of an image file (within the static path) to use as favicon of the
118 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
119 | # pixels large.
120 | #html_favicon = None
121 | 
122 | # Add any paths that contain custom static files (such as style sheets) here,
123 | # relative to this directory. They are copied after the builtin static files,
124 | # so a file named "default.css" will overwrite the builtin "default.css".
125 | html_static_path = ['_static']
126 | 
127 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
128 | # using the given strftime format.
129 | #html_last_updated_fmt = '%b %d, %Y'
130 | 
131 | # If true, SmartyPants will be used to convert quotes and dashes to
132 | # typographically correct entities.
133 | #html_use_smartypants = True
134 | 
135 | # Custom sidebar templates, maps document names to template names.
136 | #html_sidebars = {}
137 | 
138 | # Additional templates that should be rendered to pages, maps page names to
139 | # template names.
140 | #html_additional_pages = {}
141 | 
142 | # If false, no module index is generated.
143 | #html_domain_indices = True
144 | 
145 | # If false, no index is generated.
146 | #html_use_index = True
147 | 
148 | # If true, the index is split into individual pages for each letter.
149 | #html_split_index = False
150 | 
151 | # If true, links to the reST sources are added to the pages.
152 | #html_show_sourcelink = True
153 | 
154 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
155 | #html_show_sphinx = True
156 | 
157 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
158 | #html_show_copyright = True
159 | 
160 | # If true, an OpenSearch description file will be output, and all pages will
161 | # contain a <link> tag referring to it.  The value of this option must be the
162 | # base URL from which the finished HTML is served.
163 | #html_use_opensearch = ''
164 | 
165 | # This is the file name suffix for HTML files (e.g. ".xhtml").
166 | #html_file_suffix = None
167 | 
168 | # Output file base name for HTML help builder.
169 | htmlhelp_basename = 'fast5_api_doc'
170 | 
171 | 
172 | # -- Options for LaTeX output --------------------------------------------------
173 | 
174 | latex_elements = {
175 | # The paper size ('letterpaper' or 'a4paper').
176 | #'papersize': 'letterpaper',
177 | 
178 | # The font size ('10pt', '11pt' or '12pt').
179 | #'pointsize': '10pt',
180 | 
181 | # Additional stuff for the LaTeX preamble.
182 | #'preamble': '',
183 | }
184 | 
185 | # Grouping the document tree into LaTeX files. List of tuples
186 | # (source start file, target name, title, author, documentclass [howto/manual]).
187 | latex_documents = [
188 |   ('index', 'fast5_api.tex', u'fast5_api Documentation',
189 |    u'Kevin Dolan, Forrest Brennen', 'manual'),
190 | ]
191 | 
192 | # The name of an image file (relative to this directory) to place at the top of
193 | # the title page.
194 | #latex_logo = None
195 | 
196 | # For "manual" documents, if this is true, then toplevel headings are parts,
197 | # not chapters.
198 | #latex_use_parts = False
199 | 
200 | # If true, show page references after internal links.
201 | #latex_show_pagerefs = False
202 | 
203 | # If true, show URL addresses after external links.
204 | #latex_show_urls = False
205 | 
206 | # Documents to append as an appendix to all manuals.
207 | #latex_appendices = []
208 | 
209 | # If false, no module index is generated.
210 | #latex_domain_indices = True
211 | 
212 | 
213 | # -- Options for manual page output --------------------------------------------
214 | 
215 | # One entry per manual page. List of tuples
216 | # (source start file, name, description, authors, manual section).
217 | man_pages = [
218 |     ('index', 'ont_fast5_api', u'ont_fast5_api Documentation',
219 |      [u'Kevin Dolan, Forrest Brennen'], 1)
220 | ]
221 | 
222 | # If true, show URL addresses after external links.
223 | #man_show_urls = False
224 | 
225 | 
226 | # -- Options for Texinfo output ------------------------------------------------
227 | 
228 | # Grouping the document tree into Texinfo files. List of tuples
229 | # (source start file, target name, title, author,
230 | #  dir menu entry, description, category)
231 | texinfo_documents = [
232 |   ('index', 'ont_fast5_api', u'ont_fast5_api Documentation',
233 |    u'Kevin Dolan, Forrest Brennen', 'fast5_api', 'One line description of project.',
234 |    'Miscellaneous'),
235 | ]
236 | 
237 | # Documents to append as an appendix to all manuals.
238 | #texinfo_appendices = []
239 | 
240 | # If false, no module index is generated.
241 | #texinfo_domain_indices = True
242 | 
243 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
244 | #texinfo_show_urls = 'footnote'
245 | 
246 | # Included to display docstrings from class __init__() functions.
247 | autoclass_content = "both"
248 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. ont_fast5_api documentation master file, created by
 2 |    sphinx-quickstart on Fri Nov 21 09:32:46 2014.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | .. include:: ../../README.rst
 7 | 
 8 | Contents:
 9 | 
10 | .. toctree::
11 |    :maxdepth: 4
12 |    :glob:
13 | 
14 |    ont_fast5_api
15 | 
16 | 
17 | Indices and tables
18 | ==================
19 | 
20 | * :ref:`genindex`
21 | * :ref:`modindex`
22 | * :ref:`search`


--------------------------------------------------------------------------------
/img/ONT_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/img/ONT_logo.png


--------------------------------------------------------------------------------
/ont_fast5_api/__init__.py:
--------------------------------------------------------------------------------
 1 | __version__ = '4.1.3'
 2 | __version_info__ = tuple([int(num) for num in __version__.split('.')])
 3 | CURRENT_FAST5_VERSION = 2.0
 4 | 
 5 | import sys
 6 | if sys.version_info < (3,):
 7 |     raise ImportError(
 8 |     """ont-fast5-api requires Python 3.7
 9 | 
10 |     Somehow you have ended up running this on Python 2, which reached its end of
11 |     life in 2019. Apologies! To avoid this issue, either:
12 | 
13 |     - Upgrade to Python 3, or
14 | 
15 |     - Download an older ont-fast5-api version:
16 | 
17 |     $ pip install 'ont-fast5-api<3.0'
18 | 
19 |     Note that you will be missing features and bug fixes by running older versions
20 |     of ont-fast5-api.
21 | 
22 |     """)
23 | 
24 | # Set up a default NullHandler in case we don't end up using another one
25 | # Taken from http://docs.python-guide.org/en/latest/writing/logging/
26 | import logging
27 | logging.getLogger(__name__).addHandler(logging.NullHandler())
28 | 
29 | from ont_fast5_api.compression_settings import register_plugin
30 | register_plugin()
31 | 


--------------------------------------------------------------------------------
/ont_fast5_api/analysis_tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/ont_fast5_api/analysis_tools/__init__.py


--------------------------------------------------------------------------------
/ont_fast5_api/analysis_tools/alignment.py:
--------------------------------------------------------------------------------
  1 | """ Helper class for working with alignment type analyses.
  2 | """
  3 | import numpy as np
  4 | 
  5 | from ont_fast5_api.analysis_tools.base_tool import BaseTool
  6 | from ont_fast5_api.fast5_file import Fast5File
  7 | from ont_fast5_api.analysis_tools.segmentation import SegmentationTools
  8 | from ont_fast5_api.fast5_read import Fast5Read
  9 | 
 10 | 
 11 | class AlignmentTools(BaseTool):
 12 |     """ Provides helper methods specific to alignment analyses.
 13 |     """
 14 |     
 15 |     def __init__(self, source, mode='r', group_name=None, meta=None, config=None):
 16 |         """ Create a new alignment tools object.
 17 |         
 18 |         :param source: Either an open Fast5File object, or a filename
 19 |             of a fast5 file.
 20 |         :param mode: The open mode (r or r+). Only if a filename is used
 21 |             for the source argument.
 22 |         :param group_name: The specific alignment analysis instance
 23 |             you are interested in.
 24 |         :param meta: Metadata for a new alignment analysis.
 25 |         :param config: Configuration data for a new alignment analysis.
 26 |         
 27 |         To create a new alignment analysis, provide a group name that
 28 |         does not already exist, and an optional dictionary with the metadata.
 29 |         The following fields are recommended, as a minimum:
 30 |             
 31 |             * name - The name of the basecall software used.
 32 |             * time_stamp - The time at which the analysis was performed.
 33 |         
 34 |         If the group name already exists, the "meta" parameter is ignored. If
 35 |         the specified group has a "component" attribute, and its value is not
 36 |         "alignment", an exception will be thrown.
 37 |         """
 38 |         if isinstance(source, Fast5Read):
 39 |             self.handle = source
 40 |             self.close_handle_when_done = False
 41 |         elif isinstance(source, str):
 42 |             self.handle = Fast5File(source, mode)
 43 |             self.close_handle_when_done = True
 44 |         else:
 45 |             raise Exception('Unrecognized type for argument "source".')
 46 |         if group_name is None:
 47 |             group_name = self.handle.get_latest_analysis('Alignment')
 48 |             if group_name is None:
 49 |                 raise Exception('No Alignment analysis group found in file.')
 50 |         self.group_name = group_name
 51 |         attrs = self.handle.get_analysis_attributes(group_name)
 52 |         if attrs is None:
 53 |             if meta is None:
 54 |                 meta = {}
 55 |             self.handle.add_analysis('alignment', group_name, meta, config)
 56 |             attrs = self.handle.get_analysis_attributes(group_name)
 57 |         if ('component' in attrs
 58 |                 and attrs['component'] not in ['alignment',
 59 |                                                'calibration_strand']):
 60 |             self.close()
 61 |             raise Exception('Analysis does not appear to be an alignment component.')
 62 | 
 63 |     def get_results(self):
 64 |         """ Get details about the alignments that have been performed.
 65 | 
 66 |         :return: A dict of dicts.
 67 | 
 68 |         The keys of the top level are 'template', 'complement' and '2d'.
 69 |         Each of these dicts contains the following fields:
 70 | 
 71 |             * status: Can be 'no data', 'no match found', or 'match found'.
 72 |             * direction: Can be 'forward', 'reverse'.
 73 |             * ref_name: Name of reference.
 74 |             * ref_span: Section of reference aligned to, as a tuple (start, end).
 75 |             * seq_span: Section of the called sequence that aligned, as a tuple (start, end).
 76 |             * seq_len: Total length of the called sequence.
 77 |             * num_aligned: Number of bases that aligned to bases in the reference.
 78 |             * num_correct: Number of aligned bases that match the reference.
 79 |             * num_deletions: Number of bases in the aligned section of the
 80 |                 reference that are not aligned to bases in the called sequence.
 81 |             * num_insertions: Number of bases in the aligned section of the called
 82 |                 sequence that are not aligned to bases in the reference.
 83 |             * identity: The fraction of aligned bases that are correct (num_correct /
 84 |                 num_aligned).
 85 |             * accuracy: The overall basecall accuracy, according to the alignment.
 86 |                 (num_correct / (num_aligned + num_deletions + num_insertions)).
 87 |         
 88 |         Note that if the status field is not 'match found', then all the other
 89 |         fields will be absent.
 90 |         """
 91 |         summary = self.handle.get_summary_data(self.group_name)
 92 |         results = {'template': {'status': 'no data'},
 93 |                    'complement': {'status': 'no data'},
 94 |                    '2d': {'status': 'no data'}}
 95 |         if 'genome_mapping_template' in summary:
 96 |             results['template'] = self._get_results(summary['genome_mapping_template'])
 97 |         if 'genome_mapping_complement' in summary:
 98 |             results['complement'] = self._get_results(summary['genome_mapping_complement'])
 99 |         if 'genome_mapping_2d' in summary:
100 |             results['2d'] = self._get_results(summary['genome_mapping_2d'])
101 |         return results
102 | 
103 |     def get_alignment_data(self, section):
104 |         """ Get the alignment SAM and Fasta, if present.
105 |         
106 |         :param section: Can be 'template', 'complement', or '2d'.
107 |         :return: A tuple containing the SAM and the section of the reference
108 |             aligned to (both as strings). Returns None if no alignment is
109 |             present for that section.
110 |         """
111 |         subgroup = '{}/Aligned_{}'.format(self.group_name, section)
112 |         sam = self.handle.get_analysis_dataset(subgroup, 'SAM')
113 |         fasta = self.handle.get_analysis_dataset(subgroup, 'Fasta')
114 |         if sam is None or fasta is None:
115 |             return None
116 |         sequence = fasta.split('\n')[1]
117 |         return sam, sequence
118 | 
119 |     def add_alignment_data(self, section, sam, sequence):
120 |         """ Add the SAM and Fasta alignment data for a section.
121 |         
122 |         :param section: Can be 'template', 'complement', or '2d'.
123 |         :param sam: A string containing the SAM contents.
124 |         :param sequence: A string containing the section of the
125 |             reference the basecall aligned to.
126 |         """
127 |         subgroup = 'Aligned_{}'.format(section)
128 |         if not subgroup in self.handle.handle['Analyses/{}'.format(self.group_name)]:
129 |             self.handle.add_analysis_subgroup(self.group_name, subgroup)
130 |         sam_arr = np.array(sam, dtype=str)
131 |         self.handle.add_analysis_dataset('{}/{}'.format(self.group_name, subgroup), 'SAM', sam_arr)
132 |         fasta_arr = np.array('>{}\n{}\n'.format(section, sequence), dtype=str)
133 |         self.handle.add_analysis_dataset('{}/{}'.format(self.group_name, subgroup), 'Fasta', fasta_arr)
134 | 
135 |     def calculate_speed(self, section, alignment_results=None):
136 |         """ Calculate speed using alignment information.
137 | 
138 |         :param section: The section (template or complement) we're calculating
139 |             speed for.
140 |         :param alignment_results: Optional dictionary of the alignment summary,
141 |             so that speed can be calculated without having to write the summary
142 |             out to the fast5 file first.
143 |         :return: Speed in bases per second or zero if the speed could not be
144 |             calculated.
145 | 
146 |         The only reliable way we have of finding out how many bases have gone through the pore is by
147 |         looking at how much of the reference the sequence aligned to. This takes that information and
148 |         uses it to calculate speed in reference-bases-per-second.
149 |         """
150 |         speed = 0.0
151 |         if alignment_results:
152 |             results = self._get_results(alignment_results)
153 |         else:
154 |             results = self.get_results()[section]
155 |         if results['status'] != 'match found':
156 |             return 0.0
157 |         ref_span = results['ref_span']
158 |         ref_len = ref_span[1] - ref_span[0]
159 |         seq_span = results['seq_span']
160 |         seq_len = seq_span[1] - seq_span[0]
161 |         total_len = results['seq_len']
162 | 
163 |         sample_rate = self.handle.get_channel_info()['sampling_rate']
164 | 
165 |         # We need the duration from the segmentation results
166 |         chain = self.handle.get_chain(self.group_name)
167 |         if chain is not None:
168 |             segmentation_group = dict(chain).get('segmentation')
169 |         else:
170 |             segmentation_group = None
171 |         duration = 0
172 |         if segmentation_group is not None:
173 |             with SegmentationTools(self.handle, group_name=segmentation_group) as seg:
174 |                 summary = seg.get_results()
175 |                 if summary is not None:
176 |                     duration = summary['duration_{}'.format(section)]
177 |         if duration == 0:
178 |             return 0.0
179 | 
180 |         normalized_duration = duration * seq_len / float(total_len)
181 |         speed = sample_rate * ref_len / normalized_duration
182 |         return speed
183 | 
184 |     ##########################
185 |     #
186 |     #  Private methods below
187 |     #
188 |     ##########################
189 | 
190 |     def _get_results(self, summary):
191 |         results = {'status': 'no data'}
192 |         ref_name = summary['genome']
193 |         if ref_name == 'no_match':
194 |             results['status'] = 'no match found'
195 |             return results
196 |         results['status'] = 'match found'
197 |         results['direction'] = 'forward'
198 |         if ref_name.endswith('_rc'):
199 |             ref_name = ref_name[:-3]
200 |             results['direction'] = 'reverse'
201 |         results['ref_name'] = ref_name
202 |         results['ref_span'] = (summary['genome_start'], summary['genome_end'])
203 |         results['seq_span'] = (summary['strand_start'], summary['strand_end'])
204 |         results['seq_len'] = summary['num_events']
205 |         results.update({key: summary[key] for key in ['num_aligned', 'num_correct', 'num_insertions',
206 |                                                       'num_deletions', 'identity', 'accuracy']})
207 |         return results
208 | 


--------------------------------------------------------------------------------
/ont_fast5_api/analysis_tools/base_tool.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | from abc import abstractmethod
 3 | 
 4 | from ont_fast5_api.fast5_file import Fast5File, Fast5FileTypeError
 5 | from ont_fast5_api.fast5_read import Fast5Read
 6 | 
 7 | 
 8 | class BaseTool(object):
 9 |     @property
10 |     def group_id(self):
11 |         raise NotImplementedError("BaseTool does not have a group_id")
12 | 
13 |     @property
14 |     def analysis_id(self):
15 |         raise NotImplementedError("BaseTool does not have a analysis_id")
16 | 
17 |     def __init__(self, source, mode='r', group_name=None, meta=None, config=None):
18 |         """ Create a new analysis_tools object.
19 | 
20 |         :param source: Either an open Fast5File object, or a filename
21 |             of a fast5 file.
22 |         :param mode: The open mode (r or r+). Only if a filename is used
23 |             for the source argument.
24 |         :param group_name: The specific analysis instance you are interested in.
25 |         :param meta: Metadata for a new analysis.
26 |         :param config: Configuration data for a new analysis.
27 | 
28 |         To create a new analysis group, provide a group name that
29 |         does not already exist, and an optional dictionary with the metadata.
30 |         The following fields are recommended, as a minimum:
31 | 
32 |             * name - The name of the software used.
33 |             * time_stamp - The time at which the analysis was performed.
34 | 
35 |         If the group name already exists, the "meta" parameter is ignored. If
36 |         the specified group has a "component" attribute, and its value does not
37 |         match self.analysis_id, an exception will be thrown.
38 |         """
39 |         if isinstance(source, Fast5Read):
40 |             self.filename = source.filename  # Useful for debugging purposes
41 |             self.handle = source
42 |             self.close_handle_when_done = False
43 |         elif isinstance(source, str):
44 |             self.filename = source  # Useful for debugging purposes
45 |             try:
46 |                 self.handle = Fast5File(source, mode)
47 |             except Fast5FileTypeError :
48 |                 raise NotImplementedError("AnalysisTools do not support accessing MultiReadFast5 files by filepath")
49 |             self.close_handle_when_done = True
50 |         else:
51 |             raise KeyError('Unrecognized type for argument "source": {}'.format(source))
52 |         if group_name is None:
53 |             group_name = self.handle.get_latest_analysis(self.group_id)
54 |             if group_name is None:
55 |                 raise KeyError('No group: {} found in file: {}'.format(group_name, self.filename))
56 |         self.group_name = group_name
57 |         attrs = self.handle.get_analysis_attributes(group_name)
58 | 
59 |         if attrs is None:
60 |             self.handle.add_analysis(self.analysis_id, group_name, meta, config)
61 |             attrs = self.handle.get_analysis_attributes(group_name)
62 |         if 'component' in attrs and attrs['component'] != self.analysis_id:
63 |             raise ValueError('Component {} is not {}'.format(attrs.get('component'), self.analysis_id))
64 | 
65 |     def __enter__(self):
66 |         return self
67 | 
68 |     def __exit__(self, exception_type, exception_value, traceback):
69 |         self.close()
70 |         return False
71 | 
72 |     def close(self):
73 |         """ Closes the object.
74 |         """
75 |         if self.handle and self.close_handle_when_done:
76 |             self.handle.close()
77 | 


--------------------------------------------------------------------------------
/ont_fast5_api/analysis_tools/basecall_1d.py:
--------------------------------------------------------------------------------
 1 | """ Helper class for working with 1D basecall type analyses.
 2 | """
 3 | import numpy as np
 4 | 
 5 | from ont_fast5_api.analysis_tools.base_tool import BaseTool
 6 | 
 7 | 
 8 | class Basecall1DTools(BaseTool):
 9 |     """ Provides helper methods specific to 1D basecall analyses.
10 |     """
11 |     group_id = 'Basecall_1D'
12 |     analysis_id = 'basecall_1d'
13 | 
14 | 
15 |     def get_event_data(self, section):
16 |         """ Return either the template or complement event data, if present.
17 |         
18 |         :param section: Either template or complement.
19 |         :return: Event data table.
20 |         """
21 |         event_group = '{}/BaseCalled_{}'.format(self.group_name, section)
22 |         data = self.handle.get_analysis_dataset(event_group, 'Events')
23 |         return data
24 | 
25 |     def add_event_data(self, section, data):
26 |         """ Add template or complement basecalled event data.
27 |         
28 |         :param section: Either template or complement.
29 |         :param data: Event data table to be written.
30 |         """
31 |         event_group = 'BaseCalled_{}'.format(section)
32 |         if not event_group in self.handle.handle['Analyses/{}'.format(self.group_name)]:
33 |             self.handle.add_analysis_subgroup(self.group_name, event_group)
34 |         self.handle.add_analysis_dataset('{}/{}'.format(self.group_name, event_group), 'Events', data)
35 | 
36 |     def get_called_sequence(self, section, fastq=False):
37 |         """ Return either the called sequence data, if present.
38 |         
39 |         :param section: ['template', 'complement' or '2D']
40 |         :param fastq: If True, return a single, multiline fastq string. If
41 |             False, return a tuple of (name, sequence, qstring).
42 |         :return: Either the fastq string or the (name, sequence, qstring) tuple.
43 |         """
44 | 
45 |         event_group = '{}/BaseCalled_{}'.format(self.group_name, section)
46 |         data = self.handle.get_analysis_dataset(event_group, 'Fastq')
47 |         if data is None:
48 |             raise KeyError("No fastq data in: {} {}".format(event_group, self.filename))
49 |         if fastq:
50 |             return data
51 |         name, sequence, _, qstring = data.strip().split('\n')
52 |         name = name[1:]
53 |         return name, sequence, qstring
54 | 
55 |     def add_called_sequence(self, section, name, sequence, qstring):
56 |         """ Add basecalled sequence data
57 |         
58 |         :param section: ['template', 'complement' or '2D']
59 |         :param name: The record ID to use for the fastq.
60 |         :param sequence: The called sequence.
61 |         :param qstring: The quality string.
62 |         """
63 |         event_group = 'BaseCalled_{}'.format(section)
64 |         if not event_group in self.handle.handle['Analyses/{}'.format(self.group_name)]:
65 |             self.handle.add_analysis_subgroup(self.group_name, event_group)
66 |         fastq_text = '@{}\n{}\n+\n{}\n'.format(name, sequence, qstring)
67 |         fastq_arr = np.array(fastq_text, dtype=str)
68 |         self.handle.add_analysis_dataset('{}/{}'.format(self.group_name, event_group), 'Fastq', fastq_arr)
69 | 


--------------------------------------------------------------------------------
/ont_fast5_api/analysis_tools/basecall_2d.py:
--------------------------------------------------------------------------------
 1 | """ Helper class for working with 2D basecall type analyses.
 2 | """
 3 | import warnings
 4 | from ont_fast5_api.analysis_tools.basecall_1d import Basecall1DTools
 5 | 
 6 | 
 7 | class Basecall2DTools(Basecall1DTools):
 8 |     """ Provides helper methods specific to 2D basecall analyses.
 9 |     """
10 | 
11 |     group_id = 'Basecall_2D'
12 |     analysis_id = 'basecall_2d'
13 | 
14 |     def get_prior_alignment(self):
15 |         """ Return the prior alignment that was used for 2D basecalling.
16 | 
17 |         :return: Alignment data table.
18 |         """
19 |         data_group = '{}/HairpinAlign'.format(self.group_name)
20 |         data = self.handle.get_analysis_dataset(data_group, 'Alignment')
21 |         return data
22 | 
23 |     def get_2d_call_alignment(self):
24 |         """ Return the alignment and model_states from the 2D basecall.
25 | 
26 |         :return: Alignment data table.
27 |         """
28 |         data_group = '{}/BaseCalled_2D'.format(self.group_name)
29 |         data = self.handle.get_analysis_dataset(data_group, 'Alignment')
30 |         return data
31 | 
32 |     def add_prior_alignment(self, data):
33 |         """ Add template or complement basecalled event data.
34 |         
35 |         :param data: Alignment table to be written.
36 |         """
37 |         path = 'Analyses/{}'.format(self.group_name)
38 |         if 'HairpinAlign' not in self.handle.handle[path]:
39 |             self.handle.add_analysis_subgroup(self.group_name, 'HairpinAlign')
40 | 
41 |         path = '{}/HairpinAlign'.format(self.group_name)
42 |         self.handle.add_analysis_dataset(path, 'Alignment', data)
43 | 
44 |     def add_2d_call_alignment(self, data):
45 |         """ Add the alignment and model_state data table..
46 |         
47 |         :param data: Alignment and model_state table to be written.
48 |         """
49 |         path = 'Analyses/{}'.format(self.group_name)
50 |         if 'BaseCalled_2D' not in self.handle.handle[path]:
51 |             self.handle.add_analysis_subgroup(self.group_name, 'BaseCalled_2D')
52 | 
53 |         path = '{}/BaseCalled_2D'.format(self.group_name)
54 |         self.handle.add_analysis_dataset(path, 'Alignment', data)
55 | 
56 |     def get_called_sequence(self, section=None, fastq=False):
57 |         """ Return either the called sequence data, if present.
58 |         :param section: ['template', 'complement' or '2D']
59 |         :param fastq: If True, return a single, multiline fastq string. If
60 |             False, return a tuple of (name, sequence, qstring).
61 |         :return: Either the fastq string or the (name, sequence, qstring) tuple.
62 |         """
63 |         if section != "2D":
64 |             warnings.warn("Basecall2DTools.get_called_sequence() should specify section='2D'", DeprecationWarning)
65 |             # Backwards compatibilty to 0.3.3, if no "2D" section, bump args by 1 and pass to super
66 |             if section == None:
67 |                 # We assume that a named arg or no-arg was given
68 |                 return super(Basecall2DTools, self).get_called_sequence("2D", fastq)
69 |             # We assume that a single unnamed arg was given for fastq
70 |             return super(Basecall2DTools, self).get_called_sequence("2D", section)
71 |         return super(Basecall2DTools, self).get_called_sequence(section, fastq)
72 | 


--------------------------------------------------------------------------------
/ont_fast5_api/analysis_tools/event_detection.py:
--------------------------------------------------------------------------------
  1 | """ Helper class for working with event detection type analyses.
  2 | """
  3 | import numpy as np
  4 | 
  5 | from ont_fast5_api.analysis_tools.base_tool import BaseTool
  6 | 
  7 | 
  8 | class EventDetectionTools(BaseTool):
  9 |     """ Provides helper methods specific to event detection analyses.
 10 |     """
 11 | 
 12 |     group_id = 'EventDetection'
 13 |     analysis_id = 'event_detection'
 14 | 
 15 |     def set_event_data(self, data, read_attrs):
 16 |         """ Set event data with the specied attributes.
 17 |         
 18 |         :param data: Event data table.
 19 |         :param read_attrs: Attributes to put on the read group. This must include
 20 |             the read_number, which must refer to a read present in the object. The
 21 |             attributes should not include the standard read attributes:
 22 | 
 23 |              * read_id
 24 |              * start_time
 25 |              * duration
 26 |              * start_mux
 27 | 
 28 |             Those will be pulled from the read information already present in the
 29 |             object for the specified read.
 30 |         """
 31 |         if self.handle.mode == 'r':
 32 |             raise Exception('File is not open for writing.')
 33 |         read_number = read_attrs['read_number']
 34 |         read_group = '{}/Reads/Read_{}'.format(self.group_name, read_number)
 35 |         read_info = self.handle.status.read_info
 36 |         read_number_map = self.handle.status.read_number_map
 37 |         index = read_number_map.get(read_number)
 38 |         if index is None:
 39 |             raise Exception('Cannot add event detection data for a read that does not exist.')
 40 |         info = read_info[index]
 41 |         read_attrs.update({'read_id': info.read_id,
 42 |                            'start_time': info.start_time,
 43 |                            'duration': info.duration,
 44 |                            'start_mux': info.start_mux,
 45 |                            'median_before': info.median_before})
 46 |         attrs = self.handle.get_analysis_attributes(read_group)
 47 |         if attrs is None:
 48 |             self.handle.add_analysis_subgroup(self.group_name, 'Reads/Read_{}'.format(read_number),
 49 |                                               attrs=read_attrs)
 50 |             self.handle.add_analysis_dataset(read_group, 'Events', data)
 51 |         else:
 52 |             raise Exception('Event detection data already exists for this analysis and read.')
 53 | 
 54 |     def get_event_data(self, read_number=None, time_in_seconds=False):
 55 |         """ Get event data for the specified (or only) read.
 56 |         
 57 |         :param read_number: The read number to grab event data for. If this
 58 |             is None, and there is only one read, it will grab event data for
 59 |             that read.
 60 |         :param time_in_seconds: If True, this will convert (if necessary) the
 61 |             start and length fields from samples to seconds. If they are already
 62 |             in seconds, this option has no effect.
 63 |         :return: A tuple containing the event data, and the read attributes.
 64 |         """
 65 |         read_info = self.handle.status.read_info
 66 |         if read_number is None:
 67 |             if len(read_info) != 1:
 68 |                 raise Exception('Must specify a read number if there is not exactly 1 read.')
 69 |             read_number = read_info[0].read_number
 70 |         else:
 71 |             read_numbers = [info.read_number for info in read_info]
 72 |             if read_number not in read_numbers:
 73 |                 raise Exception('Specified read does not exist.')
 74 |         group = '{}/Reads/Read_{}'.format(self.group_name, read_number)
 75 |         attrs = self.handle.get_analysis_attributes(group)
 76 |         dataset = self.handle.get_analysis_dataset(group, 'Events', skip_decoding=True)
 77 |         if dataset is None:
 78 |             raise Exception('Read number {} has no event data.'.format(read_number))
 79 |         if time_in_seconds and dataset['start'].dtype.kind in ['i', 'u']:
 80 |             channel_info = self.handle.get_channel_info()
 81 |             sample_size = 1.0 / channel_info['sampling_rate']
 82 |             descr = [(x[0], 'float64') if x[0] in ('start', 'length') else x
 83 |                      for x in dataset.dtype.descr]
 84 |             data = dataset.astype(np.dtype(descr))[()]
 85 |             data['start'] *= sample_size
 86 |             data['length'] *= sample_size
 87 |         else:
 88 |             data = dataset[()]
 89 |         return data, attrs
 90 | 
 91 |     def has_event_data(self, read_number=None):
 92 |         """ Find out if the specified (or only) read has event data.
 93 | 
 94 |         :param read_number: The read number to check for event data. If this
 95 |             is ``None``, and there is only one read, it will check that read.
 96 |         :returns: True if event data exists for the read number.
 97 |         """
 98 |         read_info = self.handle.status.read_info
 99 |         if read_number is None:
100 |             if len(read_info) != 1:
101 |                 raise Exception('Must specify a read number if there is not exactly 1 read.')
102 |             read_number = read_info[0].read_number
103 |         else:
104 |             read_numbers = [info.read_number for info in read_info]
105 |             if read_number not in read_numbers:
106 |                 raise Exception('Specified read does not exist.')
107 |         group = '{}/Reads/Read_{}'.format(self.group_name, read_number)
108 |         dataset = self.handle.get_analysis_dataset(group, 'Events', skip_decoding=True)
109 |         return dataset is not None
110 | 
111 |     ##########################
112 |     #
113 |     #  Private methods below
114 |     #
115 |     ##########################
116 | 
117 |     def _new_analysis(self, meta, config):
118 |         if self.handle.mode == 'r':
119 |             raise Exception('Cannot create new event detection group. File is not open for writing.')
120 |         self.handle.add_analysis('event_detection', self.group_name, meta, config)
121 |         self.handle.add_analysis_subgroup(self.group_name, 'Reads')
122 | 


--------------------------------------------------------------------------------
/ont_fast5_api/analysis_tools/segmentation.py:
--------------------------------------------------------------------------------
  1 | """ Helper class for working with segmentation type analyses.
  2 | """
  3 | import numpy as np
  4 | 
  5 | from ont_fast5_api.analysis_tools.base_tool import BaseTool
  6 | from ont_fast5_api.analysis_tools.event_detection import EventDetectionTools
  7 | 
  8 | 
  9 | class SegmentationTools(BaseTool):
 10 |     """ Provides helper methods specific to segmentation analyses.
 11 |     """
 12 |     group_id = 'Segmentation'
 13 |     analysis_id = 'segmentation'
 14 | 
 15 |     def get_results(self):
 16 |         """ Returns the segmentation summary data.
 17 |         
 18 |         This data is normalized, to eliminate differences in what is stored
 19 |         for different types of segmentation analyses.
 20 |         
 21 |         The following fields are output:
 22 | 
 23 |         * has_template - True if the segmentation found template data.
 24 |         * has_complement - True if the segmentation found complement data.
 25 |         * first_sample_template - The first sample of the template data in
 26 |             the raw data. Only present if has_template is True.
 27 |         * duration_template - The duration (in samples) of the template
 28 |             data. Only present if has_template is True.
 29 |         * first_sample_complement - The first sample of the complement data
 30 |             in the raw data. Only present if has_complement is True.
 31 |         * duration_complement - The duration (in samples) of the complement
 32 |             data. Only present if has_complement is True.
 33 |             
 34 |         """
 35 |         summary = self._get_summary_data()
 36 |         if summary is None:
 37 |             results = {'has_template': False,
 38 |                        'has_complement': False}
 39 |         else:
 40 |             results = {}
 41 |             if 'has_template' in summary:
 42 |                 results['has_template'] = bool(summary['has_template'])
 43 |             else:
 44 |                 results['has_template'] = True if summary['num_temp'] > 0 else False
 45 |             if 'has_complement' in summary:
 46 |                 results['has_complement'] = bool(summary['has_complement'])
 47 |             else:
 48 |                 results['has_complement'] = True if summary['num_comp'] > 0 else False
 49 |             need_raw_info = False
 50 |             if results['has_template']:
 51 |                 if 'start_index_temp' in summary:
 52 |                     summary['start_event_template'] = summary['start_index_temp']
 53 |                     summary['end_event_template'] = summary['end_index_temp']
 54 |                 if 'first_sample_template' not in summary:
 55 |                     need_raw_info = True
 56 |             if results['has_complement']:
 57 |                 if 'start_index_comp' in summary:
 58 |                     summary['start_event_complement'] = summary['start_index_comp']
 59 |                     summary['end_event_complement'] = summary['end_index_comp']
 60 |                 if 'first_sample_complement' not in summary:
 61 |                     need_raw_info = True
 62 |             if need_raw_info:
 63 |                 self._get_raw_info(summary)
 64 |             if results['has_template']:
 65 |                 results['first_sample_template'] = summary['first_sample_template']
 66 |                 results['duration_template'] = summary['duration_template']
 67 |                 if 'start_event_template' in summary:
 68 |                     results['start_event_template'] = summary['start_event_template']
 69 |                     results['end_event_template'] = summary['end_event_template']
 70 |             if results['has_complement']:
 71 |                 results['first_sample_complement'] = summary['first_sample_complement']
 72 |                 results['duration_complement'] = summary['duration_complement']
 73 |                 if 'start_event_complement' in summary:
 74 |                     results['start_event_complement'] = summary['start_event_complement']
 75 |                     results['end_event_complement'] = summary['end_event_complement']
 76 |         return results
 77 | 
 78 |     def get_event_data(self, section, time_in_seconds=False):
 79 |         """ Get the template or complement event data.
 80 |         
 81 |         :param section: Either template, complement, or both.
 82 |         :param time_in_seconds: Return the start and length fields
 83 |             in seconds, rather than samples.
 84 |         :return: The event dataset for the section. If section=both
 85 |             then it returns a tuple with both sections. Returns None
 86 |             if the section does not exist.
 87 |         """
 88 |         if section not in ['template', 'complement', 'both']:
 89 |             raise Exception('Unrecognized section: {} Expected: "template", "complement" or "both"'.format(section))
 90 |         results = self.get_results()
 91 |         if results is None:
 92 |             return None, None if section is 'both' else None
 93 |         if section == 'both':
 94 |             sections = ['template', 'complement']
 95 |         else:
 96 |             sections = [section]
 97 |         evdet_group, _ = self._find_event_data()
 98 |         with EventDetectionTools(self.handle, group_name=evdet_group) as evdet:
 99 |             event_data, _ = evdet.get_event_data(time_in_seconds=time_in_seconds)
100 |         datasets = [None, None]
101 |         for n, this_section in enumerate(sections):
102 |             if not results['has_{}'.format(this_section)]:
103 |                 continue
104 |             ev1 = results['start_event_{}'.format(this_section)]
105 |             ev2 = results['end_event_{}'.format(this_section)]
106 |             datasets[n] = event_data[ev1:ev2]
107 |         if section == 'both':
108 |             return tuple(datasets)
109 |         return datasets[0]
110 | 
111 |     def get_raw_data(self, section, scale=False):
112 |         """ Get the template or complement raw data.
113 |         
114 |         :param section: Either template, complement, or both.
115 |         :param scale: Scale the raw data to pA.
116 |         :return:  The raw data for the section. If section=both
117 |             then it returns a tuple with both sections. Returns None
118 |             if the section does not exist.
119 |         """
120 |         results = self.get_results()
121 |         datasets = [None, None]
122 |         if section == 'both':
123 |             sections = ['template', 'complement']
124 |         else:
125 |             sections = [section]
126 |         for n, this_section in enumerate(sections):
127 |             if not results['has_{}'.format(this_section)]:
128 |                 continue
129 |             start = results['first_sample_{}'.format(this_section)]
130 |             dur = results['duration_{}'.format(this_section)]
131 |             datasets[n] = self.handle.get_raw_data(start=start, end=start+dur, scale=scale)
132 |         if section == 'both':
133 |             return tuple(datasets)
134 |         return datasets[0]
135 | 
136 | 
137 |     ##########################
138 |     #
139 |     #  Private methods below
140 |     #
141 |     ##########################
142 |     
143 |     def _get_summary_data(self):
144 |         summary = self.handle.get_summary_data(self.group_name)
145 |         if summary is None:
146 |             return None
147 |         if 'segmentation' in summary:
148 |             results = summary['segmentation']
149 |         elif 'split_hairpin' in summary:
150 |             results = summary['split_hairpin']
151 |         else:
152 |             results = None
153 |         return results
154 | 
155 |     def _find_event_data(self):
156 |         attrs = self.handle.get_analysis_attributes(self.group_name)
157 |         evdet_group = attrs.get('event_detection')
158 |         if evdet_group is None:
159 |             evdet_group = self.handle.get_latest_analysis('EventDetection')
160 |         else:
161 |             evdet_group = evdet_group[9:]
162 |         if evdet_group is None:
163 |             return None
164 |         # We directly use the Fast5Read interface here, rather than the
165 |         # EventDetectionTools one, because we don't want to load the entire
166 |         # event table into memory.
167 |         read_info = self.handle.status.read_info[0] # We assume only one read.
168 |         read_number = read_info.read_number
169 |         event_table_group = '{}/Reads/Read_{}'.format(evdet_group, read_number)
170 |         dataset = self.handle.get_analysis_dataset(event_table_group, 'Events', skip_decoding=True)
171 |         return evdet_group, dataset
172 | 
173 |     def _get_raw_info(self, summary):
174 |         _, dataset = self._find_event_data()
175 |         read_info = self.handle.status.read_info[0] # We assume only one read.
176 |         if dataset is None:
177 |             summary['first_sample_template'] = None
178 |             summary['duration_template'] = None
179 |             summary['first_sample_complement'] = None
180 |             summary['duration_complement'] = None
181 |             return
182 |         if summary.get('start_event_template', -1) >= 0:
183 |             ev1 = summary['start_event_template']
184 |             ev2 = summary['end_event_template']
185 |             summary['first_sample_template'] = dataset[ev1]['start'] - read_info.start_time
186 |             end = dataset[ev2-1]['start'] + dataset[ev2-1]['length'] - read_info.start_time
187 |             summary['duration_template'] = end - summary['first_sample_template']
188 |         if summary.get('start_event_complement', -1) >= 0:
189 |             ev1 = summary['start_event_complement']
190 |             ev2 = summary['end_event_complement']
191 |             summary['first_sample_complement'] = dataset[ev1]['start'] - read_info.start_time
192 |             end = dataset[ev2-1]['start'] + dataset[ev2-1]['length'] - read_info.start_time
193 |             summary['duration_complement'] = end - summary['first_sample_complement']
194 | 


--------------------------------------------------------------------------------
/ont_fast5_api/compression_settings.py:
--------------------------------------------------------------------------------
 1 | import pkg_resources
 2 | 
 3 | 
 4 | def register_plugin():
 5 |     plugin_path = pkg_resources.resource_filename('ont_fast5_api', 'vbz_plugin')
 6 |     try:
 7 |         from h5py import h5pl
 8 |         h5pl.prepend(bytes(plugin_path, 'UTF-8'))
 9 |     except (ImportError, AttributeError):
10 |         # We don't have the plugin library in h5py<2.10 so we fall back on an environment variable
11 |         import os
12 |         os.environ['HDF5_PLUGIN_PATH'] = plugin_path
13 |     return plugin_path
14 | 
15 | 
16 | class AbstractCompression:
17 |     compression = "AbstractCompression"
18 |     compression_opts = ()
19 |     shuffle = False
20 |     scaleoffset = False
21 |     fletcher32 = False
22 | 
23 |     def __repr__(self):
24 |         return self.compression
25 | 
26 |     @property
27 |     def filter_settings(self):
28 |         return {}
29 | 
30 | 
31 | class VbzCompressionV1Alpha(AbstractCompression):
32 |     def __init__(self):
33 |         self.compression = 32020  # https://portal.hdfgroup.org/display/support/Registered+Filters
34 |         self.compression_opts = (1, 2, 1, 1)  # VBZ_VERSION, VBZ_PACKING, VBZ_ZIG_ZAG, VBZ_ZSTD_COMPRESSION
35 | 
36 |     def __repr__(self):
37 |         return "vbz_v1.alpha"
38 | 
39 |     @property
40 |     def filter_settings(self):
41 |         return {str(self.compression): self.compression_opts}
42 | 
43 | 
44 | class VbzCompression(AbstractCompression):
45 |     def __init__(self):
46 |         self.compression = 32020  # https://portal.hdfgroup.org/display/support/Registered+Filters
47 |         self.compression_opts = (0, 2, 1, 1)  # VBZ_VERSION, VBZ_PACKING, VBZ_ZIG_ZAG, VBZ_ZSTD_COMPRESSION
48 | 
49 |     def __repr__(self):
50 |         return "vbz"
51 | 
52 |     @property
53 |     def filter_settings(self):
54 |         return {str(self.compression): self.compression_opts}
55 | 
56 | 
57 | class GzipCompression(AbstractCompression):
58 |     def __init__(self):
59 |         self.compression = "gzip"
60 |         self.compression_opts = 1
61 | 
62 |     @property
63 |     def filter_settings(self):
64 |         return {str(self.compression): self.compression_opts}
65 | 
66 | 
67 | VBZ_ERROR_MESSAGE = "Failed to read compressed raw data. " \
68 |                     "VBZ compression filter (id=32020) may be missing from expected path: '{}'"
69 | 
70 | 
71 | def raise_missing_vbz_error_read(err):
72 |     if str(VBZ.compression) in str(err):
73 |         raise IOError(VBZ_ERROR_MESSAGE.format(register_plugin())) from err
74 |     # If we don't see anything relating to VBZ just raise the existing error without additional info
75 |     raise
76 | 
77 | 
78 | def raise_missing_vbz_error_write(err):
79 |     if type(err) is OSError and "Can't read data" in str(err):
80 |         raise IOError(VBZ_ERROR_MESSAGE.format(register_plugin())) from err
81 |     # If we don't see anything relating to VBZ just raise the existing error without additional info
82 |     raise
83 | 
84 | 
85 | VBZ_ALPHA = VbzCompressionV1Alpha()
86 | VBZ = VbzCompression()
87 | GZIP = GzipCompression()
88 | 
89 | COMPRESSION_MAP = {str(comp): comp for comp in (VBZ_ALPHA, VBZ, GZIP)}
90 | 


--------------------------------------------------------------------------------
/ont_fast5_api/conversion_tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/ont_fast5_api/conversion_tools/__init__.py


--------------------------------------------------------------------------------
/ont_fast5_api/conversion_tools/check_file_compression.py:
--------------------------------------------------------------------------------
 1 | from argparse import ArgumentParser
 2 | 
 3 | from ont_fast5_api.compression_settings import COMPRESSION_MAP
 4 | from ont_fast5_api.conversion_tools.conversion_utils import yield_fast5_files
 5 | from ont_fast5_api.fast5_interface import get_fast5_file
 6 | 
 7 | 
 8 | def check_read_compression(read):
 9 |     """
10 |     Check the compresion type on the raw data of a read
11 |     :param read: Fast5Read object
12 |     :return: AbstractCompression object
13 |     """
14 |     detected_compression = read.raw_compression_filters
15 |     for compression in COMPRESSION_MAP.values():
16 |         if compression.filter_settings == detected_compression:
17 |             return compression
18 |     return detected_compression
19 | 
20 | 
21 | def check_compression(input_path, recursive, follow_symlinks, check_all_reads):
22 |     """
23 |     Check the compression type of the raw data in files in a folder
24 |     :param input_path:
25 |     :param recursive:
26 |     :param follow_symlinks:
27 |     :param check_all_reads: bool - check all reads in a file or just the first
28 |     :return: (Compression, read_id, file_path)
29 |     """
30 |     for input_file in yield_fast5_files(input_path, recursive, follow_symlinks):
31 |         with get_fast5_file(input_file, 'r') as f5:
32 |             for read in f5.get_reads():
33 |                 compression = check_read_compression(read)
34 |                 yield (compression, read.read_id, input_file)
35 |                 if not check_all_reads:
36 |                     break
37 | 
38 | 
39 | def main():
40 |     parser = ArgumentParser("Tool for checking the compression type of raw data in fast5 files")
41 |     parser.add_argument('-i', '--input_path', required=True,
42 |                         help="Path to Fast5 file or directory of Fast5 files")
43 |     parser.add_argument('--check_all_reads', action='store_true', required=False, default=False,
44 |                         help="Check all reads in a file individually (default: check only the first read)")
45 |     parser.add_argument('-r', '--recursive', action='store_true', required=False, default=False,
46 |                         help="Search recursively through folders for MultiRead fast5 files")
47 |     parser.add_argument('--ignore_symlinks', action='store_true',
48 |                         help="Ignore symlinks when searching recursively for fast5 files")
49 |     parser.add_argument('--file_list', required=False,
50 |                         help="File containing names of files to search in")
51 |     args = parser.parse_args()
52 |     compression_results = check_compression(args.input_path, args.recursive, not args.ignore_symlinks,
53 |                                             args.check_all_reads)
54 |     for result in compression_results:
55 |         print(result)
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     main()
60 | 


--------------------------------------------------------------------------------
/ont_fast5_api/conversion_tools/compress_fast5.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import shutil
  4 | from argparse import ArgumentParser, ArgumentError
  5 | from multiprocessing.pool import Pool
  6 | 
  7 | from ont_fast5_api import __version__
  8 | from ont_fast5_api.compression_settings import COMPRESSION_MAP
  9 | from ont_fast5_api.conversion_tools.conversion_utils import get_fast5_file_list, get_progress_bar
 10 | from ont_fast5_api.fast5_file import Fast5File, EmptyFast5
 11 | from ont_fast5_api.fast5_read import copy_attributes
 12 | from ont_fast5_api.fast5_interface import is_multi_read
 13 | from ont_fast5_api.multi_fast5 import MultiFast5File
 14 | from ont_fast5_api.static_data import OPTIONAL_READ_GROUPS
 15 | 
 16 | 
 17 | def compress_batch(input_folder, output_folder, target_compression, recursive=True, threads=1, follow_symlinks=True,
 18 |                    in_place=False, sanitize=False):
 19 |     # We require an absolute input path to we can replicate the data structure relative to it later on
 20 |     input_folder = os.path.abspath(input_folder)
 21 | 
 22 |     file_list = get_fast5_file_list(input_folder, recursive, follow_symlinks=follow_symlinks)
 23 |     if len(file_list) == 0:
 24 |         raise ValueError("No input fast5 files found in '{}'. Recursive={}".format(input_folder, recursive))
 25 | 
 26 |     # Set up the process pool and the progressbar
 27 |     pool = Pool(min(threads, len(file_list)))
 28 |     pbar = get_progress_bar(len(file_list))
 29 | 
 30 |     def update(result):
 31 |         if in_place and result is not None:
 32 |             input_path, output_path = result
 33 |             shutil.move(output_path, input_path)
 34 |         pbar.update(pbar.currval + 1)
 35 | 
 36 |     for input_file in file_list:
 37 |         input_path = os.path.join(input_folder, input_file)
 38 |         if in_place:
 39 |             output_path = os.path.join(input_path + ".tmp.compressed")
 40 |         else:
 41 |             output_path = os.path.join(output_folder, os.path.relpath(input_path, input_folder))
 42 | 
 43 |         pool.apply_async(func=compress_file,
 44 |                          args=(input_path, output_path, target_compression, sanitize),
 45 |                          callback=update)
 46 | 
 47 |     # Tear down the process pool and pbar. We can't use contextmanagers since we need to close() then join()
 48 |     pool.close()
 49 |     pool.join()
 50 |     pbar.finish()
 51 | 
 52 | 
 53 | def compress_file(input_file, output_file, target_compression, sanitize=False):
 54 |     try:
 55 |         os.makedirs(os.path.dirname(output_file), exist_ok=True)
 56 |         if is_multi_read(input_file):
 57 |             with MultiFast5File(input_file, 'r') as input_f5, MultiFast5File(output_file, 'a') as output_f5:
 58 |                 for read in input_f5.get_reads():
 59 |                     output_f5.add_existing_read(read, target_compression, sanitize=sanitize)
 60 |         else:
 61 |             with Fast5File(input_file, 'r') as input_f5, \
 62 |                     EmptyFast5(output_file, 'a') as output_f5:
 63 |                 compress_single_read(output_f5, input_f5, target_compression, sanitize=sanitize)
 64 |     except Exception as e:
 65 |         # Error raised in Pool.async will be lost so we explicitly print them.
 66 |         logging.exception(e)
 67 |         raise
 68 |     return (input_file, output_file)
 69 | 
 70 | 
 71 | def compress_single_read(output_f5, read_to_copy, target_compression, sanitize=False):
 72 |     read_id = read_to_copy.get_read_id()
 73 |     raw_dataset_name = read_to_copy.raw_dataset_name
 74 |     raw_group_name = read_to_copy.raw_dataset_group_name
 75 |     read_name = "read_" + read_id
 76 |     # Recreating the status object is painful, but doesn't actually interact with the file so we can just reference it.
 77 |     output_f5.status = read_to_copy.status
 78 | 
 79 |     if str(target_compression) in read_to_copy.raw_compression_filters:
 80 |         # If we have the right compression then no need for doing anything fancy
 81 |         output_f5.handle.copy(read_to_copy.handle, read_name)
 82 |     else:
 83 |         copy_attributes(read_to_copy.handle.attrs, output_f5.handle)
 84 |         for subgroup in read_to_copy.handle:
 85 |             if subgroup not in raw_dataset_name:
 86 |                 if sanitize and subgroup in OPTIONAL_READ_GROUPS:
 87 |                     # skip optional groups when sanitizing
 88 |                     continue
 89 |                 output_f5.handle.copy(read_to_copy.handle[subgroup], subgroup)
 90 |             else:
 91 |                 raw_attrs = read_to_copy.handle[raw_group_name].attrs
 92 |                 raw_data = read_to_copy.handle[raw_dataset_name]
 93 |                 output_f5.add_raw_data(raw_data, raw_attrs, compression=target_compression)
 94 | 
 95 | 
 96 | def main():
 97 |     parser = ArgumentParser("Tool for changing the compression of Fast5 files")
 98 |     parser.add_argument('-i', '--input_path', required=True,
 99 |                         help='Folder containing fast5 files')
100 | 
101 |     output_group = parser.add_mutually_exclusive_group(required=True)
102 |     save_arg = output_group.add_argument('-s', '--save_path', default=None,
103 |                                          help="Folder to output fast5 read files to")
104 |     output_group.add_argument('--in_place', action='store_true',
105 |                               help='Replace the old files with new files in place')
106 | 
107 |     parser.add_argument('-c', '--compression', required=True, choices=list(COMPRESSION_MAP.keys()),
108 |                         help="Target output compression type")
109 |     parser.add_argument('--sanitize', action='store_true',
110 |                         help="Clean output files of optional groups and datasets (e.g. 'Analyses')")
111 |     parser.add_argument('-t', '--threads', type=int, default=1, required=False,
112 |                         help="Maximum number of threads to use")
113 |     parser.add_argument('--recursive', action='store_true',
114 |                         help="Search recursively through folders for single_read fast5 files")
115 |     parser.add_argument('--ignore_symlinks', action='store_true',
116 |                         help="Ignore symlinks when searching recursively for fast5 files")
117 |     parser.add_argument('-v', '--version', action='version', version=__version__)
118 |     args = parser.parse_args()
119 | 
120 |     if args.input_path == args.save_path:
121 |         raise ArgumentError(save_arg, "--input_path and --save_path must be different locations, or use --in_place")
122 |     if args.sanitize and args.save_path is None:
123 |         raise ArgumentError(save_arg, "--save_path must be given if using --sanitize")
124 | 
125 |     compress_batch(input_folder=args.input_path,
126 |                    output_folder=args.save_path,
127 |                    target_compression=COMPRESSION_MAP[args.compression],
128 |                    threads=args.threads,
129 |                    recursive=args.recursive,
130 |                    follow_symlinks=not args.ignore_symlinks,
131 |                    in_place=args.in_place,
132 |                    sanitize=args.sanitize)
133 | 
134 | 
135 | if __name__ == '__main__':
136 |     main()
137 | 


--------------------------------------------------------------------------------
/ont_fast5_api/conversion_tools/demux_fast5.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Script for binning fast5 reads into separate directories based on column value in summary file
  3 | Inteded for demultiplexing reads using barcoding summary file.
  4 | """
  5 | from pathlib import Path
  6 | from typing import Union, Dict, Set, List
  7 | from multiprocessing import Pool
  8 | import logging
  9 | from csv import reader
 10 | from collections import defaultdict
 11 | from time import sleep
 12 | from math import ceil
 13 | from argparse import ArgumentParser
 14 | 
 15 | from ont_fast5_api.compression_settings import COMPRESSION_MAP
 16 | from ont_fast5_api.conversion_tools.conversion_utils import (
 17 |     get_fast5_file_list,
 18 |     get_progress_bar,
 19 |     Fast5FilterWorker,
 20 |     READS_PER_FILE,
 21 |     FILENAME_BASE,
 22 |     ProgressBar,
 23 | )
 24 | 
 25 | DEMULTIPLEX_COLUMN = "barcode_arrangement"
 26 | READ_ID_COLUMN = "read_id"
 27 | 
 28 | 
 29 | class Fast5Demux:
 30 |     """
 31 |     Bin reads from directory of fast5 files according to demultiplex_column in sequencing_summary path
 32 |     :param input_dir: Path to input Fast5 file or directory of Fast5 files
 33 |     :param output_dir: Path to output directory
 34 |     :param summary_file: Path to TSV summary file
 35 |     :param demultiplex_column: str name of column with demultiplex values
 36 |     :param read_id_column: str name of column with read ids
 37 |     :param filename_base: str prefix for output Fast5 files
 38 |     :param batch_size: int maximum number of reads per output file
 39 |     :param threads: int maximum number of worker processes
 40 |     :param recursive: bool flag to search recursively through input_dir for Fast5 files
 41 |     :param follow_symlinks: bool flag to follow symlinks in input_dir
 42 |     :param target_compression: str compression type in output Fast5 files
 43 |     """
 44 | 
 45 |     def __init__(
 46 |         self,
 47 |         input_dir: Path,
 48 |         output_dir: Path,
 49 |         summary_file: Path,
 50 |         demultiplex_column: str,
 51 |         read_id_column: str = READ_ID_COLUMN,
 52 |         filename_base: str = FILENAME_BASE,
 53 |         batch_size: int = READS_PER_FILE,
 54 |         threads: int = 1,
 55 |         recursive: bool = False,
 56 |         follow_symlinks: bool = True,
 57 |         target_compression: Union[str, None] = None,
 58 |     ):
 59 |         self.input_dir = input_dir
 60 |         self.output_dir = output_dir
 61 |         self.summary = summary_file
 62 |         self.demultiplex_column = demultiplex_column
 63 |         self.read_id_column = read_id_column
 64 |         self.filename_base = filename_base
 65 |         self.batch_size = batch_size
 66 |         self.threads = threads
 67 |         self.recursive = recursive
 68 |         self.follow_symlinks = follow_symlinks
 69 |         self.target_compression = target_compression
 70 | 
 71 |         self.read_sets: Dict[str, Set[str]] = {}
 72 |         self.input_fast5s: List[Path] = []
 73 |         self.max_threads: int = 0
 74 |         self.workers: List = []
 75 |         self.progressbar: Union[ProgressBar, None] = None
 76 |         self.logger: logging.Logger = logging.getLogger(self.__class__.__name__)
 77 | 
 78 |     def create_output_dirs(self) -> None:
 79 |         """
 80 |         In output directory create a subdirectory per demux category
 81 |         :return:
 82 |         """
 83 |         self.output_dir.mkdir(parents=True, exist_ok=True)
 84 |         for demux in self.read_sets:
 85 |             out_dir = self.output_dir / demux
 86 |             out_dir.mkdir(exist_ok=True)
 87 | 
 88 |     def run_batch(self) -> None:
 89 |         """
 90 |         Run workers in pool or sequentially
 91 |         Starts multiprocessing pool if max_threads allows it
 92 |         :return:
 93 |         """
 94 |         self.workers_setup()
 95 | 
 96 |         if self.max_threads > 1:
 97 |             with Pool(self.max_threads) as pool:
 98 |                 for worker in self.workers:
 99 |                     worker.run_batch(pool=pool)
100 |                 while any(worker.tasks for worker in self.workers):
101 |                     sleep(1)
102 | 
103 |             pool.join()
104 |             pool.close()
105 |         else:
106 |             for worker in self.workers:
107 |                 worker.run_batch(pool=None)
108 | 
109 |         self.progressbar.finish()
110 | 
111 |     def workers_setup(self) -> None:
112 |         """
113 |         Parse input summary and input file list to determine amount of work
114 |         Create output directories and initialise workers
115 |         :return:
116 |         """
117 |         self.read_sets = self.parse_summary_demultiplex()
118 |         self.input_fast5s = get_fast5_file_list(
119 |             input_path=self.input_dir,
120 |             recursive=self.recursive,
121 |             follow_symlinks=self.follow_symlinks,
122 |         )
123 |         self.max_threads = self.calculate_max_threads()
124 |         # progressbar length is total numbers of reads to be extracted plus total number of files to be read
125 |         total_progress = sum(len(item) for item in self.read_sets.values()) + (
126 |             len(self.input_fast5s) * len(self.read_sets)
127 |         )
128 |         self.progressbar = get_progress_bar(num_reads=total_progress)
129 |         self.create_output_dirs()
130 |         for demux in sorted(self.read_sets):
131 |             self.workers.append(
132 |                 Fast5FilterWorker(
133 |                     input_file_list=self.input_fast5s,
134 |                     output_dir=self.output_dir / demux,
135 |                     read_set=self.read_sets[demux],
136 |                     progressbar=self.progressbar,
137 |                     logger=self.logger,
138 |                     filename_base=self.filename_base,
139 |                     batch_size=self.batch_size,
140 |                     target_compression=self.target_compression,
141 |                 )
142 |             )
143 | 
144 |     def report(self) -> None:
145 |         """
146 |         Log summary of work done
147 |         :return:
148 |         """
149 |         total_reads = 0
150 |         for idx, _ in enumerate(sorted(self.read_sets)):
151 |             worker = self.workers[idx]
152 |             for file, reads in worker.out_files.items():
153 |                 total_reads += len(reads)
154 | 
155 |         self.logger.info("{} reads extracted".format(total_reads))
156 | 
157 |         # report reads not found
158 |         reads_to_extract = sum(len(item) for item in self.read_sets.values())
159 |         if reads_to_extract > total_reads:
160 |             self.logger.warning(
161 |                 "{} reads not found!".format(reads_to_extract - total_reads)
162 |             )
163 | 
164 |     def calculate_max_threads(self) -> int:
165 |         """
166 |         Calculate max number of workers based on number of output files, input files and threads argument
167 |         :return: int
168 |         """
169 |         max_inputs_per_worker = len(self.input_fast5s)
170 |         total_outputs = 0
171 |         for read_set in self.read_sets.values():
172 |             outputs = int(ceil(len(read_set) / float(self.batch_size)))
173 |             total_outputs += min(outputs, max_inputs_per_worker)
174 | 
175 |         return min(self.threads, total_outputs)
176 | 
177 |     def parse_summary_demultiplex(self) -> Dict[str, Set[str]]:
178 |         """
179 |         Open a TSV file and parse read_id and demultiplex columns into dict {demultiplex: read_id_set}
180 |         :return:
181 |         """
182 |         read_sets = defaultdict(set)
183 |         with open(str(self.summary), "r") as fh:
184 |             read_list_tsv = reader(fh, delimiter="\t")
185 |             header = next(read_list_tsv)
186 | 
187 |             if self.read_id_column in header:
188 |                 read_id_col_idx = header.index(self.read_id_column)
189 |             else:
190 |                 raise ValueError(
191 |                     "No '{}' read_id column in header: {}".format(
192 |                         self.read_id_column, header
193 |                     )
194 |                 )
195 | 
196 |             if self.demultiplex_column in header:
197 |                 demultiplex_col_idx = header.index(self.demultiplex_column)
198 |             else:
199 |                 raise ValueError(
200 |                     "No '{}' demultiplex column in header: {}".format(
201 |                         self.demultiplex_column, header
202 |                     )
203 |                 )
204 | 
205 |             for line in read_list_tsv:
206 |                 read_id = line[read_id_col_idx]
207 |                 demux = line[demultiplex_col_idx]
208 |                 read_sets[demux].add(read_id)
209 | 
210 |         return read_sets
211 | 
212 | 
213 | def create_arg_parser():
214 |     parser = ArgumentParser(
215 |         "Tool for binning reads from a multi_read_fast5_file by column value in summary file"
216 |     )
217 |     parser.add_argument(
218 |         "-i",
219 |         "--input",
220 |         required=True,
221 |         type=Path,
222 |         help="Path to Fast5 file or directory of Fast5 files",
223 |     )
224 |     parser.add_argument(
225 |         "-s",
226 |         "--save_path",
227 |         required=True,
228 |         type=Path,
229 |         help="Directory to output MultiRead subset to",
230 |     )
231 |     parser.add_argument(
232 |         "-l",
233 |         "--summary_file",
234 |         required=True,
235 |         type=Path,
236 |         help="TSV file containing read_id column (sequencing_summary.txt file)",
237 |     )
238 |     parser.add_argument(
239 |         "-f",
240 |         "--filename_base",
241 |         default="batch",
242 |         required=False,
243 |         help="Root of output filename, default='{}' -> '{}0.fast5'".format(
244 |             FILENAME_BASE, FILENAME_BASE
245 |         ),
246 |     )
247 |     parser.add_argument(
248 |         "-n",
249 |         "--batch_size",
250 |         type=int,
251 |         default=READS_PER_FILE,
252 |         required=False,
253 |         help="Number of reads per multi-read file (default {})".format(READS_PER_FILE),
254 |     )
255 |     parser.add_argument(
256 |         "-t",
257 |         "--threads",
258 |         type=int,
259 |         default=1,
260 |         required=False,
261 |         help="Maximum number of parallel processes to use (default 1)",
262 |     )
263 |     parser.add_argument(
264 |         "-r",
265 |         "--recursive",
266 |         action="store_true",
267 |         required=False,
268 |         default=False,
269 |         help="Flag to search recursively through input directory for MultiRead fast5 files",
270 |     )
271 |     parser.add_argument(
272 |         "--ignore_symlinks",
273 |         action="store_true",
274 |         help="Ignore symlinks when searching recursively for fast5 files",
275 |     )
276 |     parser.add_argument(
277 |         "-c",
278 |         "--compression",
279 |         required=False,
280 |         default=None,
281 |         choices=list(COMPRESSION_MAP.keys()) + [None],
282 |         help="Target output compression type. If omitted - don't change compression type",
283 |     )
284 |     parser.add_argument(
285 |         "--demultiplex_column",
286 |         type=str,
287 |         default=DEMULTIPLEX_COLUMN,
288 |         required=False,
289 |         help="Name of column for demultiplexing in summary file (default '{}'".format(
290 |             DEMULTIPLEX_COLUMN
291 |         ),
292 |     )
293 |     parser.add_argument(
294 |         "--read_id_column",
295 |         type=str,
296 |         default=READ_ID_COLUMN,
297 |         required=False,
298 |         help="Name of read_id column in summary file (default '{}'".format(
299 |             READ_ID_COLUMN
300 |         ),
301 |     )
302 |     return parser
303 | 
304 | 
305 | def main():
306 |     parser = create_arg_parser()
307 |     args = parser.parse_args()
308 |     if args.compression is not None:
309 |         args.compression = COMPRESSION_MAP[args.compression]
310 | 
311 |     demux = Fast5Demux(
312 |         input_dir=args.input,
313 |         output_dir=args.save_path,
314 |         summary_file=args.summary_file,
315 |         demultiplex_column=args.demultiplex_column,
316 |         read_id_column=args.read_id_column,
317 |         filename_base=args.filename_base,
318 |         batch_size=args.batch_size,
319 |         threads=args.threads,
320 |         recursive=args.recursive,
321 |         follow_symlinks=not args.ignore_symlinks,
322 |         target_compression=args.compression,
323 |     )
324 |     demux.run_batch()
325 |     demux.report()
326 | 
327 | 
328 | if __name__ == "__main__":
329 |     main()
330 | 


--------------------------------------------------------------------------------
/ont_fast5_api/conversion_tools/fast5_subset.py:
--------------------------------------------------------------------------------
  1 | """Filter Fast5 files based on read_id list
  2 | """
  3 | import csv
  4 | import logging
  5 | from argparse import ArgumentParser
  6 | from math import ceil
  7 | from multiprocessing import Pool
  8 | from os import makedirs, path
  9 | from pathlib import Path
 10 | from time import sleep
 11 | 
 12 | from ont_fast5_api.compression_settings import COMPRESSION_MAP
 13 | from ont_fast5_api.conversion_tools.conversion_utils import get_fast5_file_list, get_progress_bar, Fast5FilterWorker
 14 | from ont_fast5_api.conversion_tools.conversion_utils import READS_PER_FILE, FILENAME_BASE
 15 | 
 16 | logging.basicConfig(level=logging.DEBUG)
 17 | 
 18 | 
 19 | class Fast5Filter:
 20 |     """
 21 |     Extract reads listed read_list_file from fast5 files in input_folder, write to multi-fast5 files in
 22 |     output_folder
 23 |     """
 24 | 
 25 |     def __init__(self, input_folder, output_folder, read_list_file, filename_base=FILENAME_BASE,
 26 |                  batch_size=READS_PER_FILE, threads=1, recursive=False, file_list_file=None, follow_symlinks=True,
 27 |                  target_compression=None):
 28 |         assert path.isdir(input_folder)
 29 |         assert path.isfile(read_list_file)
 30 |         assert isinstance(filename_base, str)
 31 |         assert isinstance(batch_size, int)
 32 |         assert isinstance(threads, int)
 33 |         assert isinstance(recursive, bool)
 34 |         self.logger = logging.getLogger(self.__class__.__name__)
 35 | 
 36 |         self.read_set = parse_summary_file(read_list_file)
 37 |         self.input_f5s = get_fast5_file_list(str(input_folder), recursive, follow_symlinks=follow_symlinks)
 38 |         makedirs(output_folder, exist_ok=True)
 39 | 
 40 |         if len(self.read_set) < 1:
 41 |             raise ValueError("No reads in read list file {}".format(read_list_file))
 42 | 
 43 |         if len(self.input_f5s) < 1:
 44 |             raise ValueError(
 45 |                 "No input fast5 files found in {}. Recursion is set to {}".format(str(input_folder), recursive))
 46 | 
 47 |         if batch_size < 1:
 48 |             raise ValueError("Batch size (--batch_size) must be a positive integer, not {}".format(batch_size))
 49 | 
 50 |         if threads < 1:
 51 |             raise ValueError("Max number of threads (--threads) must be a positive integer, not {}".format(threads))
 52 | 
 53 |         if file_list_file:
 54 |             file_set = parse_summary_file(file_list_file)
 55 |             for file in file_set:
 56 |                 assert path.exists(file), "{} from file list doesn't exist".format(file)
 57 |             self.input_f5s = list(file_set.intersection(self.input_f5s))
 58 | 
 59 |         # determine max number of workers
 60 |         num_outputs = int(ceil(len(self.read_set) / float(batch_size)))
 61 |         self.num_workers = min(threads, num_outputs, len(self.input_f5s))
 62 | 
 63 |         # progressbar total is number of reads in read_set plus number of input files
 64 |         # (to see progress while scanning files that don't have any relevant reads)
 65 |         self.pbar = get_progress_bar(len(self.read_set) + len(self.input_f5s))
 66 | 
 67 |         self.worker = Fast5FilterWorker(
 68 |             input_file_list=self.input_f5s,
 69 |             output_dir=Path(output_folder),
 70 |             logger=self.logger,
 71 |             progressbar=self.pbar,
 72 |             read_set=self.read_set,
 73 |             filename_base=filename_base,
 74 |             batch_size=batch_size,
 75 |             target_compression=target_compression
 76 |         )
 77 | 
 78 |     def run_batch(self):
 79 | 
 80 |         if self.num_workers == 1:
 81 |             self.worker.run_batch(pool=None)
 82 |         else:
 83 |             with Pool(self.num_workers) as pool:
 84 |                 self.worker.run_batch(pool=pool)
 85 | 
 86 |                 while self.worker.tasks:
 87 |                     sleep(1)
 88 | 
 89 |                 pool.close()
 90 |                 pool.join()
 91 | 
 92 |         self.pbar.finish()
 93 |         self.logger.info("{} reads extracted".format(sum(len(v) for v in self.worker.out_files.values())))
 94 | 
 95 |         # report reads not found
 96 |         if len(self.worker.read_set) > 0:
 97 |             self.logger.warning("{} reads not found!".format(len(self.worker.read_set)))
 98 | 
 99 | 
100 | def parse_summary_file(read_list_file):
101 |     """
102 |     Opens a text file and returns set of read_ids
103 |     Expects either a single column file where every line is read_id or
104 |       multi-column Tab-separated CSV, that contains a column read_id
105 |     :param read_list_file: path to file
106 |     :return: set
107 |     """
108 |     reads = set()
109 |     with open(str(read_list_file), 'r') as fh:
110 |         read_list_tsv = csv.reader(fh, delimiter='\t')
111 |         header = next(read_list_tsv)
112 | 
113 |         if "read_id" in header:
114 |             col_idx = header.index("read_id")
115 |         else:
116 |             if len(header) == 1:
117 |                 reads.add(header[0].strip())
118 |                 col_idx = 0
119 |             else:
120 |                 raise TypeError("multi-column file without 'read_id' column")
121 | 
122 |         for line in read_list_tsv:
123 |             reads.add(line[col_idx].strip())
124 |     if len(reads) < 1:
125 |         raise ValueError("No reads in read list file {}".format(read_list_file))
126 |     return reads
127 | 
128 | 
129 | def main():
130 |     parser = ArgumentParser("Tool for extracting reads from a multi_read_fast5_file by read_id")
131 |     parser.add_argument('-i', '--input', required=True,
132 |                         help="Path to Fast5 file or directory of Fast5 files")
133 |     parser.add_argument('-s', '--save_path', required=True,
134 |                         help="Folder to output MultiRead subset to")
135 |     parser.add_argument('-l', '--read_id_list', required=True,
136 |                         help="File containing list of read ids to extract (or sequencing_summary.txt file)")
137 |     parser.add_argument('-f', '--filename_base', default=FILENAME_BASE, required=False,
138 |                         help="Root of output filename, default='{}' -> '{}0.fast5'".format(FILENAME_BASE, FILENAME_BASE))
139 |     parser.add_argument('-n', '--batch_size', type=int, default=READS_PER_FILE, required=False,
140 |                         help="Number of reads per multi-read file (default {}".format(READS_PER_FILE))
141 |     parser.add_argument('-t', '--threads', type=int, default=1, required=False,
142 |                         help="Maximum number of threads to use")
143 |     parser.add_argument('-r', '--recursive', action='store_true', required=False, default=False,
144 |                         help="Search recursively through folders for MultiRead fast5 files")
145 |     parser.add_argument('--ignore_symlinks', action='store_true',
146 |                         help="Ignore symlinks when searching recursively for fast5 files")
147 |     parser.add_argument('-c', '--compression', required=False, default=None,
148 |                         choices=list(COMPRESSION_MAP.keys()) + [None], help="Target output compression type")
149 |     parser.add_argument('--file_list', required=False,
150 |                         help="File containing names of files to search in")
151 |     args = parser.parse_args()
152 | 
153 |     if args.compression is not None:
154 |         args.compression = COMPRESSION_MAP[args.compression]
155 | 
156 |     multifilter = Fast5Filter(input_folder=args.input,
157 |                               output_folder=args.save_path,
158 |                               filename_base=args.filename_base,
159 |                               read_list_file=args.read_id_list,
160 |                               batch_size=args.batch_size,
161 |                               threads=args.threads,
162 |                               recursive=args.recursive,
163 |                               file_list_file=args.file_list,
164 |                               follow_symlinks=not args.ignore_symlinks,
165 |                               target_compression=args.compression)
166 | 
167 |     multifilter.run_batch()
168 | 
169 | 
170 | if __name__ == '__main__':
171 |     main()
172 | 


--------------------------------------------------------------------------------
/ont_fast5_api/conversion_tools/multi_to_single_fast5.py:
--------------------------------------------------------------------------------
  1 | from argparse import ArgumentParser
  2 | from multiprocessing import Pool
  3 | import logging
  4 | import os
  5 | 
  6 | from ont_fast5_api import __version__
  7 | from ont_fast5_api.conversion_tools.conversion_utils import get_fast5_file_list, get_progress_bar
  8 | from ont_fast5_api.fast5_file import EmptyFast5, Fast5FileTypeError
  9 | from ont_fast5_api.fast5_interface import check_file_type, MULTI_READ
 10 | from ont_fast5_api.multi_fast5 import MultiFast5File
 11 | 
 12 | logging.basicConfig(level=logging.INFO)
 13 | logger = logging.getLogger(__name__)
 14 | exc_info = False
 15 | 
 16 | 
 17 | def batch_convert_multi_files_to_single(input_path, output_folder, threads, recursive, follow_symlinks):
 18 |     pool = Pool(threads)
 19 |     file_list = get_fast5_file_list(input_path, recursive, follow_symlinks=follow_symlinks)
 20 |     pbar = get_progress_bar(len(file_list))
 21 | 
 22 |     def update(result):
 23 |         input_file = result[0]
 24 |         with open(os.path.join(output_folder, "filename_mapping.txt"), 'a') as output_table:
 25 |             for filename in result[1]:
 26 |                 output_table.write("{}\t{}\n".format(input_file, filename))
 27 |         pbar.update(pbar.currval + 1)
 28 | 
 29 |     if not os.path.exists(output_folder):
 30 |         os.makedirs(output_folder)
 31 | 
 32 |     results_array = []
 33 |     for batch_num, filename in enumerate(file_list):
 34 |         results_array.append(pool.apply_async(convert_multi_to_single,
 35 |                                               args=(filename, output_folder,
 36 |                                                     str(batch_num)),
 37 |                                               callback=update))
 38 | 
 39 |     pool.close()
 40 |     pool.join()
 41 |     pbar.finish()
 42 | 
 43 | 
 44 | def convert_multi_to_single(input_file, output_folder, subfolder):
 45 |     output_files = ()
 46 |     try:
 47 |         output_files = try_multi_to_single_conversion(input_file, output_folder, subfolder)
 48 |     except Exception as e:
 49 |         logger.error("{}\n\tFailed to copy files from: {}"
 50 |                      "".format(e, input_file), exc_info=exc_info)
 51 |     return input_file, output_files
 52 | 
 53 | 
 54 | def try_multi_to_single_conversion(input_file, output_folder, subfolder):
 55 |     output_files = []
 56 |     with MultiFast5File(input_file, 'r') as multi_f5:
 57 |         file_type = check_file_type(multi_f5)
 58 |         if file_type != MULTI_READ:
 59 |             raise Fast5FileTypeError("Could not convert Multi->Single for file type '{}' with path '{}'"
 60 |                                      "".format(file_type, input_file))
 61 |         for read in multi_f5.get_reads():
 62 |             try:
 63 |                 output_file = os.path.join(output_folder, subfolder, "{}.fast5".format(read.read_id))
 64 |                 create_single_f5(output_file, read)
 65 |                 output_files.append(os.path.basename(output_file))
 66 |             except Exception as e:
 67 |                 logger.error("{}\n\tFailed to copy read '{}' from {}"
 68 |                              "".format(str(e), read.read_id, input_file), exc_info=exc_info)
 69 |     return output_files
 70 | 
 71 | 
 72 | def create_single_f5(output_file, read):
 73 |     if not os.path.exists(os.path.dirname(output_file)):
 74 |         os.makedirs(os.path.dirname(output_file))
 75 |     with EmptyFast5(output_file, 'w') as single_f5:
 76 |         for group in read.handle:
 77 |             if group == "Raw":
 78 |                 read_number = read.handle["Raw"].attrs["read_number"]
 79 |                 single_f5.handle.copy(read.handle[group], "Raw/Reads/Read_{}".format(read_number))
 80 |             elif group in ("channel_id", "context_tags", "tracking_id"):
 81 |                 if "UniqueGlobalKey" not in single_f5.handle:
 82 |                     single_f5.handle.create_group("UniqueGlobalKey")
 83 |                 single_f5.handle.copy(read.handle[group], "UniqueGlobalKey/{}".format(group))
 84 |             else:
 85 |                 single_f5.handle.copy(read.handle[group], group)
 86 | 
 87 | 
 88 | def main():
 89 |     parser = ArgumentParser("")
 90 |     parser.add_argument('-i', '--input_path', required=True,
 91 |                         help="MultiRead fast5 file or path to directory of MultiRead files")
 92 |     parser.add_argument('-s', '--save_path', required=True,
 93 |                         help="Folder to output SingleRead fast5 files to")
 94 |     parser.add_argument('--recursive', action='store_true',
 95 |                         help="Search recursively through folders for MultiRead fast5 files")
 96 |     parser.add_argument('--ignore_symlinks', action='store_true',
 97 |                         help="Ignore symlinks when searching recursively for fast5 files")
 98 |     parser.add_argument('-t', '--threads', type=int, default=1, required=False,
 99 |                         help="Number of threads to use")
100 |     parser.add_argument('-v', '--version', action='version', version=__version__)
101 |     args = parser.parse_args()
102 | 
103 |     batch_convert_multi_files_to_single(args.input_path, args.save_path, args.threads,
104 |                                         args.recursive, follow_symlinks=not args.ignore_symlinks)
105 | 
106 | 
107 | if __name__ == '__main__':
108 |     main()
109 | 


--------------------------------------------------------------------------------
/ont_fast5_api/conversion_tools/single_to_multi_fast5.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | from argparse import ArgumentParser
  4 | from multiprocessing import Pool
  5 | 
  6 | from ont_fast5_api import __version__
  7 | from ont_fast5_api.compression_settings import COMPRESSION_MAP
  8 | from ont_fast5_api.conversion_tools.conversion_utils import get_fast5_file_list, batcher, get_progress_bar
  9 | from ont_fast5_api.fast5_file import Fast5File, Fast5FileTypeError
 10 | from ont_fast5_api.multi_fast5 import MultiFast5File
 11 | 
 12 | logging.basicConfig(level=logging.INFO)
 13 | logger = logging.getLogger(__name__)
 14 | exc_info = False
 15 | 
 16 | 
 17 | def batch_convert_single_to_multi(input_path, output_folder, filename_base, batch_size,
 18 |                                   threads, recursive, follow_symlinks, target_compression):
 19 |     pool = Pool(threads)
 20 |     file_list = get_fast5_file_list(input_path, recursive, follow_symlinks)
 21 |     pbar = get_progress_bar(int((len(file_list) + batch_size - 1) / batch_size))
 22 | 
 23 |     def update(result):
 24 |         output_file = result[1]
 25 |         with open(os.path.join(output_folder, "filename_mapping.txt"), 'a') as output_table:
 26 |             for filename in result[0]:
 27 |                 output_table.write("{}\t{}\n".format(filename, output_file))
 28 |         pbar.update(pbar.currval + 1)
 29 | 
 30 |     results_array = []
 31 |     os.makedirs(output_folder, exist_ok=True)
 32 |     for batch_num, batch in enumerate(batcher(file_list, batch_size)):
 33 |         output_file = os.path.join(output_folder, "{}_{}.fast5".format(filename_base, batch_num))
 34 |         results_array.append(pool.apply_async(create_multi_read_file,
 35 |                                               args=(batch, output_file, target_compression),
 36 |                                               callback=update))
 37 | 
 38 |     pool.close()
 39 |     pool.join()
 40 |     pbar.finish()
 41 | 
 42 | 
 43 | def create_multi_read_file(input_files, output_file, target_compression):
 44 |     results = []
 45 |     os.makedirs(os.path.dirname(output_file), exist_ok=True)
 46 |     if os.path.exists(output_file):
 47 |         logger.info("FileExists - appending new reads to existing file: {}".format(output_file))
 48 |     try:
 49 |         with MultiFast5File(output_file, 'a') as multi_f5:
 50 |             for filename in input_files:
 51 |                 try:
 52 |                     with Fast5File(filename, 'r') as f5_input:
 53 |                         read = f5_input.get_read(f5_input.read_id)
 54 |                         multi_f5.add_existing_read(read, target_compression=target_compression)
 55 |                     results.append(os.path.basename(filename))
 56 |                 except Fast5FileTypeError as e:
 57 |                     logger.error("{}: Cannot input MultiRead files to single_to_multi: '{}'"
 58 |                                  "".format(e, filename), exc_info=exc_info)
 59 |                     raise
 60 |                 except Exception as e:
 61 |                     logger.error("{}\n\tFailed to add single read file: '{}' to '{}'"
 62 |                                  "".format(e, filename, output_file), exc_info=exc_info)
 63 | 
 64 |     except Fast5FileTypeError:
 65 |         raise
 66 |     except Exception as e:
 67 |         logger.error("{}\n\tFailed to write to MultiRead file: {}"
 68 |                      "".format(e, output_file), exc_info=exc_info)
 69 |     return results, output_file
 70 | 
 71 | 
 72 | def main():
 73 |     parser = ArgumentParser("")
 74 |     parser.add_argument('-i', '--input_path', required=True,
 75 |                         help='Folder containing single read fast5 files')
 76 |     parser.add_argument('-s', '--save_path', required=True,
 77 |                         help="Folder to output multi read files to")
 78 |     parser.add_argument('-f', '--filename_base', default='batch', required=False,
 79 |                         help="Root of output filename, default='batch' -> 'batch_0.fast5'")
 80 |     parser.add_argument('-n', '--batch_size', type=int, default=4000, required=False,
 81 |                         help="Number of reads per multi-read file")
 82 |     parser.add_argument('-t', '--threads', type=int, default=1, required=False,
 83 |                         help="Number of threads to use")
 84 |     parser.add_argument('--recursive', action='store_true',
 85 |                         help="Search recursively through folders for single_read fast5 files")
 86 |     parser.add_argument('--ignore_symlinks', action='store_true',
 87 |                         help="Ignore symlinks when searching recursively for fast5 files")
 88 |     parser.add_argument('-c', '--compression', required=False, default=None,
 89 |                         choices=list(COMPRESSION_MAP.keys()) + [None], help="Target output compression type")
 90 |     parser.add_argument('-v', '--version', action='version', version=__version__)
 91 |     args = parser.parse_args()
 92 | 
 93 |     if args.compression is not None:
 94 |         args.compression = COMPRESSION_MAP[args.compression]
 95 | 
 96 |     batch_convert_single_to_multi(args.input_path,
 97 |                                   args.save_path,
 98 |                                   args.filename_base,
 99 |                                   args.batch_size,
100 |                                   args.threads,
101 |                                   args.recursive,
102 |                                   follow_symlinks=not args.ignore_symlinks,
103 |                                   target_compression=args.compression)
104 | 
105 | 
106 | if __name__ == '__main__':
107 |     main()
108 | 


--------------------------------------------------------------------------------
/ont_fast5_api/data_sanitisation.py:
--------------------------------------------------------------------------------
 1 | import h5py
 2 | import numpy as np
 3 | 
 4 | 
 5 | def _clean(value):
 6 |     """ Convert numpy numeric types to their python equivalents. """
 7 |     if isinstance(value, np.ndarray):
 8 |         if value.dtype.kind == 'S':
 9 |             return np.char.decode(value).tolist()
10 |         else:
11 |             return value.tolist()
12 |     elif type(value).__module__ == np.__name__:
13 |         # h5py==2.8.0 on windows sometimes fails to cast this from an np.float64 to a python.float
14 |         # We have to let the user do this themselves, since casting here could be dangerous
15 |         # https://github.com/h5py/h5py/issues/1051
16 |         conversion = value.item()  # np.asscalar(value) was deprecated in v1.16
17 |         if isinstance(conversion, bytes):
18 |             conversion = conversion.decode()
19 |         return conversion
20 |     elif isinstance(value, bytes):
21 |         return value.decode()
22 |     else:
23 |         return value
24 | 
25 | 
26 | def _sanitize_data_for_writing(data):
27 |     # To make the interface more user friendly we encode python strings as  byte-strings when writing datasets
28 |     if isinstance(data, str):
29 |         # Plain python-strings can be encoded trivially
30 |         return data.encode()
31 |     elif isinstance(data, np.ndarray) and data.dtype.kind == np.dtype(np.unicode_):
32 |         # If the array is all of one type, unicode-string, we can encode with numpy
33 |         return data.astype('S')
34 |     elif isinstance(data, np.ndarray) and len(data.dtype) > 1:
35 |         # If the array is of mixed types we have to set the encoding column by column
36 |         encoded_dtypes = []
37 |         for field_name in data.dtype.names:
38 |             field_dtype, field_byte_index = data.dtype.fields[field_name]
39 |             if field_dtype.kind == 'U':
40 |                 str_len = field_dtype.itemsize // field_dtype.alignment
41 |                 field_dtype = np.dtype("|S{}".format(str_len))
42 |             encoded_dtypes.append((field_name, field_dtype))
43 |         return data.astype(encoded_dtypes)
44 | 
45 |     return data
46 | 
47 | 
48 | def _sanitize_data_for_reading(data):
49 |     # To make the interface more user friendly we decode byte-strings into unicode strings when reading datasets
50 |     if isinstance(data, h5py.Dataset):
51 |         data = data[()]
52 | 
53 |     if isinstance(data, bytes):
54 |         # Plain byte-strings can be decoded trivially
55 |         return data.decode()
56 |     elif isinstance(data, np.ndarray) and data.dtype.kind == 'S':
57 |         # If the array is all of one type, byte-string, we can decode with numpy
58 |         return np.char.decode(data)
59 |     elif isinstance(data, np.ndarray) and len(data.dtype) > 1:
60 |         # If the array is of mixed types we have to decode column by column
61 |         decoded_dtypes = []
62 |         for field_name in data.dtype.names:
63 |             field_dtype, field_byte_index = data.dtype.fields[field_name]
64 |             if field_dtype.kind == 'S':
65 |                 field_dtype = np.dtype("<U{}".format(field_dtype.itemsize))
66 |             decoded_dtypes.append((field_name, field_dtype))
67 |         return data.astype(decoded_dtypes)
68 | 
69 |     return data
70 | 


--------------------------------------------------------------------------------
/ont_fast5_api/fast5_info.py:
--------------------------------------------------------------------------------
  1 | """ Helper class for getting information about a fast5 file.
  2 | """
  3 | import os
  4 | import h5py
  5 | 
  6 | from packaging import version as packaging_version
  7 | 
  8 | # This unused import is included for backwards compatibilty and can be removed in future.
  9 | from ont_fast5_api.data_sanitisation import _clean
 10 | 
 11 | class ReadInfo(object):
 12 |     """ This object provides basic details about a read.
 13 |     """
 14 |     
 15 |     def __init__(self, read_number, read_id, start_time, duration,
 16 |                  mux=0, median_before=-1.0):
 17 |         """ Constructs an object describing a read.
 18 |         
 19 |         :param read_number: A read number, unique for the channel.
 20 |         :param read_id: A globally unique read id.
 21 |         :param start_time: The start time of the read (in samples).
 22 |         :param duration: The duration of the read (in samples).
 23 |         :param mux: The mux of the channel when the read occurred.
 24 |         :param median_before: The median current before the read.
 25 |         """
 26 |         self.read_number = read_number
 27 |         self.read_id = read_id
 28 |         self.has_raw_data = False
 29 |         self.duration = duration
 30 |         self.has_event_data = False
 31 |         self.event_data_count = 0
 32 |         self.start_time = start_time
 33 |         self.start_mux = mux
 34 |         self.median_before = median_before
 35 | 
 36 | 
 37 | class Fast5Info(object):
 38 |     """ This object provides some basic details about a read fast5 file.
 39 |     
 40 |     **Fields**
 41 |       * **valid:** Indicates whether the fast5 file is valid or not.
 42 |       * **version:** Indicates the version of the read fast5 file
 43 |         specification the file conforms to (if any).
 44 |       * **read_info:** A list of ReadInfo objects. One entry for each read.
 45 |       * **read_number_map:** A dictionary giving the index into the read_info
 46 |         list for each read number.
 47 |       * **read_id_map:** A dictionary giving the index into the read_info
 48 |         list for each read-id.
 49 |     """
 50 | 
 51 |     def __init__(self, fname):
 52 |         """ Constructs a status object from a file.
 53 | 
 54 |         :param fname: Filename of fast5 file to read status from.
 55 |         """
 56 |         self.valid = True
 57 |         self.channel = None
 58 |         self.read_info = []
 59 |         self.read_number_map = {}
 60 |         self.read_id_map = {}
 61 |         try:
 62 |             with h5py.File(fname, 'r') as handle:
 63 |                 if 'file_version' in handle.attrs:
 64 |                     self.version = _clean(handle.attrs['file_version'])
 65 |                     minimum_valid_version = packaging_version.Version('0.6')
 66 |                     if packaging_version.parse(str(self.version)) \
 67 |                        < minimum_valid_version:
 68 |                         self.valid = False
 69 |                 else:
 70 |                     self.valid = False
 71 |                     self.version = 0.0
 72 | 
 73 |                 # Check for required groups.
 74 |                 top_groups = handle.keys()
 75 |                 if 'UniqueGlobalKey' in top_groups:
 76 |                     global_keys = handle['UniqueGlobalKey'].keys()
 77 |                 if 'tracking_id' not in global_keys and not self._legacy_version():
 78 |                     self.valid = False
 79 |                 if 'channel_id' not in global_keys:
 80 |                     self.valid = False
 81 | 
 82 |                 self.channel = handle['UniqueGlobalKey/channel_id'].attrs.get('channel_number')
 83 |                 if self.channel is None and self._legacy_version():
 84 |                     self.valid = False
 85 | 
 86 |                 # Get the read information.
 87 |                 if 'Raw' in top_groups:
 88 |                     reads = handle['Raw/Reads'].keys()
 89 |                     for read in reads:
 90 |                         read_group_name = 'Raw/Reads/{}'.format(read)
 91 |                         read_group = handle[read_group_name]
 92 |                         read_attrs = read_group.attrs
 93 |                         read_number = _clean(read_attrs['read_number'])
 94 |                         if 'read_id' in read_attrs:
 95 |                             read_id = _clean(read_attrs['read_id'])
 96 |                         else:
 97 |                             if not self._legacy_version():
 98 |                                 self.valid = False
 99 |                             else:
100 |                                 read_id = os.path.basename(fname)
101 |                         start_time = _clean(read_attrs['start_time'])
102 |                         duration = _clean(read_attrs['duration'])
103 |                         mux = _clean(read_attrs.get('start_mux',0))
104 |                         median_before = _clean(read_attrs.get('median_before',-1.0))
105 |                         read_info = ReadInfo(read_number, read_id, start_time, duration, mux, median_before)
106 |                         if 'Signal' in read_group:
107 |                             read_info.has_raw_data = True
108 |                         elif self._legacy_version():
109 |                             if 'Data' in read_group:
110 |                                 read_info.has_raw_data = True
111 |                             else:
112 |                                 self.valid = False
113 |                         self.read_info.append(read_info)
114 |                         n = len(self.read_info) - 1
115 |                         self.read_number_map[read_number] = n
116 |                         self.read_id_map[read_id] = n
117 |                 else:
118 |                     if not self._legacy_version():
119 |                         self.valid = False
120 |                 analyses = sorted(handle['Analyses'].keys()) if 'Analyses' in handle else []
121 |                 for ana in analyses[::-1]:
122 |                     if ana.startswith('EventDetection'):
123 |                         reads_group_name = 'Analyses/{}/Reads'.format(ana)
124 |                         if reads_group_name not in handle:
125 |                             continue
126 |                         reads = handle[reads_group_name].keys()
127 |                         for read in reads:
128 |                             read_group_name = '{}/{}'.format(reads_group_name, read)
129 |                             read_group = handle[read_group_name]
130 |                             read_attrs = read_group.attrs
131 |                             read_number = _clean(read_attrs['read_number'])
132 |                             if 'read_id' in read_attrs:
133 |                                 read_id = _clean(read_attrs['read_id'])
134 |                             else:
135 |                                 if not self._legacy_version():
136 |                                     self.valid = False
137 |                                     continue
138 |                                 else:
139 |                                     read_id = os.path.basename(fname)
140 |                             start_time = _clean(read_attrs['start_time'])
141 |                             duration = _clean(read_attrs['duration'])
142 |                             mux = _clean(read_attrs.get('start_mux', 0))
143 |                             median_before = _clean(read_attrs.get('median_before', -1.0))
144 |                             read_info = ReadInfo(read_number, read_id, start_time, duration, mux, median_before)
145 |                             if 'Events' in read_group:
146 |                                 read_info.has_event_data = True
147 |                                 read_info.event_data_count = len(read_group['Events'])
148 |                             else:
149 |                                 read_info.has_event_data = False
150 |                                 read_info.event_data_count = 0
151 |                             if read_number in self.read_number_map:
152 |                                 read_index = self.read_number_map[read_number]
153 |                                 self.read_info[read_index].has_event_data = read_info.has_event_data
154 |                                 self.read_info[read_index].event_data_count = read_info.event_data_count
155 |                             else:
156 |                                 if not self._legacy_version():
157 |                                     self.valid = False
158 |                                 self.read_info.append(read_info)
159 |                                 n = len(self.read_info) - 1
160 |                                 self.read_number_map[read_number] = n
161 |                                 self.read_id_map[read_id] = n
162 |                         break
163 |         except:
164 |             self.valid = False
165 |             raise
166 | 
167 |         if self._legacy_version():
168 |             # There must be either raw data or event data (or both).
169 |             if len(self.read_info) == 0:
170 |                 self.valid = False
171 | 
172 |     def _legacy_version(self):
173 |         legacy_cutoff = packaging_version.Version("1.1")
174 |         return packaging_version.parse(str(self.version)) < legacy_cutoff
175 | 


--------------------------------------------------------------------------------
/ont_fast5_api/fast5_interface.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from ont_fast5_api.fast5_file import Fast5File, Fast5FileTypeError
 3 | from ont_fast5_api.fast5_info import _clean
 4 | from ont_fast5_api.multi_fast5 import MultiFast5File
 5 | 
 6 | MULTI_READ = "multi-read"
 7 | SINGLE_READ = "single-read"
 8 | BULK_FAST5 = "bulk"
 9 | 
10 | 
11 | def get_fast5_file(filepath, mode='r', driver=None):
12 |     if is_multi_read(filepath):
13 |         return MultiFast5File(filepath, mode, driver=driver)
14 |     else:
15 |         return Fast5File(filepath, mode)
16 | 
17 | 
18 | def check_file_type(f5_file):
19 |     try:
20 |         return _clean(f5_file.handle.attrs['file_type'])
21 |     except KeyError:
22 |         # On older files we don't have the 'file_type' attribute so check groups explicitly
23 |         if len(f5_file.handle) == 0:
24 |             # If there are no top-level groups we default to MultiRead
25 |             return MULTI_READ
26 |         if len([read for read in f5_file.handle if read.startswith('read_')]) != 0:
27 |             # If there are any read_xxx groups we're definitely MultiRead
28 |             return MULTI_READ
29 |         if "UniqueGlobalKey" in f5_file.handle:
30 |             # This group indicates a single read
31 |             return SINGLE_READ
32 |     raise TypeError("Fast5 file type could not be identified as single- or multi-read. "
33 |                     "\nFilepath: {}".format(f5_file.filename))
34 | 
35 | 
36 | def is_multi_read(filepath):
37 |     """
38 |     Determine if a file is a MultiFast5File, True if it is, False if it is a single Fast5File error for other types
39 |     """
40 |     with MultiFast5File(filepath, mode='r') as f5_file:
41 |         file_type = check_file_type(f5_file)
42 |         if file_type == MULTI_READ:
43 |             return True
44 |         elif file_type == SINGLE_READ:
45 |             return False
46 |         elif file_type == BULK_FAST5:
47 |             raise NotImplementedError("ont_fast5_api does not support bulk fast files: {}".format(filepath))
48 |         raise Fast5FileTypeError("Unknown file type: '{}' for file: {}".format(file_type, filepath))
49 | 


--------------------------------------------------------------------------------
/ont_fast5_api/helpers.py:
--------------------------------------------------------------------------------
 1 | import h5py
 2 | import warnings
 3 | from ont_fast5_api import fast5_file as f5f
 4 | 
 5 | 
 6 | class FileToDict(object):
 7 | 
 8 |     def __init__(self):
 9 |         self.contents = {}
10 | 
11 |     def scan(self, name, obj):
12 |         if isinstance(obj, h5py.Group):
13 |             self.contents[name] = 'group'
14 |         else:
15 |             self.contents[name] = 'dataset'
16 |             self.contents['{}.data'.format(name)] = \
17 |                 str(f5f._sanitize_data_for_reading(obj))
18 |             self.contents['{}.cols'.format(name)] = obj.dtype.names
19 |         attrdict = {}
20 |         for item in obj.attrs:
21 |             attrdict[item] = \
22 |                 str(f5f._sanitize_data_for_reading(obj.attrs[item]))
23 |         self.contents['{}.attrs'.format(name)] = attrdict
24 |         return None
25 | 
26 | 
27 | def compare_hdf_files(file1, file2):
28 |     """ Compare two hdf files.
29 |     :param file1: First file to compare.
30 |     :param file2: Second file to compare.
31 | 
32 |     :returns True if they are the same.
33 |     """
34 |     warnings.simplefilter("default")
35 |     warnings.warn("'compare_hdf_files': HDF5 comparison is deprecated. \n"
36 |                   "If this feature is still required please contact the project maintainers",
37 |                   DeprecationWarning)
38 |     data1 = FileToDict()
39 |     data2 = FileToDict()
40 |     scanner1 = data1.scan
41 |     scanner2 = data2.scan
42 |     with h5py.File(file1, 'r') as fh1:
43 |         fh1.visititems(scanner1)
44 |     with h5py.File(file2, 'r') as fh2:
45 |         fh2.visititems(scanner2)
46 |     return data1.contents == data2.contents
47 | 


--------------------------------------------------------------------------------
/ont_fast5_api/multi_fast5.py:
--------------------------------------------------------------------------------
  1 | import h5py
  2 | 
  3 | from ont_fast5_api import CURRENT_FAST5_VERSION
  4 | from ont_fast5_api.fast5_file import Fast5File, Fast5FileTypeError
  5 | from ont_fast5_api.fast5_read import AbstractFast5, Fast5Read, copy_attributes
  6 | from ont_fast5_api.static_data import HARDLINK_GROUPS, OPTIONAL_READ_GROUPS
  7 | 
  8 | 
  9 | class MultiFast5File(AbstractFast5):
 10 |     def __init__(self, filename, mode='r', driver=None):
 11 |         # See https://docs.h5py.org/en/stable/high/file.html#file-drivers for
 12 |         # information on why you might want to use a specific driver.
 13 |         self.filename = filename
 14 |         self.mode = mode
 15 |         self.handle = h5py.File(self.filename, self.mode, driver=driver)
 16 |         self._run_id_map = None
 17 |         if mode != 'r' and 'file_version' not in self.handle.attrs:
 18 |             try:
 19 |                 self.handle.attrs['file_version'] = str(CURRENT_FAST5_VERSION)
 20 |             except IOError as e:
 21 |                 raise IOError("Could not write 'file_version' in mode '{}' to fast5 file: {}"
 22 |                                    "".format(self.filename, self.mode)) from e
 23 | 
 24 |     def get_reads(self):
 25 |         for group_name in self.handle:
 26 |             if group_name.startswith('read_'):
 27 |                 yield Fast5Read(self, group_name[5:])
 28 | 
 29 |     def get_read_ids(self):
 30 |         # Return all groups with the 'read_' stripped from the front
 31 |         return [group_name[5:] for group_name in self.handle if group_name.startswith('read_')]
 32 | 
 33 |     def get_read(self, read_id):
 34 |         group_name = "read_" + read_id
 35 |         if group_name not in self.handle:
 36 |             raise KeyError("Read '{}' not in: {}".format(group_name, self.filename))
 37 |         return Fast5Read(self, read_id)
 38 | 
 39 |     def create_read(self, read_id, run_id):
 40 |         DeprecationWarning("'MultiFast5File.create_read()' will be deprecated. "
 41 |                            "Use `MultiFast5File.create_empty_read()` instead")
 42 |         return self.create_empty_read(read_id, run_id)
 43 | 
 44 |     def create_empty_read(self, read_id, run_id):
 45 |         group_name = "read_" + read_id
 46 |         if group_name not in self.handle:
 47 |             try:
 48 |                 self.handle.create_group(group_name)
 49 |             except ValueError as e:
 50 |                 raise ValueError("Could not create group '{}' in file: {}".format(group_name, self.filename)) from e
 51 |             try:
 52 |                 for shared_group in HARDLINK_GROUPS:
 53 |                     self.handle["{}/{}".format(group_name, shared_group)] = \
 54 |                         self.handle["read_{}/{}".format(self.run_id_map[run_id], shared_group)]
 55 |             except KeyError:
 56 |                 # If we can't hardlink to existing groups then continue as normal
 57 |                 # registering this read as the new source of metadata for this run_id_map
 58 |                 self.run_id_map[run_id] = read_id
 59 |             self.handle[group_name].attrs["run_id"] = run_id
 60 |         else:
 61 |             raise ValueError("Read '{}' already exists in: {}".format(group_name, self.filename))
 62 |         return Fast5Read(self, read_id)
 63 | 
 64 | 
 65 |     @property
 66 |     def run_id_map(self):
 67 |         if self._run_id_map is None:
 68 |             self._run_id_map = dict()
 69 |             for read in self.get_reads():
 70 |                 try:
 71 |                     self._run_id_map[read.run_id] = read.read_id
 72 |                 except KeyError:
 73 |                     # If we can't find the read.run_id then there is a KeyError
 74 |                     # We want to ignore these cases since they can't be linked anyway
 75 |                     pass
 76 |         return self._run_id_map
 77 | 
 78 |     def add_existing_read(self, read_to_add, target_compression=None, sanitize=False):
 79 |         if isinstance(read_to_add, Fast5File):
 80 |             self._add_read_from_single(read_to_add, target_compression, sanitize=sanitize)
 81 |         elif isinstance(read_to_add.parent, MultiFast5File):
 82 |             self._add_read_from_multi(read_to_add, target_compression, sanitize=sanitize)
 83 |         else:
 84 |             raise Fast5FileTypeError("Could not add read to output file from input file type '{}' with path '{}'"
 85 |                                      "".format(type(read_to_add.parent), read_to_add.parent.filename))
 86 | 
 87 |     def _add_read_from_multi(self, read_to_add, target_compression, sanitize=False):
 88 |         read_name = "read_" + read_to_add.read_id
 89 |         self.handle.create_group(read_name)
 90 |         output_group = self.handle[read_name]
 91 |         copy_attributes(read_to_add.handle.attrs, output_group)
 92 |         for subgroup in read_to_add.handle:
 93 |             if sanitize and subgroup in OPTIONAL_READ_GROUPS:
 94 |                 # skip optional groups when sanitizing
 95 |                 continue
 96 |             elif subgroup == read_to_add.raw_dataset_group_name \
 97 |                     and target_compression is not None \
 98 |                     and str(target_compression) not in read_to_add.raw_compression_filters:
 99 |                 raw_attrs = read_to_add.handle[read_to_add.raw_dataset_group_name].attrs
100 |                 raw_data = read_to_add.handle[read_to_add.raw_dataset_name]
101 |                 output_read = self.get_read(read_to_add.read_id)
102 |                 output_read.add_raw_data(raw_data, raw_attrs, compression=target_compression)
103 |                 continue
104 |             elif subgroup in HARDLINK_GROUPS:
105 |                 if read_to_add.run_id in self.run_id_map:
106 |                     # There may be a group to link to, but we must check it actually exists!
107 |                     hardlink_source = "read_{}/{}".format(self.run_id_map[read_to_add.run_id], subgroup)
108 |                     if hardlink_source in self.handle:
109 |                         hardlink_dest = "read_{}/{}".format(read_to_add.read_id, subgroup)
110 |                         self.handle[hardlink_dest] = self.handle[hardlink_source]
111 |                         continue
112 |                 # If we couldn't hardlink to anything we need to make the group we create available for future reads
113 |                 self.run_id_map[read_to_add.run_id] = read_to_add.read_id
114 |             # If we haven't done a special-case copy then we can fall back on the default copy
115 |             output_group.copy(read_to_add.handle[subgroup], subgroup)
116 | 
117 |     def _add_read_from_single(self, read_to_add, target_compression, sanitize=False):
118 |         read_name = "read_" + read_to_add.read_id
119 |         self.handle.create_group(read_name)
120 |         output_group = self.handle[read_name]
121 |         copy_attributes(read_to_add.handle.attrs, output_group)
122 |         for subgroup in read_to_add.handle:
123 |             if sanitize and subgroup in OPTIONAL_READ_GROUPS:
124 |                 # skip optional groups when sanitizing
125 |                 continue
126 |             elif subgroup == "UniqueGlobalKey":
127 |                 for unique_group in read_to_add.handle["UniqueGlobalKey"]:
128 |                     if unique_group in HARDLINK_GROUPS and read_to_add.run_id in self.run_id_map:
129 |                         hardlink_source = "read_{}/{}".format(self.run_id_map[read_to_add.run_id], unique_group)
130 |                         if hardlink_source in self.handle:
131 |                             hardlink_dest = "read_{}/{}".format(read_to_add.read_id, unique_group)
132 |                             self.handle[hardlink_dest] = self.handle[hardlink_source]
133 |                     else:
134 |                         output_group.copy(read_to_add.handle["UniqueGlobalKey/{}".format(unique_group)],
135 |                                           unique_group)
136 |                 self.run_id_map[read_to_add.run_id] = read_to_add.read_id
137 |             elif subgroup == "Raw":
138 |                 if target_compression is None or str(target_compression) in read_to_add.raw_compression_filters:
139 |                     output_group.copy(read_to_add.handle[read_to_add.raw_dataset_group_name], "Raw")
140 |                 else:
141 |                     raw_attrs = read_to_add.handle[read_to_add.raw_dataset_group_name].attrs
142 |                     raw_data = read_to_add.handle[read_to_add.raw_dataset_name]
143 |                     output_read = self.get_read(read_to_add.read_id)
144 |                     output_read.add_raw_data(raw_data, raw_attrs, compression=target_compression)
145 |             else:
146 |                 if not sanitize:
147 |                     output_group.copy(read_to_add.handle[subgroup], subgroup)
148 | 


--------------------------------------------------------------------------------
/ont_fast5_api/static_data.py:
--------------------------------------------------------------------------------
 1 | # Current workflows add the component name to the hdf group as
 2 | # an attribute. For older files, the following dict allows us
 3 | # to work out the component name from the group name.
 4 | LEGACY_COMPONENT_NAMES = {'Alignment': 'alignment',
 5 |                           'Basecall_1D': 'basecall_1d',
 6 |                           'OnlineBasecall': 'basecall_1d',
 7 |                           'Basecall_2D': 'basecall_2d',
 8 |                           'Calibration_Strand': 'calibration_strand',
 9 |                           'EventDetection': 'event_detection',
10 |                           'Segmentation': 'segmentation',
11 |                           'Hairpin_Split': 'segmentation',
12 |                           'Segment_Linear': 'segmentation',
13 |                           'Validation': 'segmentation',
14 |                           'AlignToRef': 'align_to_ref',
15 |                           'Barcoding': 'barcoding',
16 |                           'Classification': 'classification',
17 |                           'Evaluation': 'evaluation',
18 |                           'Multiple_Alignment': 'multiple_alignment',
19 |                           'Squiggle_Map': 'squiggle_map',
20 |                           'Sam_Segmentor': 'sam_segmentor',
21 |                           'arma': 'arma',
22 |                           'Basic_component': 'basic_component',
23 |                           'RawGenomeCorrected':'raw_genome_corrected' # default name from tombo resquiggle command
24 |                           }
25 | 
26 | supported_modes = ('r', 'r+', 'w', 'w-', 'x', 'a')
27 | mode_docstring = """Supported file modes:
28 |     r        Readonly, file must exist (default)
29 |     r+       Read/write, file must exist
30 |     w        Create file, truncate if exists
31 |     w- or x  Create file, fail if exists
32 |     a        Read/write if exists, create otherwise"""  # Taken from h5py
33 | 
34 | HARDLINK_GROUPS = ("context_tags", "tracking_id")
35 | OPTIONAL_READ_GROUPS = {'Analyses'}
36 | 


--------------------------------------------------------------------------------
/ont_fast5_api/vbz_plugin/libvbz_hdf_plugin.dylib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/ont_fast5_api/vbz_plugin/libvbz_hdf_plugin.dylib


--------------------------------------------------------------------------------
/ont_fast5_api/vbz_plugin/libvbz_hdf_plugin_aarch64.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/ont_fast5_api/vbz_plugin/libvbz_hdf_plugin_aarch64.so


--------------------------------------------------------------------------------
/ont_fast5_api/vbz_plugin/libvbz_hdf_plugin_m1.dylib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/ont_fast5_api/vbz_plugin/libvbz_hdf_plugin_m1.dylib


--------------------------------------------------------------------------------
/ont_fast5_api/vbz_plugin/libvbz_hdf_plugin_x86_64.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/ont_fast5_api/vbz_plugin/libvbz_hdf_plugin_x86_64.so


--------------------------------------------------------------------------------
/ont_fast5_api/vbz_plugin/vbz_hdf_plugin.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/ont_fast5_api/vbz_plugin/vbz_hdf_plugin.dll


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | from setuptools import setup, find_packages
 4 | 
 5 | __pkg_name__ = 'ont_fast5_api'
 6 | 
 7 | 
 8 | def get_version():
 9 |     init_file = os.path.join(__pkg_name__, '__init__.py')
10 |     with open(init_file, 'r') as init_fh:
11 |         verstrline = init_fh.read()
12 |     vsre = r"^__version__ = ['\"]([^'\"]*)['\"]"
13 |     mo = re.search(vsre, verstrline, re.M)
14 |     if mo:
15 |         return mo.group(1)
16 |     else:
17 |         raise RuntimeError("Unable to find version string in '{}'".format(init_file))
18 | 
19 | 
20 | with open('README.rst') as readme:
21 |     documentation = readme.read()
22 | 
23 | installation_requirements = []
24 | if 'IGNORE_INCLUDES' not in os.environ:
25 |     installation_requirements = ['h5py>=3',
26 |                                  'numpy>=1.16',
27 |                                  'packaging',
28 |                                  'progressbar33>=2.3.1',
29 |                                  'setuptools']
30 | 
31 | setup(name=__pkg_name__.replace("_", "-"),
32 |       author='Oxford Nanopore Technologies, Limited',
33 |       description='Oxford Nanopore Technologies fast5 API software',
34 |       license='MPL 2.0',
35 |       long_description=documentation,
36 |       version=get_version(),
37 |       url='https://github.com/nanoporetech/{}'.format(__pkg_name__),
38 |       install_requires=installation_requirements,
39 |       packages=find_packages(),
40 |       package_data={__pkg_name__: ['vbz_plugin/*.so', 'vbz_plugin/*.dylib', 'vbz_plugin/*.dll']},
41 |       python_requires='>=3.7',
42 |       entry_points={'console_scripts': [
43 |           "multi_to_single_fast5={}.conversion_tools.multi_to_single_fast5:main".format(__pkg_name__),
44 |           "single_to_multi_fast5={}.conversion_tools.single_to_multi_fast5:main".format(__pkg_name__),
45 |           "fast5_subset={}.conversion_tools.fast5_subset:main".format(__pkg_name__),
46 |           "compress_fast5={}.conversion_tools.compress_fast5:main".format(__pkg_name__),
47 |           "check_compression={}.conversion_tools.check_file_compression:main".format(__pkg_name__),
48 |           "demux_fast5={}.conversion_tools.demux_fast5:main".format(__pkg_name__),
49 |       ]},
50 |       classifiers=[
51 |           'Development Status :: 5 - Production/Stable',
52 |           'Environment :: Console',
53 |           'Intended Audience :: Developers',
54 |           'Intended Audience :: Science/Research',
55 |           'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)',
56 |           'Natural Language :: English',
57 |           'Operating System :: Microsoft :: Windows',
58 |           'Operating System :: POSIX :: Linux',
59 |           'Operating System :: MacOS',
60 |           'Programming Language :: Python :: 3 :: Only',
61 |           'Topic :: Scientific/Engineering :: Bio-Informatics',
62 |       ],
63 |       keywords='fast5 nanopore')
64 | 


--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/__init__.py


--------------------------------------------------------------------------------
/test/data/basecall_2d_file_v1.0.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/basecall_2d_file_v1.0.fast5


--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/00031f3e-415c-4ab5-9c16-fb6fe45ff519.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/00031f3e-415c-4ab5-9c16-fb6fe45ff519.fast5


--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/000c0b4e-46c2-4fb5-9b17-d7031eefb975.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/000c0b4e-46c2-4fb5-9b17-d7031eefb975.fast5


--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/000ebd63-3e1a-4499-9ded-26af3225a022.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/000ebd63-3e1a-4499-9ded-26af3225a022.fast5


--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/002ad0e4-c6bb-4eff-a30f-5fec01475ab8.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/002ad0e4-c6bb-4eff-a30f-5fec01475ab8.fast5


--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/002b0891-03bf-4622-ae66-ae6984890ed4.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/002b0891-03bf-4622-ae66-ae6984890ed4.fast5


--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/0048058c-ecb4-4a0f-b283-9a128bd598c5.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/0048058c-ecb4-4a0f-b283-9a128bd598c5.fast5


--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/004a87b0-c9f6-4237-b4d6-466ab979aee2.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/004a87b0-c9f6-4237-b4d6-466ab979aee2.fast5


--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/0059d270-3238-4413-b38b-f588e28326df.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/0059d270-3238-4413-b38b-f588e28326df.fast5


--------------------------------------------------------------------------------
/test/data/hardlink/unlinked/batch0.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/unlinked/batch0.fast5


--------------------------------------------------------------------------------
/test/data/multi_read/batch_0.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/multi_read/batch_0.fast5


--------------------------------------------------------------------------------
/test/data/multi_read_analyses/batch_0.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/multi_read_analyses/batch_0.fast5


--------------------------------------------------------------------------------
/test/data/read_file_v0.6_raw.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/read_file_v0.6_raw.fast5


--------------------------------------------------------------------------------
/test/data/read_file_v0.6_single.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/read_file_v0.6_single.fast5


--------------------------------------------------------------------------------
/test/data/read_file_v1.0_single.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/read_file_v1.0_single.fast5


--------------------------------------------------------------------------------
/test/data/rle_basecall_table/rle_example.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/rle_basecall_table/rle_example.fast5


--------------------------------------------------------------------------------
/test/data/single_read_analyses/read.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_read_analyses/read.fast5


--------------------------------------------------------------------------------
/test/data/single_reads/fe85b517-62ee-4a33-8767-41cab5d5ab39.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_reads/fe85b517-62ee-4a33-8767-41cab5d5ab39.fast5


--------------------------------------------------------------------------------
/test/data/single_reads/fe8a3026-d1f4-46b3-8daa-e610f27acde1.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_reads/fe8a3026-d1f4-46b3-8daa-e610f27acde1.fast5


--------------------------------------------------------------------------------
/test/data/single_reads/fe9374ee-b86a-4ca4-81dc-ac06e3297728.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_reads/fe9374ee-b86a-4ca4-81dc-ac06e3297728.fast5


--------------------------------------------------------------------------------
/test/data/single_reads/read0.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_reads/read0.fast5


--------------------------------------------------------------------------------
/test/data/summaries/two_barcode_summary.txt:
--------------------------------------------------------------------------------
1 | read_id	barcode_arrangement
2 | fe85b517-62ee-4a33-8767-41cab5d5ab39	barcode01
3 | fe9374ee-b86a-4ca4-81dc-ac06e3297728	barcode02
4 | fe849dd3-63bc-4044-8910-14e1686273bb	barcode02
5 | fe8a3026-d1f4-46b3-8daa-e610f27acde1	barcode01


--------------------------------------------------------------------------------
/test/data/telemetry_test.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/telemetry_test.fast5


--------------------------------------------------------------------------------
/test/data/vbz_reads/vbz_reads.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/vbz_reads/vbz_reads.fast5


--------------------------------------------------------------------------------
/test/helpers.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | from tempfile import TemporaryDirectory, _get_candidate_names
 4 | import unittest
 5 | 
 6 | test_data = os.path.join(os.path.dirname(__file__), 'data')
 7 | 
 8 | 
 9 | def disable_logging(test_func):
10 |     def do_test(self, *args, **kwargs):
11 |         logging.disable(logging.CRITICAL)
12 |         test_func(self, *args, **kwargs)
13 | 
14 |     return do_test
15 | 
16 | 
17 | class TestFast5ApiHelper(unittest.TestCase):
18 | 
19 |     def setUp(self):
20 |         self._tmp_dir = TemporaryDirectory()
21 |         self.save_path = self._tmp_dir.name
22 | 
23 |     def tearDown(self):
24 |         self._tmp_dir.cleanup()
25 | 
26 |     def generate_temp_filename(self):
27 |         return os.path.join(self.save_path, next(_get_candidate_names()))
28 | 


--------------------------------------------------------------------------------
/test/test_alignment_tools.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import numpy as np
  3 | from ont_fast5_api.fast5_file import Fast5File
  4 | from ont_fast5_api.analysis_tools.alignment import AlignmentTools
  5 | from test.helpers import TestFast5ApiHelper
  6 | 
  7 | 
  8 | class TestAlignmentTools(TestFast5ApiHelper):
  9 | 
 10 |     def test_001_put_and_retrieve(self):
 11 |         fname = self.generate_temp_filename()
 12 |         summary_temp = {'genome': 'Lambda',
 13 |                         'genome_start': 100,
 14 |                         'genome_end': 200,
 15 |                         'strand_start': 1,
 16 |                         'strand_end': 101,
 17 |                         'num_events': 125,
 18 |                         'num_aligned': 92,
 19 |                         'num_correct': 87,
 20 |                         'num_insertions': 8,
 21 |                         'num_deletions': 8,
 22 |                         'identity': 0.9457,
 23 |                         'accuracy': 0.8056}
 24 |         summary_comp = {'genome': 'Lambda_rc',
 25 |                         'genome_start': 100,
 26 |                         'genome_end': 200,
 27 |                         'strand_start': 0,
 28 |                         'strand_end': 96,
 29 |                         'num_events': 120,
 30 |                         'num_aligned': 90,
 31 |                         'num_correct': 88,
 32 |                         'num_insertions': 6,
 33 |                         'num_deletions': 10,
 34 |                         'identity': 0.9778,
 35 |                         'accuracy': 0.8302}
 36 |         summary_2d = {'genome': 'Lambda',
 37 |                       'genome_start': 100,
 38 |                       'genome_end': 200,
 39 |                       'strand_start': 0,
 40 |                       'strand_end': 100,
 41 |                       'num_events': 125,
 42 |                       'num_aligned': 98,
 43 |                       'num_correct': 96,
 44 |                       'num_insertions': 4,
 45 |                       'num_deletions': 4,
 46 |                       'identity': 0.9796,
 47 |                       'accuracy': 0.9057}
 48 |         sam1 = 'Dummy string for template SAM.'
 49 |         sam2 = 'Dummy string for complement SAM.'
 50 |         sam3 = 'Dummy string for 2D SAM.'
 51 |         sequence1 = ''.join(np.random.choice(['A', 'C', 'G', 'T'], 100))
 52 |         bc = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
 53 |         sequence2 = ''.join([bc[letter] for letter in sequence1[::-1]])
 54 |         with Fast5File(fname, mode='w') as fh:
 55 |             fh.add_channel_info({'channel_number': 1,
 56 |                                  'sampling_rate': 4000,
 57 |                                  'digitisation': 8192,
 58 |                                  'range': 819.2,
 59 |                                  'offset': 0})
 60 |             fh.add_read(12, 'unique_snowflake', 12345, 4000, 0, 120.75)
 61 |             attrs = {'name': 'test', 'version': 0, 'time_stamp': 'just now', 'component': 'segmentation'}
 62 |             fh.add_analysis('segmentation', 'Segmentation_000', attrs)
 63 |             seg_data = {'has_template': 1,
 64 |                         'has_complement': 1,
 65 |                         'first_sample_template': 0,
 66 |                         'duration_template': 2000,
 67 |                         'first_sample_complement': 2000,
 68 |                         'duration_complement': 2000}
 69 |             fh.set_summary_data('Segmentation_000', 'segmentation', seg_data)
 70 |             attrs['component'] = 'alignment'
 71 |             attrs['segmentation'] = 'Analyses/Segmentation_000'
 72 |             fh.add_analysis('alignment', 'Alignment_000', attrs)
 73 |             fh.set_summary_data('Alignment_000', 'genome_mapping_template', summary_temp)
 74 |             fh.set_summary_data('Alignment_000', 'genome_mapping_complement', summary_comp)
 75 |             fh.set_summary_data('Alignment_000', 'genome_mapping_2d', summary_2d)
 76 |             with AlignmentTools(fh, group_name='Alignment_000') as align:
 77 |                 align.add_alignment_data('template', sam1, sequence1)
 78 |                 align.add_alignment_data('complement', sam2, sequence2)
 79 |                 align.add_alignment_data('2d', sam3, sequence1)
 80 |         with Fast5File(fname, mode='r') as fh:
 81 |             with AlignmentTools(fh, group_name='Alignment_000') as align:
 82 |                 sam, seq = align.get_alignment_data('template')
 83 |                 self.assertEqual(sam1, sam)
 84 |                 self.assertEqual(sequence1, seq)
 85 |                 sam, seq = align.get_alignment_data('complement')
 86 |                 self.assertEqual(sam2, sam)
 87 |                 self.assertEqual(sequence2, seq)
 88 |                 sam, seq = align.get_alignment_data('2d')
 89 |                 self.assertEqual(sam3, sam)
 90 |                 self.assertEqual(sequence1, seq)
 91 |                 results = align.get_results()
 92 |                 speed_temp = align.calculate_speed('template')
 93 |                 speed_comp = align.calculate_speed('complement')
 94 |                 # Make sure we can calculate speed using only what's in the
 95 |                 # summary
 96 |                 summary = fh.get_summary_data('Alignment_000')
 97 |                 template_summary = summary['genome_mapping_template']
 98 |                 summary_speed_temp = align.calculate_speed('template',
 99 |                                                            template_summary)
100 |         self.assertEqual(250, speed_temp)
101 |         self.assertEqual(250, speed_comp)
102 |         self.assertEqual(speed_temp, summary_speed_temp)
103 |         self.assertDictEqual({'status': 'match found',
104 |                               'direction': 'forward',
105 |                               'ref_name': 'Lambda',
106 |                               'ref_span': (100, 200),
107 |                               'seq_span': (1, 101),
108 |                               'seq_len': 125,
109 |                               'num_aligned': 92,
110 |                               'num_correct': 87,
111 |                               'num_insertions': 8,
112 |                               'num_deletions': 8,
113 |                               'identity': 0.9457,
114 |                               'accuracy': 0.8056}, results['template'])
115 |         self.assertDictEqual({'status': 'match found',
116 |                               'direction': 'reverse',
117 |                               'ref_name': 'Lambda',
118 |                               'ref_span': (100, 200),
119 |                               'seq_span': (0, 96),
120 |                               'seq_len': 120,
121 |                               'num_aligned': 90,
122 |                               'num_correct': 88,
123 |                               'num_insertions': 6,
124 |                               'num_deletions': 10,
125 |                               'identity': 0.9778,
126 |                               'accuracy': 0.8302}, results['complement'])
127 |         self.assertDictEqual({'status': 'match found',
128 |                               'direction': 'forward',
129 |                               'ref_name': 'Lambda',
130 |                               'ref_span': (100, 200),
131 |                               'seq_span': (0, 100),
132 |                               'seq_len': 125,
133 |                               'num_aligned': 98,
134 |                               'num_correct': 96,
135 |                               'num_insertions': 4,
136 |                               'num_deletions': 4,
137 |                               'identity': 0.9796,
138 |                               'accuracy': 0.9057}, results['2d'])
139 | 


--------------------------------------------------------------------------------
/test/test_basecall_1d_tools.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from ont_fast5_api.fast5_file import Fast5File
 3 | from ont_fast5_api.analysis_tools.basecall_1d import Basecall1DTools
 4 | from test.helpers import TestFast5ApiHelper
 5 | 
 6 | 
 7 | class TestBasecall1DTools(TestFast5ApiHelper):
 8 | 
 9 |     def test_001_put_and_retrieve(self):
10 |         fname = self.generate_temp_filename()
11 |         dtypes = [('mean', float),
12 |                   ('start', float),
13 |                   ('stdv', float),
14 |                   ('length', float),
15 |                   ('called_state', '<U5'),
16 |                   ('move', int)]
17 |         data1 = np.zeros(10, dtype=dtypes)
18 |         data1['mean'] = [10.0, 15.0, 8.5, 7.2, 13.6, 9.4, 11.8, 10.1, 4.2, 10.9]
19 |         data1['stdv'] = [0.7, 0.9, 1.0, 1.1, 0.75, 0.6, 0.83, 1.12, 9.45, 2.9]
20 |         data1['start'] = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
21 |         data1['length'] = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
22 |         data1['move'] = [1, 1, 0, 1, 0, 2, 1, 1, 1, 0]
23 |         data1['called_state'] = ['AAAAA', 'AAAAT', 'AAAAT', 'AAATC', 'AAATC',
24 |                                  'ATCCG', 'TCCGT', 'CCGTT', 'CGTTA', 'CGTTA']
25 |         data2 = data1[::-1]
26 |         seq1 = 'AAAAATCCGTTA'
27 |         seq2 = 'TAACGGATTTTT'
28 |         qstring1 = 'blahblahblah'
29 |         qstring2 = 'halbhalbhalb'
30 |         with Fast5File(fname, mode='w') as fh:
31 |             fh.add_channel_info({'channel_number': 1,
32 |                                  'sampling_rate': 4000,
33 |                                  'digitisation': 8192,
34 |                                  'range': 819.2,
35 |                                  'offset': 0})
36 |             fh.add_read(12, 'unique_snowflake', 12345, 1000, 0, 120.75)
37 |             attrs = {'name': 'test', 'version': 0, 'time_stamp': 'just now'}
38 |             fh.add_analysis('basecall_1d', 'Basecall_1D_000', attrs)
39 |             with Basecall1DTools(fh, group_name='Basecall_1D_000') as basecall:
40 |                 basecall.add_event_data('template', data1)
41 |                 basecall.add_event_data('complement', data2)
42 |                 basecall.add_called_sequence('template', 'template', seq1, qstring1)
43 |                 basecall.add_called_sequence('complement', 'complement', seq2, qstring2)
44 |         with Fast5File(fname, mode='r') as fh:
45 |             with Basecall1DTools(fh, group_name='Basecall_1D_000') as basecall:
46 |                 events1 = basecall.get_event_data('template')
47 |                 np.testing.assert_array_equal(events1, data1)
48 |                 events2 = basecall.get_event_data('complement')
49 |                 np.testing.assert_array_equal(events2, data2)
50 |                 n1, s1, q1 = basecall.get_called_sequence('template')
51 |                 self.assertEqual(n1, 'template')
52 |                 self.assertEqual(s1, seq1)
53 |                 self.assertEqual(q1, qstring1)
54 |                 n2, s2, q2 = basecall.get_called_sequence('complement')
55 |                 self.assertEqual(n2, 'complement')
56 |                 self.assertEqual(s2, seq2)
57 |                 self.assertEqual(q2, qstring2)
58 | 


--------------------------------------------------------------------------------
/test/test_basecall_2d_tools.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from ont_fast5_api.fast5_file import Fast5File
 3 | from ont_fast5_api.analysis_tools.basecall_2d import Basecall2DTools
 4 | from test.helpers import TestFast5ApiHelper
 5 | 
 6 | 
 7 | class TestBasecall2DTools(TestFast5ApiHelper):
 8 | 
 9 |     def test_001_put_and_retrieve(self):
10 |         fname = self.generate_temp_filename()
11 |         dtypes = [('template', int), ('complement', int)]
12 |         data1 = np.zeros(10, dtype=dtypes)
13 |         data1['template'] = [0, 1, 2, 2, 3, 4, 5, 6, 7, 8]
14 |         data1['complement'] = [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
15 |         dtypes.append(('model_state', '<U5'))
16 |         data2 = np.zeros(10, dtype=dtypes)
17 |         data2['template'] = data1['template']
18 |         data2['complement'] = data1['complement']
19 |         data2['model_state'] = ['AAAAA', 'AAAAT', 'AAATC', 'AAATC', 'ATCCG',
20 |                                 'TCCGT', 'CCGTT', 'CGTTA', 'CGTTA', 'GTTAC']
21 |         seq = 'AAAAATCCGTTAC'
22 |         qstring = 'blahblahblahb'
23 |         with Fast5File(fname, mode='w') as fh:
24 |             fh.add_channel_info({'channel_number': 1,
25 |                                  'sampling_rate': 4000,
26 |                                  'digitisation': 8192,
27 |                                  'range': 819.2,
28 |                                  'offset': 0})
29 |             fh.add_read(12, 'unique_snowflake', 12345, 1000, 0, 120.75)
30 |             attrs = {'name': 'test', 'version': 0, 'time_stamp': 'just now'}
31 |             fh.add_analysis('basecall_2d', 'Basecall_2D_000', attrs)
32 |             with Basecall2DTools(fh, group_name='Basecall_2D_000') as basecall:
33 |                 basecall.add_prior_alignment(data1)
34 |                 basecall.add_2d_call_alignment(data2)
35 |                 basecall.add_called_sequence("2D", 'test_2d', seq, qstring)
36 |         with Fast5File(fname, mode='r') as fh:
37 |             with Basecall2DTools(fh, group_name='Basecall_2D_000') as basecall:
38 |                 hp_align = basecall.get_prior_alignment()
39 |                 np.testing.assert_array_equal(hp_align, data1)
40 |                 bc2d = basecall.get_2d_call_alignment()
41 |                 np.testing.assert_array_equal(bc2d, data2)
42 |                 n, s, q = basecall.get_called_sequence("2D")
43 |                 self.assertEqual(n, 'test_2d')
44 |                 self.assertEqual(s, seq)
45 |                 self.assertEqual(q, qstring)
46 | 


--------------------------------------------------------------------------------
/test/test_check_compression.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | 
 4 | import numpy
 5 | 
 6 | from ont_fast5_api.compression_settings import GZIP, VBZ_ALPHA
 7 | from ont_fast5_api.conversion_tools.check_file_compression import check_read_compression, check_compression
 8 | from ont_fast5_api.fast5_file import Fast5File
 9 | from ont_fast5_api.fast5_interface import get_fast5_file
10 | from ont_fast5_api.multi_fast5 import MultiFast5File
11 | from test.helpers import test_data
12 | 
13 | 
14 | class TestCheckCompression(unittest.TestCase):
15 | 
16 |     def test_check_read_compression_single_read(self):
17 |         with get_fast5_file(os.path.join(test_data, 'single_reads', 'read0.fast5'), 'r') as f5:
18 |             for read in f5.get_reads():
19 |                 compression = check_read_compression(read)
20 |                 self.assertEqual(compression, GZIP)
21 | 
22 |     def test_check_read_compression_multi_read(self):
23 |         with get_fast5_file(os.path.join(test_data, 'multi_read', 'batch_0.fast5'), 'r') as f5:
24 |             for read in f5.get_reads():
25 |                 compression = check_read_compression(read)
26 |                 self.assertEqual(compression, GZIP)
27 | 
28 |     def test_check_read_compression_vbz(self):
29 |         with get_fast5_file(os.path.join(test_data, 'vbz_reads', 'vbz_reads.fast5'), 'r') as f5:
30 |             for read in f5.get_reads():
31 |                 compression = check_read_compression(read)
32 |                 self.assertEqual(compression, VBZ_ALPHA)
33 | 
34 |     def test_check_single_read_folder(self):
35 |         input_folder = os.path.join(test_data, 'single_reads')
36 |         compression_results = list(check_compression(input_folder, recursive=False,
37 |                                                      follow_symlinks=False, check_all_reads=False))
38 | 
39 |         ## expected
40 |         expected_results = []
41 |         for input_file in os.listdir(input_folder):
42 |             input_path = os.path.join(input_folder, input_file)
43 |             with Fast5File(input_path, 'r') as f5:
44 |                 expected_results.append((GZIP, f5.read_id, input_path))
45 | 
46 |         self.assertTrue(numpy.array_equal(expected_results, compression_results))
47 | 
48 |     def test_check_multi_read(self):
49 |         input_folder = os.path.join(test_data, 'vbz_reads')
50 |         ## expected results
51 |         expected_results = []
52 |         for input_file in os.listdir(input_folder):
53 |             input_path = os.path.join(input_folder, input_file)
54 |             with MultiFast5File(input_path, 'r') as f5:
55 |                 for read in f5.get_reads():
56 |                     expected_results.append((VBZ_ALPHA, read.read_id, input_path))
57 | 
58 |         # Test check all reads True
59 |         compression_results = list(check_compression(input_folder, recursive=False, follow_symlinks=False,
60 |                                                      check_all_reads=True))
61 |         self.assertTrue(numpy.array_equal(expected_results, compression_results))
62 | 
63 |         ## check one read only
64 |         compression_results = list(check_compression(input_folder, recursive=False, follow_symlinks=False,
65 |                                                      check_all_reads=False))
66 |         self.assertTrue(len(compression_results) == len(os.listdir(input_folder)))
67 |         self.assertTrue(compression_results[0] in expected_results)
68 | 


--------------------------------------------------------------------------------
/test/test_compress_fast5.py:
--------------------------------------------------------------------------------
  1 | import h5py
  2 | import os
  3 | import shutil
  4 | from unittest.mock import patch
  5 | 
  6 | from ont_fast5_api.compression_settings import VBZ, GZIP
  7 | from ont_fast5_api.conversion_tools.check_file_compression import check_read_compression, check_compression
  8 | from ont_fast5_api.conversion_tools.compress_fast5 import compress_file, compress_single_read, compress_batch
  9 | from ont_fast5_api.conversion_tools.conversion_utils import get_fast5_file_list
 10 | from ont_fast5_api.fast5_file import Fast5File, EmptyFast5
 11 | from ont_fast5_api.fast5_info import ReadInfo
 12 | from ont_fast5_api.fast5_interface import get_fast5_file
 13 | from ont_fast5_api.multi_fast5 import MultiFast5File
 14 | from ont_fast5_api.static_data import OPTIONAL_READ_GROUPS
 15 | from test.helpers import TestFast5ApiHelper, test_data
 16 | 
 17 | 
 18 | class TestVbzReadWrite(TestFast5ApiHelper):
 19 |     run_id = "123abc"
 20 | 
 21 |     def test_write_vbz_directly(self):
 22 |         input_data = range(10)
 23 |         with h5py.File(os.path.join(self.save_path, 'h5py.fast5'), 'w') as fast5:
 24 |             fast5.create_dataset('Signal', data=input_data, **vars(VBZ))
 25 |             raw = fast5['Signal']
 26 | 
 27 |             self.assertTrue(str(VBZ.compression) in raw._filters)
 28 |             self.assertEqual(VBZ.compression_opts, raw._filters[str(VBZ.compression)])
 29 |             self.assertEqual(list(input_data), list(raw))
 30 | 
 31 |     def test_read_vbz_using_api(self):
 32 |         with MultiFast5File(os.path.join(test_data, 'vbz_reads', 'vbz_reads.fast5'), 'r') as fast5:
 33 |             read_count = 0
 34 |             for read in fast5.get_reads():
 35 |                 # This input file was created to have 4 reads with 20 samples per read
 36 |                 read_count += 1
 37 |                 raw_data = read.get_raw_data()
 38 |                 self.assertEqual(20, len(raw_data))
 39 |             self.assertEqual(4, read_count)
 40 | 
 41 |     def test_write_vbz_using_api(self):
 42 |         input_data = list(range(5))
 43 |         read_id = "0a1b2c3d"
 44 |         with MultiFast5File(self.generate_temp_filename(), 'w') as fast5:
 45 |             fast5.create_empty_read(read_id, self.run_id)
 46 |             read = fast5.get_read(read_id)
 47 |             read.add_raw_data(input_data, attrs={}, compression=VBZ)
 48 |             raw = read.get_raw_data()
 49 |             # First check the data comes back in an appropriate form
 50 |             self.assertEqual(input_data, list(raw))
 51 |             # Then check the types are as they should be under the hood
 52 |             filters = read.raw_compression_filters
 53 |             self.assertTrue(str(VBZ.compression) in filters)
 54 |             self.assertEqual(VBZ.compression_opts, filters[str(VBZ.compression)])
 55 | 
 56 |     def test_write_vbz_using_api_single_read(self):
 57 |         input_data = list(range(5))
 58 |         read_id = "0a1b2c3d"
 59 |         read_number = 0
 60 |         with Fast5File(self.generate_temp_filename(), 'w') as fast5:
 61 |             fast5.status.read_number_map[read_number] = read_number
 62 |             fast5.status.read_info = [ReadInfo(read_number=read_number, read_id=read_id,
 63 |                                                start_time=0, duration=len(input_data))]
 64 |             fast5.add_raw_data(data=input_data, attrs={}, compression=VBZ)
 65 |             raw = fast5.get_raw_data()
 66 |             # First check the data comes back in an appropriate form
 67 |             self.assertEqual(input_data, list(raw))
 68 | 
 69 |             # Then check the types are as they should be under the hood
 70 |             filters = fast5.raw_compression_filters
 71 |             self.assertTrue(str(VBZ.compression) in filters)
 72 |             self.assertEqual(VBZ.compression_opts, filters[str(VBZ.compression)])
 73 | 
 74 | 
 75 | class TestVbzConvert(TestFast5ApiHelper):
 76 |     run_id = "123abc"
 77 | 
 78 |     def assertCompressed(self, data_path, expected_compression, read_count, file_count):
 79 |         files = set()
 80 |         read_ids = set()
 81 |         for compression, read_id, filepath in check_compression(data_path, False, False, check_all_reads=True):
 82 |             self.assertEqual(expected_compression, compression)
 83 |             read_ids.add(read_id)
 84 |             files.add(filepath)
 85 |         self.assertEqual(read_count, len(read_ids))
 86 |         self.assertEqual(file_count, len(files))
 87 | 
 88 |     def assert_end_reason_attr_is_enum(self, _: str, obj: h5py.HLObject):
 89 |         """
 90 |         Assert that the end_reason attribute is an h5 enumeration
 91 | 
 92 |         Parameters matches signature for h5py.Group.visititems(name, object) used in the
 93 |         recursive search. name is unused.
 94 |         """
 95 |         if "end_reason" in obj.attrs:
 96 |             # Set state variable that end_reason attribute is seen in the given file
 97 |             # This is used to assert that this test doesn't pass due to an absence of a
 98 |             # negative result
 99 |             self.end_reason_seen = True
100 | 
101 |             metadata = obj.attrs.get_id("end_reason").dtype.metadata
102 |             self.assertIsNotNone(
103 |                 metadata,
104 |                 msg="end_reason attribute dtype.metadata is None.  "
105 |                     "This indicates that the enumeration metadata is lost"
106 |             )
107 |             self.assertTrue(
108 |                 "enum" in metadata,
109 |                 msg="end_reason attribute dtype.metadata does not contain enum"
110 |             )
111 | 
112 |     def test_add_read_from_multi(self):
113 |         target_compression = VBZ
114 |         with get_fast5_file(os.path.join(test_data, "multi_read", "batch_0.fast5"), "r") as input_f5, \
115 |                 MultiFast5File(self.generate_temp_filename(), 'w') as output_f5:
116 |             read_id = input_f5.get_read_ids()[0]
117 |             input_read = input_f5.get_read(read_id)
118 | 
119 |             # Input read should be uncompressed on the way in:
120 |             self.assertEqual(check_read_compression(input_read), GZIP)
121 | 
122 |             output_f5.add_existing_read(input_read, target_compression)
123 | 
124 |             output_read = output_f5.get_read(read_id)
125 |             self.assertEqual(check_read_compression(output_read), VBZ)
126 | 
127 |     def test_compress_read_from_single(self):
128 |         with get_fast5_file(os.path.join(test_data, "single_reads", "read0.fast5"), "r") as input_f5, \
129 |                 EmptyFast5(self.generate_temp_filename(), 'w') as output_f5:
130 |             read_id = input_f5.get_read_ids()[0]
131 |             input_read = input_f5.get_read(read_id)
132 | 
133 |             # Input read should be uncompressed on the way in:
134 |             self.assertEqual(check_read_compression(input_read), GZIP)
135 | 
136 |             compress_single_read(output_f5, input_read, target_compression=VBZ)
137 | 
138 |             output_read = output_f5.get_read(read_id)
139 |             self.assertEqual(check_read_compression(output_read), VBZ)
140 | 
141 |     @patch('ont_fast5_api.conversion_tools.compress_fast5.get_progress_bar')
142 |     def test_conversion_script_multi(self, mock_pbar):
143 |         input_folder = os.path.join(test_data, 'multi_read')
144 |         compress_batch(input_folder=input_folder, output_folder=self.save_path, target_compression=VBZ)
145 |         self.assertCompressed(self.save_path, VBZ, read_count=4, file_count=1)
146 | 
147 |     @patch('ont_fast5_api.conversion_tools.compress_fast5.get_progress_bar')
148 |     def test_conversion_script_single(self, mock_pbar):
149 |         input_folder = os.path.join(test_data, 'single_reads')
150 |         compress_batch(input_folder=input_folder, output_folder=self.save_path, target_compression=VBZ)
151 |         self.assertCompressed(self.save_path, VBZ, read_count=4, file_count=4)
152 | 
153 |     def assert_all_files_retain_end_reason_enumeration_metadata(self):
154 |         """
155 |         Assertion that all output files have not lost the end_reason enumeration metadata
156 |         """
157 | 
158 |         for f5_file in get_fast5_file_list(self.save_path, recursive=True):
159 | 
160 |             # Require that the end_reason attribute is seen in the file under test.
161 |             # This is used to assert that this test doesn't pass due to an absence of a
162 |             # negative result
163 |             self.end_reason_seen = False
164 | 
165 |             with h5py.File(f5_file, "r") as fh:
166 |                 # Check the top level item
167 |                 self.assert_end_reason_attr_is_enum("", fh)
168 | 
169 |                 # Recursively search all items in the file
170 |                 fh.visititems(self.assert_end_reason_attr_is_enum)
171 | 
172 |             self.assertTrue(
173 |                 self.end_reason_seen,
174 |                 msg="No end_reason attributes seen in the fast5 file. Something went wrong."
175 |             )
176 | 
177 |     @patch('ont_fast5_api.conversion_tools.compress_fast5.get_progress_bar')
178 |     def test_conversion_script_multi_retains_end_reason_enumeration(self, mock_pbar):
179 |         """
180 |         Test that given a collection of multi read fast5 files that undergo compression
181 |         that the end_reason attribute metadata which indicates that it is an enumeration,
182 |         is not lost
183 |         """
184 |         input_folder = os.path.join(test_data, 'multi_read')
185 |         compress_batch(input_folder=input_folder, output_folder=self.save_path,
186 |                        target_compression=VBZ)
187 | 
188 |         self.assert_all_files_retain_end_reason_enumeration_metadata()
189 | 
190 |     @patch('ont_fast5_api.conversion_tools.compress_fast5.get_progress_bar')
191 |     def test_conversion_script_single_retains_end_reason_enumeration(self, mock_pbar):
192 |         """
193 |         Test that given a collection of single read fast5 files that undergo compression
194 |         that the end_reason attribute metadata which indicates that it is an enumeration,
195 |         is not lost
196 |         """
197 | 
198 |         input_folder = os.path.join(test_data, 'single_reads')
199 |         compress_batch(input_folder=input_folder, output_folder=self.save_path,
200 |                        target_compression=VBZ)
201 | 
202 |         self.assert_all_files_retain_end_reason_enumeration_metadata()
203 | 
204 |     @patch('ont_fast5_api.conversion_tools.compress_fast5.get_progress_bar')
205 |     def test_compress_in_place(self, mock_pbar):
206 |         for input_file in os.listdir(os.path.join(test_data, 'single_reads')):
207 |             # We copy file by file as copytree won't work to an existing directory
208 |             shutil.copy(os.path.join(test_data, 'single_reads', input_file), self.save_path)
209 | 
210 |         self.assertCompressed(self.save_path, GZIP, read_count=4, file_count=4)
211 |         in_files = set(os.listdir(self.save_path))
212 |         compress_batch(self.save_path, output_folder=None, target_compression=VBZ, in_place=True)
213 |         self.assertCompressed(self.save_path, VBZ, read_count=4, file_count=4)
214 |         self.assertEqual(in_files, set(os.listdir(self.save_path)))
215 | 
216 | 
217 | class TestSanitise(TestFast5ApiHelper):
218 |         
219 |     @staticmethod
220 |     def list_groups(fname, single_multi='multi'):
221 |         split_index = {
222 |             'multi': 1, 'single': 0}
223 |         all_groups = list()
224 |         filtered_groups = list()
225 |         def _add_group(name):
226 |             all_groups.append(name)
227 |             try:
228 |                 subgroup = name.split('/')[split_index[single_multi]]
229 |             except IndexError:
230 |                 # top level
231 |                 filtered_groups.append(name)
232 |             else:
233 |                 if not subgroup in OPTIONAL_READ_GROUPS:
234 |                     filtered_groups.append(name)
235 |         with h5py.File(fname, 'r') as fh:
236 |             fh.visit(_add_group)
237 |         return all_groups, filtered_groups
238 | 
239 |     def _test(self, input_file, output_file, single_or_multi):
240 |         orig_all_groups, orig_filtered_groups = self.list_groups(input_file, single_or_multi)
241 |         new_all_groups, new_filtered_groups = self.list_groups(output_file, single_or_multi)
242 |         
243 |         self.assertNotEqual(orig_all_groups, orig_filtered_groups)
244 |         self.assertEqual(orig_filtered_groups, new_filtered_groups)
245 |         self.assertEqual(new_all_groups, new_filtered_groups)
246 | 
247 |     def test_multi_to_multi(self):
248 |         input_file = os.path.join(test_data, "multi_read_analyses", "batch_0.fast5")
249 |         output_file = self.generate_temp_filename()
250 |         compress_file(input_file, output_file, VBZ, sanitize=True)
251 |         self._test(input_file, output_file, 'multi')
252 | 
253 |     def test_single_to_multi(self):
254 |         input_file = os.path.join(test_data, "single_read_analyses", "read.fast5")
255 |         output_file = self.generate_temp_filename()
256 |         with Fast5File(input_file, 'r') as input_f5, \
257 |                 EmptyFast5(output_file, 'a') as output_f5:
258 |             compress_single_read(output_f5, input_f5, VBZ, sanitize=True)
259 |         self._test(input_file, output_file, 'single')
260 | 
261 | 


--------------------------------------------------------------------------------
/test/test_compression_settings.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from unittest.mock import patch
 3 | import os
 4 | 
 5 | from ont_fast5_api.compression_settings import register_plugin
 6 | 
 7 | class TestCheckCompression(unittest.TestCase):
 8 | 
 9 |     def test_register_plugin_no_prepend(self):
10 |         # GIVEN a version of h5py that has the h5pl module
11 |         with patch('h5py.h5pl', create=True) as mock:
12 | 
13 |             # WHEN h5py.h5pl doesn't have the prepend method
14 |             del mock.prepend
15 |             # and the hdf5 plugin path variable hasn't been set
16 |             if 'HDF5_PLUGIN_PATH' in os.environ:
17 |                 del os.environ['HDF5_PLUGIN_PATH']
18 |             self.assertTrue('HDF5_PLUGIN_PATH' not in os.environ)
19 | 
20 |             # THEN when we try and register the vbz plugin we fall back to setting
21 |             # HDF5_PLUGIN_PATH (because we can't use prepend)
22 |             plugin_path = register_plugin()
23 |             self.assertTrue(os.environ['HDF5_PLUGIN_PATH'] == plugin_path)
24 |         
25 |             
26 | 


--------------------------------------------------------------------------------
/test/test_data_sanitisation.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import os
  3 | from numpy import array, ndarray, dtype
  4 | 
  5 | from ont_fast5_api.data_sanitisation import _sanitize_data_for_reading, _sanitize_data_for_writing, _clean
  6 | from ont_fast5_api.multi_fast5 import MultiFast5File
  7 | from test.helpers import TestFast5ApiHelper, test_data
  8 | 
  9 | 
 10 | class TestDataSanitisation(TestFast5ApiHelper):
 11 | 
 12 |     def test__clean(self):
 13 |         self.assertEqual(_clean(1), 1)
 14 |         self.assertEqual(_clean(b''), '')
 15 |         self.assertEqual(_clean(''), '')
 16 |         self.assertEqual(_clean('str'), 'str')
 17 |         self.assertEqual(_clean(b'str'), 'str')
 18 |         self.assertTrue(isinstance(_clean('str'), str))
 19 | 
 20 |         # _clean should convert byte strings into utf-8 ones
 21 |         test_str = array(b'Hello!', dtype=bytes)
 22 |         self.assertEqual(type(test_str), ndarray)
 23 |         self.assertEqual(_clean(test_str), 'Hello!')
 24 | 
 25 |         # _clean shouldn't do anything to python strings
 26 |         test_str = array('Hello!', dtype=str)
 27 |         self.assertEqual(type(test_str), ndarray)
 28 |         self.assertEqual(_clean(test_str), test_str)
 29 | 
 30 |         self.assertEqual(_clean(array([1, 2, 3])), [1, 2, 3])
 31 | 
 32 |     def test__sanitize_data(self):
 33 |         # We expect conversion from utf8 to bytestrings and vice-versa
 34 |         test_string = 'Avast'
 35 |         self.assertEqual(test_string,
 36 |                          _sanitize_data_for_reading(test_string.encode()))
 37 |         self.assertEqual(test_string.encode(),
 38 |                          _sanitize_data_for_writing(test_string))
 39 | 
 40 |         test_array = array('Arr', dtype=str)
 41 |         self.assertEqual(test_array,
 42 |                          _sanitize_data_for_reading(np.char.encode(test_array)))
 43 |         self.assertEqual(np.char.encode(test_array),
 44 |                          _sanitize_data_for_writing(test_array))
 45 | 
 46 |         test_ndarray_utf8 = array([('Narr', 0)],
 47 |                                   dtype=[('string', (str, 4)),
 48 |                                          ('int', int)])
 49 |         test_ndarray_bytes = array([(b'Narr', 0)],
 50 |                                    dtype=[('string', (bytes, 4)),
 51 |                                           ('int', int)])
 52 |         self.assertEqual(test_ndarray_utf8,
 53 |                          _sanitize_data_for_reading(test_ndarray_bytes))
 54 |         self.assertEqual(test_ndarray_bytes,
 55 |                          _sanitize_data_for_writing(test_ndarray_utf8))
 56 | 
 57 |     def test__sanitize_data_emptystrings(self):
 58 |         test_ndarray_utf8 = array([('', '')], dtype=[('empty', str),
 59 |                                                      ('string', str)])
 60 |         test_ndarray_bytes = array([('', '')], dtype=[('empty', bytes),
 61 |                                                       ('string', bytes)])
 62 | 
 63 |         str_array = _sanitize_data_for_reading(test_ndarray_bytes)
 64 |         byte_array = _sanitize_data_for_writing(test_ndarray_utf8)
 65 |         self.assertTrue(np.array_equal(test_ndarray_utf8, str_array))
 66 |         self.assertTrue(np.array_equal(test_ndarray_bytes, byte_array))
 67 | 
 68 |     def test_sanitise_array_empty_string(self):
 69 |         input_list = [('', 1, 4.8), ('', 2, 7.6)]
 70 |         input_types = [('base', str), ('length', 'i4'), ('score', 'f8')]
 71 |         input_array = array(input_list, dtype=input_types)
 72 |         output_array = _sanitize_data_for_writing(input_array)
 73 | 
 74 |         expected_types = [('base', 'S'), ('length', 'i4'), ('score', 'f8')]
 75 |         self.assertEqual(expected_types, output_array.dtype)
 76 | 
 77 |         roundtrip_array = _sanitize_data_for_reading(output_array)
 78 |         self.assertTrue(np.array_equal(input_array, roundtrip_array))
 79 | 
 80 |         # Check things works with numpy.recarrays too
 81 |         input_rec = input_array.view(np.recarray)
 82 |         output_recarray = _sanitize_data_for_writing(input_rec)
 83 |         self.assertEqual(expected_types, output_recarray.dtype)
 84 | 
 85 |     def test_real_example_file(self):
 86 |         with MultiFast5File(os.path.join(test_data, 'rle_basecall_table', 'rle_example.fast5'), 'r') as mf5:
 87 |             for read in mf5.get_reads():
 88 |                 actual_data = read.handle['Analyses/Basecall_1D_000/BaseCalled_template/RunlengthBasecall']
 89 |                 expected_dtypes = [('base', '<U1'),  # After cleaning this is a unicode string
 90 |                                    ('scale', '<f4'),
 91 |                                    ('shape', '<f4'),
 92 |                                    ('weight', '<f4'),
 93 |                                    ('index', '<u4'),
 94 |                                    ('runlength', '<u4')]
 95 | 
 96 |                 for field, expected_type in expected_dtypes:
 97 |                     if field != 'base':
 98 |                         self.assertEqual(dtype(expected_type), actual_data[field].dtype)
 99 |                     else:
100 |                         # Before cleaning the 'base' column is of type byte-string length=1
101 |                         self.assertEqual(dtype('|S1'), actual_data[field].dtype)
102 | 
103 |                 clean_data = _sanitize_data_for_reading(actual_data)
104 |                 self.assertEqual(dtype(expected_dtypes), clean_data.dtype)
105 | 


--------------------------------------------------------------------------------
/test/test_demux_fast5.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | from test.helpers import TestFast5ApiHelper, test_data
 3 | from unittest.mock import patch
 4 | from ont_fast5_api.conversion_tools.demux_fast5 import Fast5Demux
 5 | from ont_fast5_api.multi_fast5 import MultiFast5File
 6 | 
 7 | 
 8 | class TestDemuxFast5(TestFast5ApiHelper):
 9 |     multi_fast5 = Path(test_data) / "multi_read" / "batch_0.fast5"
10 |     summary = Path(test_data) / "summaries" / "two_barcode_summary.txt"
11 |     barcode01 = {"fe85b517-62ee-4a33-8767-41cab5d5ab39", "fe8a3026-d1f4-46b3-8daa-e610f27acde1"}
12 |     barcode02 = {"fe9374ee-b86a-4ca4-81dc-ac06e3297728", "fe849dd3-63bc-4044-8910-14e1686273bb"}
13 |     barcodes = (barcode01, barcode02)
14 | 
15 |     @patch('ont_fast5_api.conversion_tools.demux_fast5.logging')
16 |     @patch('ont_fast5_api.conversion_tools.conversion_utils.ProgressBar')
17 |     def test_demux_1t(self, mock_pbar, mock_logger):
18 |         # given 4 read multi fast5 file and a summary, bin it in two barcode directories
19 |         output_dir = Path(self.save_path) / "1t"
20 |         output_dir.mkdir()
21 |         demux = Fast5Demux(input_dir=self.multi_fast5, output_dir=output_dir, summary_file=self.summary,
22 |                            demultiplex_column="barcode_arrangement",threads=1)
23 |         demux.run_batch()
24 |         self.check_output(output_dir)
25 | 
26 |     @patch('ont_fast5_api.conversion_tools.demux_fast5.logging')
27 |     @patch('ont_fast5_api.conversion_tools.conversion_utils.ProgressBar')
28 |     def test_demux_8t(self, mock_pbar, mock_logger):
29 |         # given 4 read multi fast5 file and a summary, bin it in two barcode directories
30 |         output_dir = Path(self.save_path) / "8t"
31 |         output_dir.mkdir()
32 |         demux = Fast5Demux(input_dir=self.multi_fast5, output_dir=output_dir, summary_file=self.summary,
33 |                            demultiplex_column="barcode_arrangement",threads=8)
34 |         demux.workers_setup()
35 |         # even with 8 threads allocated, only max 2 can be used
36 |         self.assertEqual(demux.max_threads, 2)
37 |         demux.run_batch()
38 |         self.check_output(output_dir)
39 | 
40 |     def check_output(self, result_path):
41 |         output_dir1 = result_path / "barcode01"
42 |         output_dir2 = result_path / "barcode02"
43 | 
44 |         for directory, barcodes in zip((output_dir1, output_dir2), self.barcodes):
45 |             self.assertTrue(directory.exists())
46 |             self.assertTrue(directory.is_dir())
47 |             batch_file = directory / "batch0.fast5"
48 |             self.assertTrue(batch_file.exists())
49 |             self.assertTrue(batch_file.is_file())
50 |             with MultiFast5File(batch_file, 'r') as fast5_in:
51 |                 read_ids = set(fast5_in.get_read_ids())
52 |                 self.assertEqual(read_ids, barcodes)
53 |             summary_file = directory / "filename_mapping.txt"
54 |             self.assertTrue(summary_file.exists())
55 |             self.assertTrue(summary_file.is_file())
56 | 
57 |     @patch('ont_fast5_api.conversion_tools.demux_fast5.logging')
58 |     @patch('ont_fast5_api.conversion_tools.conversion_utils.ProgressBar')
59 |     def test_parse_summary(self, mock_pbar, mock_logger):
60 |         # create a summary file with standard column names
61 |         summary_file = Path(self.generate_temp_filename())
62 |         truth = {"barcode01": self.barcode01, "barcode02": self.barcode02}
63 |         with open(summary_file, 'w') as summ:
64 |             header = "read_id\tbarcode_arrangement\n"
65 |             summ.write(header)
66 |             for barcode, read_ids in truth.items():
67 |                 for read_id in read_ids:
68 |                     line = read_id + "\t" + barcode + "\n"
69 |                     summ.write(line)
70 | 
71 |         demux = Fast5Demux(input_dir=self.multi_fast5, output_dir=Path(self.save_path), summary_file=summary_file,
72 |                            demultiplex_column="barcode_arrangement")
73 |         demux.workers_setup()
74 | 
75 |         self.assertEqual(demux.read_sets, truth)
76 | 
77 |         # create a summary file with non-standard column names
78 |         truth = {"genome1": self.barcode01, "genome2": self.barcode02}
79 |         summary_file = Path(self.generate_temp_filename())
80 |         with open(summary_file, 'w') as summ:
81 |             header = "genome\tread_name\n"
82 |             summ.write(header)
83 |             for genome, read_ids in truth.items():
84 |                 for read_id in read_ids:
85 |                     line = genome + "\t" + read_id + "\n"
86 |                     summ.write(line)
87 | 
88 |         demux = Fast5Demux(input_dir=self.multi_fast5, output_dir=Path(self.save_path), summary_file=summary_file,
89 |                            demultiplex_column="genome", read_id_column="read_name")
90 |         demux.workers_setup()
91 | 
92 |         self.assertEqual(demux.read_sets, truth)
93 | 


--------------------------------------------------------------------------------
/test/test_event_detection_tools.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | from ont_fast5_api.fast5_file import Fast5File
 4 | from ont_fast5_api.analysis_tools.event_detection import EventDetectionTools
 5 | from test.helpers import TestFast5ApiHelper, test_data
 6 | 
 7 | 
 8 | class TestEventDetectionTools(TestFast5ApiHelper):
 9 | 
10 |     def test_001_read_events(self):
11 |         # Check that it is recognized properly.
12 |         fname = os.path.join(test_data, 'read_file_v1.0_single.fast5')
13 |         with EventDetectionTools(fname, mode='r', ) as fh:
14 |             self.assertTrue(fh.has_event_data)
15 |             self.assertTrue(fh.has_event_data(read_number=59))
16 |             self.assertEqual('EventDetection_000', fh.group_name)
17 |             data, attrs = fh.get_event_data()
18 |             self.assertDictEqual({'read_number': 59,
19 |                                   'strand_id': 60,
20 |                                   'start_mux': 1,
21 |                                   'end_mux': 1,
22 |                                   'start_time': 32463855,
23 |                                   'duration': 729468}, attrs)
24 |             self.assertEqual(7875, data.size)
25 |             self.assertEqual(118, data[0]['length'])
26 |             data, attrs = fh.get_event_data(time_in_seconds=True)
27 |             self.assertEqual(0.0236, data[0]['length'])
28 | 
29 |     def test_002_write_events(self):
30 |         fname = os.path.join(self.save_path, 'test.fast5')
31 |         with Fast5File(fname, 'w') as fh:
32 |             fh.add_channel_info({'channel_number': 1, 'sampling_rate': 4000})
33 |             fh.add_read(12, 'unique_snowflake', 12345, 111, 0, 120.75)
34 |             with EventDetectionTools(fh, group_name='EventDetection_000',
35 |                                      meta={'name': 'test', 'version': '0.1.0'}) as evdet:
36 |                 data = np.zeros(100, dtype=[('start', int), ('length', int), ('mean', float), ('stdv', float)])
37 |                 read_attrs = {'read_number': 12}
38 |                 evdet.set_event_data(data, read_attrs)
39 |         with Fast5File(fname, 'r') as fh:
40 |             self.assertEqual(1, len(fh.status.read_info))
41 |             read_info = fh.status.read_info[0]
42 |             self.assertEqual(12, read_info.read_number)
43 |             group = fh.get_latest_analysis('EventDetection')
44 |             self.assertEqual('EventDetection_000', group)
45 |             with EventDetectionTools(fh) as evdet:
46 |                 self.assertTrue(evdet.has_event_data())
47 |                 data, attrs = evdet.get_event_data()
48 |                 self.assertDictEqual({u'read_number': 12,
49 |                                       u'read_id': 'unique_snowflake',
50 |                                       u'start_time': 12345,
51 |                                       u'duration': 111,
52 |                                       u'start_mux': 0,
53 |                                       u'median_before': 120.75}, attrs)
54 |                 self.assertEqual(100, data.size)
55 | 


--------------------------------------------------------------------------------
/test/test_fast5_conversion_utils.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | from ont_fast5_api.fast5_read import Fast5Read
  3 | from test.helpers import test_data, TestFast5ApiHelper
  4 | 
  5 | from ont_fast5_api.conversion_tools.conversion_utils import yield_fast5_files, yield_fast5_reads
  6 | 
  7 | class TestFast5ConversionUtilities(TestFast5ApiHelper):
  8 |     """
  9 |     Test the convenience functions yield_fast5_files and yield_fast5_reads
 10 |     """
 11 | 
 12 |     def setUp(self) -> None:
 13 |         super().setUp()
 14 | 
 15 |         # Known good read_ids from test_data/multi_read/batch_0.fast5
 16 |         self.read_id_set = {'fe849dd3-63bc-4044-8910-14e1686273bb',
 17 |                             'fe85b517-62ee-4a33-8767-41cab5d5ab39'}
 18 |         self.read_id_list = ['fe849dd3-63bc-4044-8910-14e1686273bb',
 19 |                             'fe85b517-62ee-4a33-8767-41cab5d5ab39']
 20 |         self.fast5_path = test_data + "/multi_read/batch_0.fast5"
 21 | 
 22 | 
 23 |     def test_yield_fast5_files_from_fast5_file(self):
 24 |         f5_gen = yield_fast5_files(self.fast5_path, recursive=False)
 25 |         f5_path = next(f5_gen)
 26 |         self.assertTrue(Path(f5_path).is_file(), "Filepath is not a file")
 27 |         self.assertTrue(f5_path.endswith('.fast5'), "Filepath does not end with fast5 extension")
 28 |         self.assertTrue(Path(f5_path).absolute() == Path(self.fast5_path).absolute(),
 29 |                         "Direct path did not return itself")
 30 | 
 31 |     def test_yield_fast5_files_from_dir(self):
 32 |         f5_gen = yield_fast5_files(test_data, recursive=False)
 33 | 
 34 |         for f5_path in f5_gen:
 35 |             self.assertTrue(Path(f5_path).is_file(), "Filepath is not a file")
 36 |             self.assertTrue(f5_path.endswith('.fast5'), "Filepath does not end with fast5 extension")
 37 | 
 38 |     def test_yield_fast5_reads_from_fast5_file(self):
 39 |         f5_read_gen = yield_fast5_reads(self.fast5_path, recursive=False)
 40 |         read_id, read_data = next(f5_read_gen)
 41 |         self.assertTrue(read_id is not None, "read_id is None")
 42 |         self.assertTrue(isinstance(read_data, Fast5Read), "Return is not Fast5Read instance")
 43 | 
 44 |     def test_yield_fast5_reads_from_dir(self):
 45 |         f5_read_gen = yield_fast5_reads(test_data, recursive=False)
 46 |         read_id, read_data = next(f5_read_gen)
 47 |         self.assertTrue(read_id is not None, "read_id is None")
 48 |         self.assertTrue(isinstance(read_data, Fast5Read), "Return is not Fast5Read instance")
 49 | 
 50 |     def test_yield_fast5_reads_with_set(self):
 51 |         f5_read_gen = yield_fast5_reads(self.fast5_path,
 52 |                                         recursive=False,
 53 |                                         read_ids=self.read_id_set)
 54 |         f5_reads = list(f5_read_gen)
 55 |         self.assertTrue(len(f5_reads) == len(self.read_id_set))
 56 | 
 57 |         for read_id, read_data in f5_reads:
 58 |             self.assertTrue(read_id in self.read_id_set, "A read_id is not a member of read_ids")
 59 |             self.assertTrue(isinstance(read_data, Fast5Read), "Return is not Fast5Read instance")
 60 | 
 61 |     def test_yield_fast5_reads_with_list(self):
 62 |         f5_read_gen = yield_fast5_reads(self.fast5_path,
 63 |                                         recursive=False,
 64 |                                         read_ids=self.read_id_set)
 65 |         f5_reads = list(f5_read_gen)
 66 |         self.assertTrue(len(f5_reads) == len(self.read_id_list))
 67 | 
 68 |         for read_id, read_data in f5_reads:
 69 |             self.assertTrue(read_id in self.read_id_set, "A read_id is not a member of read_id_list")
 70 |             self.assertTrue(isinstance(read_data, Fast5Read), "Return is not Fast5Read instance")
 71 | 
 72 |     def test_yield_fast5_reads_set_versus_list_equality(self):
 73 |         f5_read_gen_by_id_set = yield_fast5_reads(self.fast5_path,
 74 |                                                   recursive=False,
 75 |                                                   read_ids=self.read_id_set)
 76 | 
 77 |         f5_read_gen_by_id_list = yield_fast5_reads(self.fast5_path,
 78 |                                                    recursive=False,
 79 |                                                    read_ids=self.read_id_list)
 80 | 
 81 |         # Consume the generators into sets
 82 |         ids_by_set = set(rid for rid, _ in f5_read_gen_by_id_set)
 83 |         ids_by_list = set(rid for rid, _  in f5_read_gen_by_id_list)
 84 |         self.assertTrue(ids_by_list == ids_by_set, 'Ids differ when using read_id list versus set')
 85 | 
 86 | 
 87 |     def test_yield_fast5_reads_with_empty_set(self):
 88 |         f5_read_gen = yield_fast5_reads(self.fast5_path,
 89 |                                         recursive=False,
 90 |                                         read_ids=set([]))
 91 | 
 92 |         self.assertTrue(len(list(f5_read_gen)) != 0, "Empty read_ids resulted in zero returned reads")
 93 | 
 94 |     def test_yield_fast5_reads_with_garbage_set(self):
 95 |         f5_read_gen = yield_fast5_reads(self.fast5_path,
 96 |                                         recursive=False,
 97 |                                         read_ids={'_g4rbag£_'})
 98 |         f5_reads = list(f5_read_gen)
 99 |         self.assertTrue(len(f5_reads) == 0, "Garbage read_ids returned non-zero reads")
100 | 
101 |     def test_yield_fast5_reads_type_error(self):
102 |         with self.assertRaisesRegex(TypeError, 'read_ids'):
103 |             f5_read_gen = yield_fast5_reads(self.fast5_path,
104 |                                             recursive=False,
105 |                                             read_ids=int(1))
106 |             next(f5_read_gen)


--------------------------------------------------------------------------------
/test/test_fast5_converter.py:
--------------------------------------------------------------------------------
 1 | from unittest.mock import patch
 2 | 
 3 | import os
 4 | import h5py
 5 | import numpy
 6 | 
 7 | from ont_fast5_api.conversion_tools.multi_to_single_fast5 import convert_multi_to_single, try_multi_to_single_conversion
 8 | from ont_fast5_api.conversion_tools.single_to_multi_fast5 import batch_convert_single_to_multi, get_fast5_file_list, \
 9 |     create_multi_read_file
10 | from ont_fast5_api.multi_fast5 import MultiFast5File
11 | from ont_fast5_api.fast5_file import Fast5FileTypeError, Fast5File
12 | from test.helpers import TestFast5ApiHelper, test_data, disable_logging
13 | 
14 | 
15 | class TestFast5Converter(TestFast5ApiHelper):
16 | 
17 |     @patch('ont_fast5_api.conversion_tools.single_to_multi_fast5.get_progress_bar')
18 |     def test_single_to_multi(self, mock_pbar):
19 |         input_folder = os.path.join(test_data, "single_reads")
20 |         batch_size = 3
21 |         file_count = len(os.listdir(input_folder))
22 |         batch_convert_single_to_multi(input_folder, self.save_path, filename_base="batch", batch_size=batch_size,
23 |                                       threads=1, recursive=False, follow_symlinks=False, target_compression=None)
24 | 
25 |         expected_output_reads = {"filename_mapping.txt": 0,
26 |                                  "batch_0.fast5": batch_size,
27 |                                  "batch_1.fast5": file_count % batch_size}
28 |         self.assertEqual(sorted(os.listdir(self.save_path)), sorted(list(expected_output_reads.keys())))
29 |         for file, read_count in expected_output_reads.items():
30 |             if read_count > 0:
31 |                 with h5py.File(os.path.join(self.save_path, file), 'r') as f5:
32 |                     self.assertEqual(len(f5), read_count)
33 | 
34 |     def test_multi_to_single(self):
35 |         input_file = os.path.join(test_data, "multi_read", "batch_0.fast5")
36 |         with MultiFast5File(input_file, 'r') as f5:
37 |             read_count = len(f5.handle)
38 |             expected_files = sorted([os.path.join(self.save_path, "{}", i + '.fast5') for i in f5.get_read_ids()])
39 | 
40 |         subfolder = '0'
41 |         convert_multi_to_single(input_file, self.save_path, subfolder)
42 | 
43 |         out_files = sorted(get_fast5_file_list(self.save_path, recursive=True, follow_symlinks=True))
44 |         self.assertEqual(len(out_files), read_count)
45 |         self.assertEqual(out_files, [f.format(subfolder) for f in expected_files])
46 | 
47 |     @disable_logging
48 |     def test_single_to_multi_incorrect_types(self):
49 |         input_files = [os.path.join(test_data, "multi_read", "batch_0.fast5")]
50 |         with self.assertRaises(Fast5FileTypeError):
51 |             create_multi_read_file(input_files, self.generate_temp_filename(), target_compression=None)
52 | 
53 |     def test_multi_to_single_incorrect_types(self):
54 |         input_folder = os.path.join(test_data, "single_reads")
55 |         input_file = os.path.join(input_folder, os.listdir(input_folder)[0])
56 |         with self.assertRaises(Fast5FileTypeError):
57 |             try_multi_to_single_conversion(input_file, self.save_path, subfolder='0')
58 | 
59 |     def test_add_read_to_multi(self):
60 |         with Fast5File(os.path.join(test_data, "single_reads", "read0.fast5"), 'r') as single_fast5, \
61 |                 MultiFast5File(self.generate_temp_filename(), 'w') as multi_out:
62 |             multi_out.add_existing_read(single_fast5)
63 |             expected_raw = single_fast5.get_raw_data()
64 |             actual_raw = multi_out.get_read(single_fast5.get_read_id()).get_raw_data()
65 |             self.assertTrue(numpy.array_equal(actual_raw, expected_raw))
66 | 


--------------------------------------------------------------------------------
/test/test_fast5_interface.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | 
 4 | from ont_fast5_api.fast5_file import Fast5File
 5 | from ont_fast5_api.fast5_interface import get_fast5_file, check_file_type, MULTI_READ, SINGLE_READ
 6 | from ont_fast5_api.multi_fast5 import MultiFast5File
 7 | from test.helpers import test_data
 8 | 
 9 | 
10 | 
11 | class TestFast5Interface(unittest.TestCase):
12 | 
13 |     def test_correct_type(self):
14 |         single_read_path = os.path.join(test_data, "single_reads", "read0.fast5")
15 |         single_read_id = Fast5File(single_read_path).get_read_id()
16 |         with get_fast5_file(single_read_path) as f5:
17 |             self.assertTrue(isinstance(f5, Fast5File))
18 |             self.assertEqual(check_file_type(f5), SINGLE_READ)
19 |             self.assertEqual(len(f5.get_read_ids()), 1)
20 |             self.assertEqual(single_read_id, f5.get_read_ids()[0])
21 |             self.get_raw(f5)
22 | 
23 |         multi_read_path = os.path.join(test_data, "multi_read", "batch_0.fast5")
24 |         with get_fast5_file(multi_read_path) as f5:
25 |             self.assertTrue(isinstance(f5, MultiFast5File))
26 |             self.assertEqual(check_file_type(f5), MULTI_READ)
27 |             self.assertTrue(len(f5.get_read_ids()) >= 1)
28 |             self.get_raw(f5)
29 | 
30 |     def get_raw(self, f5):
31 |         # Test we can get raw data using the same method for single and multi
32 |         raw_data = f5.get_read(f5.get_read_ids()[0]).get_raw_data()
33 |         self.assertTrue(len(raw_data) >= 0)
34 | 


--------------------------------------------------------------------------------
/test/test_fast5_subset.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import numpy
  3 | from unittest.mock import patch
  4 | from pathlib import Path
  5 | 
  6 | from ont_fast5_api.compression_settings import VBZ
  7 | from ont_fast5_api.conversion_tools.fast5_subset import Fast5Filter
  8 | from ont_fast5_api.conversion_tools.conversion_utils import Fast5FilterWorker, extract_selected_reads, read_generator
  9 | from ont_fast5_api.multi_fast5 import MultiFast5File
 10 | from ont_fast5_api.fast5_file import Fast5File
 11 | from test.helpers import TestFast5ApiHelper, test_data
 12 | 
 13 | 
 14 | class TestFast5Subset(TestFast5ApiHelper):
 15 |     input_multif5_path = Path(test_data) / "multi_read" / "batch_0.fast5"
 16 |     read_set = {"fe85b517-62ee-4a33-8767-41cab5d5ab39", "fe9374ee-b86a-4ca4-81dc-ac06e3297728"}
 17 | 
 18 |     def test_read_generator(self):
 19 |         count = 0
 20 |         for read_id, read in read_generator(input_file=self.input_multif5_path, read_set=self.read_set):
 21 |             self.assertIn(read_id, self.read_set)
 22 |             count += 1
 23 | 
 24 |         self.assertEqual(len(self.read_set), count)
 25 | 
 26 |     def _create_read_list_file(self, read_ids):
 27 |         output_path = os.path.join(self.save_path, 'read_list.txt')
 28 |         with open(output_path, 'w') as fh:
 29 |             for read_id in read_ids:
 30 |                 fh.write(read_id + "\n")
 31 |         return output_path
 32 | 
 33 |     @patch('ont_fast5_api.conversion_tools.fast5_subset.logging')
 34 |     @patch('ont_fast5_api.conversion_tools.fast5_subset.get_progress_bar')
 35 |     def test_subset_from_single(self, mock_log, mock_pbar):
 36 |         input_path = os.path.join(test_data, "single_reads")
 37 |         read_list = self._create_read_list_file(self.read_set)
 38 |         f5_filter = Fast5Filter(input_folder=input_path,
 39 |                                 output_folder=self.save_path,
 40 |                                 read_list_file=read_list)
 41 |         f5_filter.run_batch()
 42 | 
 43 |         count = 0
 44 |         with MultiFast5File(os.path.join(self.save_path, 'batch0.fast5'), 'r') as output_f5:
 45 |             for input_file in os.listdir(input_path):
 46 |                 with Fast5File(os.path.join(input_path, input_file), 'r') as input_f5:
 47 |                     read_id = input_f5.get_read_id()
 48 |                     if read_id in self.read_set:
 49 |                         read_in = input_f5.get_read(read_id)
 50 |                         read_out = output_f5.get_read(read_id)
 51 |                         self.assertTrue(numpy.array_equal(read_in.get_raw_data(), read_out.get_raw_data()))
 52 |                         count += 1
 53 |         self.assertEqual(len(self.read_set), count)
 54 | 
 55 |     @patch('ont_fast5_api.conversion_tools.fast5_subset.logging')
 56 |     @patch('ont_fast5_api.conversion_tools.fast5_subset.get_progress_bar')
 57 |     def test_subset_from_multi(self, mock_log, mock_pbar):
 58 |         read_list = self._create_read_list_file(self.read_set)
 59 |         f5_filter = Fast5Filter(input_folder=os.path.dirname(self.input_multif5_path),
 60 |                                 output_folder=self.save_path,
 61 |                                 read_list_file=read_list)
 62 |         f5_filter.run_batch()
 63 |         with MultiFast5File(self.input_multif5_path, 'r') as input_f5, \
 64 |                 MultiFast5File(os.path.join(self.save_path, 'batch0.fast5'), 'r') as output_f5:
 65 |             self.assertEqual(len(self.read_set), len(output_f5.get_read_ids()))
 66 |             for read_id in self.read_set:
 67 |                 read_in = input_f5.get_read(read_id)
 68 |                 read_out = output_f5.get_read(read_id)
 69 |                 self.assertTrue(numpy.array_equal(read_in.get_raw_data(), read_out.get_raw_data()))
 70 | 
 71 |     def test_extract_selected_reads(self):
 72 |         # three test for count below, equaling and above number of read in input file
 73 |         for count in (1, 2, 3):
 74 |             temp_file_name = self.generate_temp_filename()
 75 |             found_reads, output_file, input_file = extract_selected_reads(input_file=self.input_multif5_path,
 76 |                                                                           output_file=temp_file_name,
 77 |                                                                           count=count, read_set=self.read_set)
 78 |             if count < len(self.read_set):
 79 |                 self.assertTrue(found_reads.issubset(self.read_set))
 80 |                 self.assertEqual(input_file, self.input_multif5_path)
 81 |             elif count == len(self.read_set):
 82 |                 self.assertEqual(found_reads, self.read_set)
 83 |                 self.assertEqual(input_file, self.input_multif5_path)
 84 |             elif count >= len(self.read_set):
 85 |                 self.assertEqual(found_reads, self.read_set)
 86 |                 self.assertIsNone(input_file)
 87 | 
 88 |             self.assertEqual(output_file, temp_file_name)
 89 |             # verify that resulting output file is a legal MultiFast5 with desired reads in it
 90 |             with MultiFast5File(output_file) as multi_file:
 91 |                 readlist = multi_file.get_read_ids()
 92 |                 self.assertTrue(set(readlist).issubset(self.read_set))
 93 | 
 94 |     @patch('ont_fast5_api.conversion_tools.conversion_utils.ProgressBar')
 95 |     @patch('ont_fast5_api.conversion_tools.fast5_subset.logging')
 96 |     def test_selector_args_generator(self, mock_pbar, mock_logger):
 97 |         single_reads = os.path.join(test_data, "single_reads")
 98 |         self.assertTrue(os.path.isdir(single_reads), msg=single_reads)
 99 | 
100 |         input_f5s = list(Path(single_reads).glob('*.fast5'))
101 |         batch_size = 1
102 | 
103 |         f = Fast5FilterWorker(
104 |             input_file_list=input_f5s,
105 |             output_dir=Path(self.save_path),
106 |             read_set=self.read_set,
107 |             batch_size=batch_size,
108 |             filename_base="batch",
109 |             target_compression=VBZ,
110 |             progressbar=mock_pbar,
111 |             logger=mock_logger
112 |         )
113 | 
114 |         args_combos = list(f._args_generator())
115 |         # there should be two tuples of arguments
116 |         self.assertEqual(len(args_combos), len(self.read_set) / batch_size)
117 | 
118 |         num_files_queued = len(f.input_f5s) # should be 0
119 |         self.assertEqual(num_files_queued, (len(input_f5s) - len(args_combos)), msg=f.input_f5s)
120 |         self.assertEqual(len(f.available_out_files), 0)
121 | 
122 |         # "exhaust" an input file and put output file back on queue
123 |         input_file, output_file, reads, count, compression = args_combos[0]
124 |         f._update_file_lists(reads={}, in_file=None, out_file=output_file)
125 |         self.assertEqual(len(f.input_f5s), num_files_queued)
126 |         self.assertEqual(len(f.available_out_files), 1)
127 |         self.assertEqual(compression, VBZ)
128 | 
129 |         # this results in another args tuple generated
130 |         new_args_combos = list(f._args_generator())
131 |         self.assertEqual(len(new_args_combos), 1, msg=len(new_args_combos))


--------------------------------------------------------------------------------
/test/test_hardlink_metadata.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from ont_fast5_api.compression_settings import VBZ
 4 | from ont_fast5_api.conversion_tools.compress_fast5 import compress_file
 5 | from ont_fast5_api.conversion_tools.conversion_utils import extract_selected_reads
 6 | from ont_fast5_api.fast5_interface import get_fast5_file
 7 | from ont_fast5_api.multi_fast5 import MultiFast5File
 8 | from ont_fast5_api.static_data import HARDLINK_GROUPS
 9 | from test.helpers import TestFast5ApiHelper, test_data
10 | 
11 | 
12 | class TestHardlinkMetaData(TestFast5ApiHelper):
13 |     read_subset = {'00031f3e-415c-4ab5-9c16-fb6fe45ff519',
14 |                    "000c0b4e-46c2-4fb5-9b17-d7031eefb975",
15 |                    '000ebd63-3e1a-4499-9ded-26af3225a022',
16 |                    '002ad0e4-c6bb-4eff-a30f-5fec01475ab8',
17 |                    '0059d270-3238-4413-b38b-f588e28326df'}
18 | 
19 |     def test_create_read(self):
20 |         input_path = os.path.join(test_data, 'hardlink', 'unlinked', 'batch0.fast5')
21 |         output_path = self.generate_temp_filename()
22 |         compress_file(input_path, output_path, target_compression=VBZ)
23 |         new_read_id = "123456789abcdef"
24 |         with MultiFast5File(output_path, 'a') as f5:
25 |             # Test we can hardlink to existing metadata when creating an new empty read
26 |             run_id = list(f5.run_id_map.keys())[0]
27 |             master_read_id = f5.run_id_map[run_id]
28 |             f5.create_empty_read(new_read_id, run_id)
29 |             for group in HARDLINK_GROUPS:
30 |                 self.assertTrue(self.is_read_hardlinked(f5, new_read_id, master_read_id, group))
31 | 
32 |             # Test we don't explode if there is no metadata
33 |             f5.create_empty_read(new_read_id[::-1], "not an existing run_id")
34 | 
35 |     def test_hardlink_multi_compression(self):
36 |         input_path = os.path.join(test_data, 'hardlink', 'unlinked', 'batch0.fast5')
37 |         output_path = self.generate_temp_filename()
38 | 
39 |         self.assertFalse(self.is_file_hardlinked(input_path))
40 |         compress_file(input_path, output_path, target_compression=VBZ)
41 |         self.assertTrue(self.is_file_hardlinked(output_path))
42 | 
43 |     def test_hardlink_subset(self):
44 |         input_path = os.path.join(test_data, 'hardlink', 'unlinked', 'batch0.fast5')
45 |         output_path = self.generate_temp_filename()
46 | 
47 |         self.assertFalse(self.is_file_hardlinked(input_path))
48 |         extract_selected_reads(input_path, output_path, self.read_subset, count=len(self.read_subset))
49 |         self.assertTrue(self.is_file_hardlinked(output_path))
50 | 
51 |     def test_hardlink_subset_single_reads(self):
52 |         input_path = os.path.join(test_data, 'hardlink', 'single_reads')
53 |         output_path = self.generate_temp_filename()
54 | 
55 |         for single_read_file in os.listdir(input_path):
56 |             extract_selected_reads(os.path.join(input_path, single_read_file), output_path, self.read_subset, count=1)
57 |         self.assertTrue(self.is_file_hardlinked(output_path))
58 | 
59 |     def test_hardlink_single_to_multi(self):
60 |         input_folder = os.path.join(test_data, 'hardlink', 'single_reads')
61 |         input_files = [os.path.join(input_folder, file) for file in os.listdir(input_folder)]
62 |         output_path = self.generate_temp_filename()
63 | 
64 |         with MultiFast5File(output_path, 'a') as multi_f5:
65 |             for input_file in input_files:
66 |                 with get_fast5_file(input_file, 'r') as f5_file:
67 |                     for read in f5_file.get_reads():
68 |                         multi_f5.add_existing_read(read)
69 | 
70 |         with MultiFast5File(output_path, 'r') as multi_f5:
71 |             self.assertEqual(len(input_files), len(multi_f5.get_read_ids()))
72 |         self.assertTrue(self.is_file_hardlinked(output_path))
73 | 
74 |     def is_file_hardlinked(self, input_path):
75 |         file_hardlinked = True
76 |         with MultiFast5File(input_path, 'r') as f5_file:
77 |             for read in f5_file.get_reads():
78 |                 master_read_id = f5_file.run_id_map[read.get_run_id()]
79 |                 for group in HARDLINK_GROUPS:
80 |                     file_hardlinked &= self.is_read_hardlinked(f5_file, read.read_id, master_read_id, group)
81 |         return file_hardlinked
82 | 
83 |     def is_read_hardlinked(self, f5_handle, read_id1, read_id2, group):
84 |         if read_id1 == read_id2:
85 |             return True
86 |         group1 = f5_handle.get_read(read_id1).handle[group]
87 |         group2 = f5_handle.get_read(read_id2).handle[group]
88 |         return group1 == group2
89 | 


--------------------------------------------------------------------------------
/test/test_multi_fast5.py:
--------------------------------------------------------------------------------
  1 | import numpy
  2 | import os
  3 | import random
  4 | 
  5 | from ont_fast5_api.fast5_file import Fast5File
  6 | from ont_fast5_api.fast5_read import Fast5Read
  7 | from ont_fast5_api.multi_fast5 import MultiFast5File
  8 | from test.helpers import TestFast5ApiHelper
  9 | 
 10 | hexdigits = "0123456789abcdef"
 11 | run_id = "123abc"
 12 | 
 13 | 
 14 | class TestMultiFast5(TestFast5ApiHelper):
 15 | 
 16 |     def create_multi_file(self, read_ids):
 17 |         filename = self.generate_temp_filename()
 18 |         # driver=None is the default, but adding this in here makes sure we
 19 |         # preserve the constructor argument.
 20 |         with MultiFast5File(filename, 'w', driver=None) as multi_f5:
 21 |             for read_id in read_ids:
 22 |                 multi_f5.create_empty_read(read_id, run_id)
 23 |         return filename
 24 | 
 25 |     def test_read_interface(self):
 26 |         read_ids = generate_read_ids(6)
 27 |         f5_file = self.create_multi_file(read_ids)
 28 | 
 29 |         with MultiFast5File(f5_file, 'a') as multi_f5:
 30 |             # Check we have the read_ids we expect
 31 |             self.assertEqual(sorted(read_ids), sorted(multi_f5.get_read_ids()))
 32 | 
 33 |             # Try and add another read with the same read_id and expect error
 34 |             with self.assertRaises(ValueError):
 35 |                 multi_f5.create_empty_read(read_ids[0], run_id)
 36 | 
 37 |             # Test we can get a read from the file and it has the interface we expect
 38 |             read_0 = multi_f5.get_read(read_ids[0])
 39 |             self.assertTrue(isinstance(read_0, Fast5Read))
 40 | 
 41 |             # Test we cannot get a read which doesn't exit
 42 |             with self.assertRaises(KeyError):
 43 |                 multi_f5.get_read("0123")
 44 | 
 45 |     def test_raw_data(self):
 46 |         f5_file = self.create_multi_file(generate_read_ids(4))
 47 |         data = list(range(10))
 48 |         raw_attrs = {
 49 |             "duration": 1,
 50 |             "median_before": 2.5,
 51 |             "read_id": "abcd",
 52 |             "read_number": 8,
 53 |             "start_mux": 2,
 54 |             "start_time": 99
 55 |         }
 56 |         with MultiFast5File(f5_file, 'a') as multi_f5:
 57 |             read0 = multi_f5.get_read(multi_f5.get_read_ids()[0])
 58 |             read0.add_raw_data(data, attrs=raw_attrs)
 59 |             output_data = read0.get_raw_data()
 60 |             numpy.testing.assert_array_equal(output_data, data)
 61 | 
 62 |     def test_channel_info(self):
 63 |         f5_file = self.create_multi_file(generate_read_ids(4))
 64 |         channel_info = {
 65 |             "digitisation": 2048,
 66 |             "offset": -119.5,
 67 |             "range": 74.2,
 68 |             "sampling_rate": 4000,
 69 |             "channel_number": "72"
 70 |         }
 71 |         # Fast5File explicitly casts the channel number on reading
 72 |         expected_out = channel_info.copy()
 73 |         expected_out['channel_number'] = int(channel_info['channel_number'])
 74 |         with MultiFast5File(f5_file, 'a') as multi_f5:
 75 |             read0 = multi_f5.get_read(multi_f5.get_read_ids()[0])
 76 |             read0.add_channel_info(channel_info)
 77 |             output_data = read0.get_channel_info()
 78 |             self.assertEqual(output_data, expected_out)
 79 | 
 80 |     def test_tracking_id(self):
 81 |         f5_file = self.create_multi_file(generate_read_ids(4))
 82 |         tracking_id = {
 83 |             "asic_id_eeprom": "some string",
 84 |             "device_id": "some string",
 85 |             "exp_script_name": "some string",
 86 |             "exp_script_purpose": "some string",
 87 |             "exp_start_time": "some string",
 88 |             "flow_cell_id": "some string",
 89 |             "hostname": "some string",
 90 |             "protocol_run_id": "some string",
 91 |             "protocols_version": "some string",
 92 |             "run_id": "some string",
 93 |             "version": "some string",
 94 |         }
 95 | 
 96 |         with MultiFast5File(f5_file, 'a') as multi_f5:
 97 |             read0 = multi_f5.get_read(multi_f5.get_read_ids()[0])
 98 |             read0.add_tracking_id(tracking_id)
 99 |             output_data = read0.get_tracking_id()
100 |             self.assertEqual(output_data, tracking_id)
101 | 
102 |     def test_add_analysis(self):
103 |         f5_file = self.create_multi_file(generate_read_ids(4))
104 |         group = "Test"
105 |         component = "test_component"
106 |         attrs = {"attribute": 1}
107 | 
108 |         # Fast5File.add_analysis includes the component name in the analysis attributes
109 |         expected_attributes = attrs.copy()
110 |         expected_attributes['component'] = component
111 |         with MultiFast5File(f5_file, 'a') as multi_f5:
112 |             read0 = multi_f5.get_read(multi_f5.get_read_ids()[0])
113 |             self.assertEqual(read0.list_analyses(), [])
114 |             read0.add_analysis(component, group, attrs)
115 |             self.assertEqual(read0.list_analyses(), [(component, group)])
116 |             self.assertEqual(read0.get_analysis_attributes(group), expected_attributes)
117 | 
118 | 
119 | def generate_read_ids(num_ids, id_len=8):
120 |     return ["".join(random.choice(hexdigits) for _ in range(id_len)) for _ in range(num_ids)]
121 | 


--------------------------------------------------------------------------------
/test/test_segmentation_tools.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | 
 4 | from ont_fast5_api.fast5_file import Fast5File
 5 | from ont_fast5_api.analysis_tools.event_detection import EventDetectionTools
 6 | from ont_fast5_api.analysis_tools.segmentation import SegmentationTools
 7 | from test.helpers import TestFast5ApiHelper
 8 | 
 9 | 
10 | class TestSegmentationTools(TestFast5ApiHelper):
11 | 
12 |     def test_001_raw_only(self):
13 |         fname = self.generate_temp_filename()
14 |         with Fast5File(fname, mode='w') as fh:
15 |             fh.add_channel_info({'channel_number': 1,
16 |                                  'sampling_rate': 4000,
17 |                                  'digitisation': 8192,
18 |                                  'range': 819.2,
19 |                                  'offset': 0})
20 |             fh.add_read(12, 'unique_snowflake', 12345, 1000, 0, 120.75)
21 |             raw = np.empty(1000, dtype=np.int16)
22 |             raw[:] = range(1000)
23 |             fh.add_raw_data(raw)
24 |             attrs = {'name': 'test', 'version': 0, 'time_stamp': 'just now'}
25 |             fh.add_analysis('segmentation', 'Segmentation_000', attrs)
26 |             segment_data = {'has_template': 1,
27 |                             'has_complement': 1,
28 |                             'first_sample_template': 10,
29 |                             'duration_template': 470,
30 |                             'first_sample_complement': 520,
31 |                             'duration_complement': 460}
32 |             fh.set_summary_data('Segmentation_000', 'segmentation', segment_data)
33 |             with SegmentationTools(fh, group_name='Segmentation_000') as segment:
34 |                 results = segment.get_results()
35 |                 self.assertDictEqual({'has_template': True,
36 |                                       'has_complement': True,
37 |                                       'first_sample_template': 10,
38 |                                       'duration_template': 470,
39 |                                       'first_sample_complement': 520,
40 |                                       'duration_complement': 460}, results)
41 |                 temp_raw = segment.get_raw_data('template', scale=False)
42 |                 np.testing.assert_array_equal(temp_raw, raw[10:480])
43 |                 comp_raw = segment.get_raw_data('complement', scale=False)
44 |                 np.testing.assert_array_equal(comp_raw, raw[520:980])
45 |                 temp_raw, comp_raw = segment.get_raw_data('both', scale=False)
46 |                 np.testing.assert_array_equal(temp_raw, raw[10:480])
47 |                 np.testing.assert_array_equal(comp_raw, raw[520:980])
48 |                 temp_raw, comp_raw = segment.get_raw_data('both', scale=True)
49 |                 scaled_temp = raw[10:480] * 0.1
50 |                 scaled_comp = raw[520:980] * 0.1
51 |                 np.testing.assert_array_almost_equal(temp_raw, scaled_temp, decimal=5)
52 |                 np.testing.assert_array_almost_equal(comp_raw, scaled_comp, decimal=5)
53 | 
54 |     def test_002_events_only(self):
55 |         fname = self.generate_temp_filename()
56 |         with Fast5File(fname, mode='w') as fh:
57 |             fh.add_channel_info({'channel_number': 1,
58 |                                  'sampling_rate': 4000,
59 |                                  'digitisation': 8192,
60 |                                  'range': 819.2,
61 |                                  'offset': 0})
62 |             fh.add_read(12, 'unique_snowflake', 10000, 1000, 0, 120.75)
63 |             with EventDetectionTools(fh, group_name='EventDetection_000', meta={'name': 'test'}) as evdet:
64 |                 data = np.zeros(100, dtype=[('start', int), ('length', int), ('mean', float), ('stdv', float)])
65 |                 data['start'][2] = 10010
66 |                 data['start'][46] = 10470
67 |                 data['length'][46] = 10
68 |                 data['start'][53] = 10520
69 |                 data['start'][97] = 10960
70 |                 data['length'][97] = 20
71 |                 read_attrs = {'read_number': 12}
72 |                 evdet.set_event_data(data, read_attrs)
73 |             attrs = {'name': 'test', 'version': 0, 'time_stamp': 'just now',
74 |                      'event_detection': 'Analyses/EventDetection_000'}
75 |             fh.add_analysis('segmentation', 'Segmentation_000', attrs)
76 |             segment_data = {'has_template': 1,
77 |                             'has_complement': 1,
78 |                             'start_event_template': 2,
79 |                             'end_event_template': 47,
80 |                             'start_event_complement': 53,
81 |                             'end_event_complement': 98}
82 |             fh.set_summary_data('Segmentation_000', 'segmentation', segment_data)
83 |             with SegmentationTools(fh, group_name='Segmentation_000') as segment:
84 |                 results = segment.get_results()
85 |                 self.assertDictEqual({'has_template': True,
86 |                                       'has_complement': True,
87 |                                       'start_event_template': 2,
88 |                                       'end_event_template': 47,
89 |                                       'start_event_complement': 53,
90 |                                       'end_event_complement': 98,
91 |                                       'first_sample_template': 10,
92 |                                       'duration_template': 470,
93 |                                       'first_sample_complement': 520,
94 |                                       'duration_complement': 460}, results)
95 | 


--------------------------------------------------------------------------------