├── .pydevproject ├── CHANGELOG.md ├── LICENSE.md ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile └── source │ ├── conf.py │ └── index.rst ├── img └── ONT_logo.png ├── ont_fast5_api ├── __init__.py ├── analysis_tools │ ├── __init__.py │ ├── alignment.py │ ├── base_tool.py │ ├── basecall_1d.py │ ├── basecall_2d.py │ ├── event_detection.py │ └── segmentation.py ├── compression_settings.py ├── conversion_tools │ ├── __init__.py │ ├── check_file_compression.py │ ├── compress_fast5.py │ ├── conversion_utils.py │ ├── demux_fast5.py │ ├── fast5_subset.py │ ├── multi_to_single_fast5.py │ └── single_to_multi_fast5.py ├── data_sanitisation.py ├── fast5_file.py ├── fast5_info.py ├── fast5_interface.py ├── fast5_read.py ├── helpers.py ├── multi_fast5.py ├── static_data.py └── vbz_plugin │ ├── libvbz_hdf_plugin.dylib │ ├── libvbz_hdf_plugin_aarch64.so │ ├── libvbz_hdf_plugin_m1.dylib │ ├── libvbz_hdf_plugin_x86_64.so │ └── vbz_hdf_plugin.dll ├── setup.py └── test ├── __init__.py ├── data ├── basecall_2d_file_v1.0.fast5 ├── hardlink │ ├── single_reads │ │ ├── 00031f3e-415c-4ab5-9c16-fb6fe45ff519.fast5 │ │ ├── 000c0b4e-46c2-4fb5-9b17-d7031eefb975.fast5 │ │ ├── 000ebd63-3e1a-4499-9ded-26af3225a022.fast5 │ │ ├── 002ad0e4-c6bb-4eff-a30f-5fec01475ab8.fast5 │ │ ├── 002b0891-03bf-4622-ae66-ae6984890ed4.fast5 │ │ ├── 0048058c-ecb4-4a0f-b283-9a128bd598c5.fast5 │ │ ├── 004a87b0-c9f6-4237-b4d6-466ab979aee2.fast5 │ │ └── 0059d270-3238-4413-b38b-f588e28326df.fast5 │ └── unlinked │ │ └── batch0.fast5 ├── multi_read │ └── batch_0.fast5 ├── multi_read_analyses │ └── batch_0.fast5 ├── read_file_v0.6_raw.fast5 ├── read_file_v0.6_single.fast5 ├── read_file_v1.0_single.fast5 ├── rle_basecall_table │ └── rle_example.fast5 ├── single_read_analyses │ └── read.fast5 ├── single_reads │ ├── fe85b517-62ee-4a33-8767-41cab5d5ab39.fast5 │ ├── fe8a3026-d1f4-46b3-8daa-e610f27acde1.fast5 │ ├── fe9374ee-b86a-4ca4-81dc-ac06e3297728.fast5 │ └── read0.fast5 ├── summaries │ └── two_barcode_summary.txt ├── telemetry_test.fast5 └── vbz_reads │ └── vbz_reads.fast5 ├── helpers.py ├── test_alignment_tools.py ├── test_basecall_1d_tools.py ├── test_basecall_2d_tools.py ├── test_check_compression.py ├── test_compress_fast5.py ├── test_compression_settings.py ├── test_data_sanitisation.py ├── test_demux_fast5.py ├── test_event_detection_tools.py ├── test_fast5_conversion_utils.py ├── test_fast5_converter.py ├── test_fast5_file.py ├── test_fast5_interface.py ├── test_fast5_subset.py ├── test_hardlink_metadata.py ├── test_multi_fast5.py └── test_segmentation_tools.py /.pydevproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | python 2.7 6 | Default 7 | 8 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes and fixes to ont_fast5_api will be documented here 3 | 4 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) 5 | This project (aspires to) adhere to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 
6 | 7 | ## [4.1.3] 8 | 9 | ### Added 10 | - Support for python up to 3.12 11 | 12 | ## [4.1.2] 13 | 14 | ### Added 15 | - Support for h5py>=3.9 16 | ### Changed 17 | - Renamed VBZ compression settings to make it clearer which version is used in production 18 | ### Removed 19 | - Support for python3.6 20 | 21 | ## [4.1.1] 22 | 23 | ### Fixed 24 | - Compatibility with numpy==1.24 unicode type 25 | 26 | ### Changed 27 | - Updated Windows VBZ Plugin dll 28 | 29 | ## [4.1.0] 30 | 31 | ### Added 32 | - Support for fast5_api on macOS-M1 33 | 34 | ## [4.0.2] 35 | 36 | ### Fixed 37 | - Fixed Fast5Read import error 38 | 39 | ## [4.0.1] 40 | 41 | ### Changed 42 | - Fixed unresolved reference in `compress_fast5.py` 43 | - Fixed issue with `compress_fast5.py` not retaining enumeration metadata for the end_reason attribute 44 | - Increased minimum h5py version to 2.10 45 | 46 | ## [4.0.0] 47 | 48 | ### Added 49 | - Script `demux_fast5` for demultiplexing fast5 reads based on column in summary file, e.g. for barcoded experiments 50 | 51 | ### Removed 52 | - Removed deb builds which are no longer supported 53 | - Python3.5 support 54 | 55 | ## [3.3.0] 2021-02-17 56 | 57 | ### Added 58 | - Added `yield_fast5_reads` to conversion_tools. 59 | 60 | ## [3.2.0] 2021-01-28 61 | 62 | ### Changed 63 | - Dropped support for older h5py/numpy versions, min now h5py>=2.8, numpy>=1.16 64 | - fast5_subset now displays errors (but continues processing) when it encounters input fast5 files it can't read. 65 | 66 | ### Added 67 | - Add support for explicitly specifying file drivers when loading 68 | multi-read fast5 files. 69 | 70 | ## [3.1.6] 2020-08-20 71 | ### Added 72 | - `compress_fast5` now has a `--sanitize` option to remove optional groups. 73 | 74 | ### Fixed 75 | - Correctly handle the case where h5pl can be imported but doesn't have the prepend() function available. 76 | 77 | ## [3.1.5] 2020-06-15 78 | ### Added 79 | - Added explicit requirements and checks to prevent running on Python 2. 80 | 81 | ## [3.1.4] 2020-06-12 82 | ### Fixed 83 | - Compression now works in `single_to_multi`. 
84 | 85 | ## [3.1.3] 2020-05-28 86 | ### Fixed 87 | - Compression argument in `fast5_subset` and `single_to_multi` failed if not set 88 | 89 | ## [3.1.2] 2020-05-04 90 | ### Fixed 91 | - Compression argument in `fast5_subset` and `single_to_multi` was parsed incorrectly 92 | 93 | ## [3.1.1] 2020-04-03 94 | ### Fixed 95 | - Argument list for `fast5_subset` and `single_to_multi` had a syntax error 96 | 97 | ## [3.1.0] 2020-04-02 98 | ### Added 99 | - Hardlinking of metadata to prevent duplication and reduce filesize 100 | - Ability to enable compression when using `fast5_subset` and `single_to_multi` 101 | ### Fixed 102 | - `fast5_subset` thread pool could sometimes close before all tasks were completed 103 | - `fast5_subset` will create output directory if it doesn't exist 104 | 105 | ## [3.0.2] 2020-03-17 106 | ### Fixed 107 | - Comparison of file_versions could throw an error 108 | 109 | ## [3.0.1] 2020-01-29 110 | ### Fixed 111 | - Basecall1DTools could not load data from a Fast5Read 112 | 113 | ## [3.0.0] 2020-01-20 114 | ### Removed 115 | - python2 compatibility 116 | ### Fixed 117 | - minor documentation errors: https://github.com/nanoporetech/ont_fast5_api/issues/28 118 | 119 | ## [2.1.0] 2019-12-16 120 | ### Added 121 | - Script to check the compression type of fast5 files in a folder 122 | - `compress_fast5` can now be used with `--in_place` 123 | ### Fixed 124 | - Reading arrays with padded strings now succeeds (on h5py>2.7) 125 | - Compatibility bugs with h5py==2.6 now raise appropriate errors 126 | - Fast5File now has attribute read_id to match documentation 127 | ### Changed 128 | - Now use standard settings for gzip compression (gzip=1, shuffle=None) 129 | - Inverted dependency between `Fast5File` and `Fast5Read` so `Fast5Read` is now the primary object 130 | 131 | ## [2.0.1] 2019-11-28 132 | ### Added 133 | - Option to `--ignore_symlinks` in fast5 conversion scripts 134 | - Explicit check to file_type for determining single/multi-read files 135 | ### Fixed 136 | - `fast5_subset` with single read fast5s was failing 137 | - unit test data now cleaned up properly 138 | 139 | ## [2.0.0] 2019-11-19 140 | ### Added 141 | - Compatibility for VBZ compressed reads 142 | - `compress_fast5` script for compressing/decompressing fast5 files 143 | - `get_reads()` helper method to more easily loop through reads in a fast5 file 144 | ### Changed 145 | - `Fast5File().get_raw_data()` updated interface to match `Fast5Read` and remove support for legacy files with multiple read numbers in a single `Fast5File` 146 | - Minimum dependency version requirements bumped. Set to Ubuntu16 `apt` python3-package defaults 147 | ### Removed 148 | - Legacy `Fast5Writer` object. `MultiReadFast5` or `EmptyFast5File` are preferred 149 | 150 | ## [1.4.9] 2019-11-01 151 | ### Added 152 | - Check for progressbar2 package and fail early if it's installed. 153 | 154 | ## [1.4.8] 2019-10-22 155 | ### Added 156 | - Support for h5py==2.10 string data type encoding changes 157 | ### Fixed 158 | - Corrected some "for for" typos in argparse help text.
159 | 160 | ## [1.4.7] 2019-07-29 161 | ### Fixed 162 | - Bug in read string and read_id concatenation resulted in broken output file 163 | 164 | ## [1.4.6] 2019-07-03 165 | ### Added 166 | - Updated fast5_subset script to extract also from single-read fast5 files 167 | ### Changed 168 | - Renamed fast5_subset source script from multi_fast5_subset.py to fast5_subset.py 169 | 170 | ## [1.4.5] 2019-07-01 171 | ### Fixed 172 | - Bug in number of processes being 0 when batch size is greater than number of reads (py2) 173 | 174 | ## [1.4.4] 2019-06-18 175 | ### Fixed 176 | - Bug in path name output from pathlib changes 177 | 178 | ## [1.4.3] 2019-06-12 179 | ### Fixed 180 | - Bug with apt-install and pathlib2 181 | 182 | ## [1.4.2] 2019-06-10 183 | ### Fixed 184 | - get_raw_data() now works with scale=True when start,end are None 185 | 186 | ## [1.4.1] 2019-06-06 187 | ### Added 188 | - Useful error message if no input files found 189 | ### Fixed 190 | - filename_mapping output gave incorrect filenames 191 | 192 | ## [1.4.0] 2019-05-29 193 | ### Added 194 | - Script for extracting reads by id from `multi_read` files 195 | 196 | ## [1.3.0] 2019-03-01 197 | ### Fixed 198 | - Bug in output to `filename_mapping.txt` 199 | 200 | ## [1.2.0] 2019-01-11 201 | ### Added 202 | - Multi-threading support for multi<->single conversion for improved performance 203 | 204 | ### Fixed 205 | - Removed incorrect license accidentally added to README 206 | 207 | ## [1.1.1] 2019-01-10 208 | ### Changed 209 | - Minor documentation updates 210 | - Follow symlinks when finding files recursively 211 | 212 | ## [1.1.0] 2019-01-07 213 | ### Added 214 | - Generic single- and multi- read interface via `get_fast5_file` 215 | 216 | ### Fixed 217 | - Incorrect time estimates for single-multi conversion 218 | - Fixed path creation if not exist 219 | 220 | ## [1.0.1] 2018-09-26 221 | ### Added 222 | - Support for multi-read fast5 files 223 | - Conversion tools for single-multi read files 224 | 225 | ### Fixed 226 | - Support for deprecated interface to Basecall2D following 0.4.0, support will end in v1.x.x 227 | 228 | 229 | ## [0.4.0] 2017-07-16 (internal only) 230 | ### Fixed 231 | - Basecall1d and Basecall2d raise consistent KeyError when fastq data missing 232 | 233 | ### Changed 234 | - Interface to Basecall1d and Basecall2d unified for add_sequence() and get_sequence() 235 | 236 | 237 | ## [0.3.3] 2017-06-23 238 | ### Added 239 | - Fast5 file now supports logging via 'Fast5File.add_log()' 240 | 241 | ### Fixed 242 | - Invalid component names no longer checked against LEGACY_COMPENENTS 243 | - Raise KeyError when fastq data missing from Basecall1d 244 | - median_before and start_mux populate correctly with sensible defaults 245 | 246 | 247 | ## [0.3.2] 2017-03-22 248 | ### Added 249 | Major release - changes not logged before this point 250 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # This file tells sdist which addional files to include in the distribution it builds. 2 | # That distribution is used as the base for building the .deb with stdeb, and certain files 3 | # (such as header files and .md files) are not included by default. 
4 | # See https://docs.python.org/2/distutils/sourcedist.html#manifest-template 5 | 6 | include README.md 7 | include LICENSE.md 8 | prune test 9 | prune build 10 | prune docs 11 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | VERSION ?= unknown 10 | 11 | # Internal variables. 12 | PAPEROPT_a4 = -D latex_paper_size=a4 13 | PAPEROPT_letter = -D latex_paper_size=letter 14 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) -D version=$(VERSION) -D release=$(VERSION) $(SPHINXOPTS) source 15 | # the i18n builder cannot share the environment and doctrees with the others 16 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 17 | 18 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 19 | 20 | help: 21 | @echo "Please use \`make ' where is one of" 22 | @echo " api to autogenerate API documentation" 23 | @echo " html to make standalone HTML files" 24 | @echo " dirhtml to make HTML files named index.html in directories" 25 | @echo " singlehtml to make a single large HTML file" 26 | @echo " pickle to make pickle files" 27 | @echo " json to make JSON files" 28 | @echo " htmlhelp to make HTML files and a HTML help project" 29 | @echo " qthelp to make HTML files and a qthelp project" 30 | @echo " devhelp to make HTML files and a Devhelp project" 31 | @echo " epub to make an epub" 32 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 33 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 34 | @echo " text to make text files" 35 | @echo " man to make manual pages" 36 | @echo " texinfo to make Texinfo files" 37 | @echo " info to make Texinfo files and run them through makeinfo" 38 | @echo " gettext to make PO message catalogs" 39 | @echo " changes to make an overview of all changed/added/deprecated items" 40 | @echo " linkcheck to check all external links for integrity" 41 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 42 | 43 | clean: 44 | $(eval NON_INDEX_FILES := $(filter-out source/index.rst, $(wildcard source/*.rst))) 45 | -rm -rf $(BUILDDIR) 46 | mkdir $(BUILDDIR) 47 | ifneq ($(NON_INDEX_FILES),) 48 | rm $(NON_INDEX_FILES) 49 | endif 50 | 51 | api: 52 | $(eval NON_INDEX_FILES := $(filter-out source/index.rst, $(wildcard source/*.rst))) 53 | ifneq ($(NON_INDEX_FILES),) 54 | rm $(NON_INDEX_FILES) 55 | endif 56 | sphinx-apidoc --no-toc -o source/ .. 57 | rm source/test.rst 58 | rm source/setup.rst 59 | @echo 60 | @echo "API gubbins generated in source directory for version $(VERSION)." 61 | 62 | html: 63 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 64 | @echo 65 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 66 | 67 | dirhtml: 68 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 69 | @echo 70 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 71 | 72 | singlehtml: 73 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 74 | @echo 75 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
76 | 77 | pickle: 78 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 79 | @echo 80 | @echo "Build finished; now you can process the pickle files." 81 | 82 | json: 83 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 84 | @echo 85 | @echo "Build finished; now you can process the JSON files." 86 | 87 | htmlhelp: 88 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 89 | @echo 90 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 91 | ".hhp project file in $(BUILDDIR)/htmlhelp." 92 | 93 | qthelp: 94 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 95 | @echo 96 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 97 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 98 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/fast5_api.qhcp" 99 | @echo "To view the help file:" 100 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/fast5_api.qhc" 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/ont_fast5_api" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/ont_fast5_api" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | text: 130 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 131 | @echo 132 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 133 | 134 | man: 135 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 136 | @echo 137 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 138 | 139 | texinfo: 140 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 141 | @echo 142 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 143 | @echo "Run \`make' in that directory to run these through makeinfo" \ 144 | "(use \`make info' here to do that automatically)." 145 | 146 | info: 147 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 148 | @echo "Running Texinfo files through makeinfo..." 149 | make -C $(BUILDDIR)/texinfo info 150 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 151 | 152 | gettext: 153 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 154 | @echo 155 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 156 | 157 | changes: 158 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 159 | @echo 160 | @echo "The overview file is in $(BUILDDIR)/changes." 161 | 162 | linkcheck: 163 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 164 | @echo 165 | @echo "Link check complete; look for any errors in the above output " \ 166 | "or in $(BUILDDIR)/linkcheck/output.txt." 
167 | 168 | doctest: 169 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 170 | @echo "Testing of doctests in the sources finished, look at the " \ 171 | "results in $(BUILDDIR)/doctest/output.txt." 172 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # fast5_api documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Nov 21 09:32:46 2014. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | sys.path.insert(0, os.path.abspath(os.path.join('..', '..'))) 16 | sys.path.insert(0, os.path.abspath(os.path.join('..', '..', 'ont_fast5_api'))) 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | #sys.path.insert(0, os.path.abspath('.')) 22 | 23 | # -- General configuration ----------------------------------------------------- 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be extensions 29 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 30 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] 31 | 32 | # Add any paths that contain templates here, relative to this directory. 33 | templates_path = ['_templates'] 34 | 35 | # The suffix of source filenames. 36 | source_suffix = '.rst' 37 | 38 | # The encoding of source files. 39 | #source_encoding = 'utf-8-sig' 40 | 41 | # The master toctree document. 42 | master_doc = 'index' 43 | 44 | # General information about the project. 45 | project = u'ont_fast5_api' 46 | copyright = u'2016, Oxford Nanopore Technologies' 47 | 48 | # The version info for the project you're documenting, acts as replacement for 49 | # |version| and |release|, also used in various other places throughout the 50 | # built documents. 51 | # 52 | # The short X.Y version. 53 | version = '1.6.2' 54 | # The full version, including alpha/beta/rc tags. 55 | release = '1.6.2' 56 | 57 | # The language for content autogenerated by Sphinx. Refer to documentation 58 | # for a list of supported languages. 59 | #language = None 60 | 61 | # There are two options for replacing |today|: either, you set today to some 62 | # non-false value, then it is used: 63 | #today = '' 64 | # Else, today_fmt is used as the format for a strftime call. 65 | #today_fmt = '%B %d, %Y' 66 | 67 | # List of patterns, relative to source directory, that match files and 68 | # directories to ignore when looking for source files. 69 | exclude_patterns = [] 70 | 71 | # The reST default role (used for this markup: `text`) to use for all documents. 72 | #default_role = None 73 | 74 | # If true, '()' will be appended to :func: etc. cross-reference text. 75 | #add_function_parentheses = True 76 | 77 | # If true, the current module name will be prepended to all description 78 | # unit titles (such as .. function::). 
79 | #add_module_names = True 80 | 81 | # If true, sectionauthor and moduleauthor directives will be shown in the 82 | # output. They are ignored by default. 83 | #show_authors = False 84 | 85 | # The name of the Pygments (syntax highlighting) style to use. 86 | pygments_style = 'sphinx' 87 | 88 | # A list of ignored prefixes for module index sorting. 89 | #modindex_common_prefix = [] 90 | 91 | 92 | # -- Options for HTML output --------------------------------------------------- 93 | 94 | # The theme to use for HTML and HTML Help pages. See the documentation for 95 | # a list of builtin themes. 96 | html_theme = 'sphinxdoc' 97 | 98 | # Theme options are theme-specific and customize the look and feel of a theme 99 | # further. For a list of options available for each theme, see the 100 | # documentation. 101 | #html_theme_options = {} 102 | 103 | # Add any paths that contain custom themes here, relative to this directory. 104 | #html_theme_path = [] 105 | 106 | # The name for this set of Sphinx documents. If None, it defaults to 107 | # " v documentation". 108 | #html_title = None 109 | 110 | # A shorter title for the navigation bar. Default is the same as html_title. 111 | #html_short_title = None 112 | 113 | # The name of an image file (relative to this directory) to place at the top 114 | # of the sidebar. 115 | #html_logo = None 116 | 117 | # The name of an image file (within the static path) to use as favicon of the 118 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 119 | # pixels large. 120 | #html_favicon = None 121 | 122 | # Add any paths that contain custom static files (such as style sheets) here, 123 | # relative to this directory. They are copied after the builtin static files, 124 | # so a file named "default.css" will overwrite the builtin "default.css". 125 | html_static_path = ['_static'] 126 | 127 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 128 | # using the given strftime format. 129 | #html_last_updated_fmt = '%b %d, %Y' 130 | 131 | # If true, SmartyPants will be used to convert quotes and dashes to 132 | # typographically correct entities. 133 | #html_use_smartypants = True 134 | 135 | # Custom sidebar templates, maps document names to template names. 136 | #html_sidebars = {} 137 | 138 | # Additional templates that should be rendered to pages, maps page names to 139 | # template names. 140 | #html_additional_pages = {} 141 | 142 | # If false, no module index is generated. 143 | #html_domain_indices = True 144 | 145 | # If false, no index is generated. 146 | #html_use_index = True 147 | 148 | # If true, the index is split into individual pages for each letter. 149 | #html_split_index = False 150 | 151 | # If true, links to the reST sources are added to the pages. 152 | #html_show_sourcelink = True 153 | 154 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 155 | #html_show_sphinx = True 156 | 157 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 158 | #html_show_copyright = True 159 | 160 | # If true, an OpenSearch description file will be output, and all pages will 161 | # contain a tag referring to it. The value of this option must be the 162 | # base URL from which the finished HTML is served. 163 | #html_use_opensearch = '' 164 | 165 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 166 | #html_file_suffix = None 167 | 168 | # Output file base name for HTML help builder. 
169 | htmlhelp_basename = 'fast5_api_doc' 170 | 171 | 172 | # -- Options for LaTeX output -------------------------------------------------- 173 | 174 | latex_elements = { 175 | # The paper size ('letterpaper' or 'a4paper'). 176 | #'papersize': 'letterpaper', 177 | 178 | # The font size ('10pt', '11pt' or '12pt'). 179 | #'pointsize': '10pt', 180 | 181 | # Additional stuff for the LaTeX preamble. 182 | #'preamble': '', 183 | } 184 | 185 | # Grouping the document tree into LaTeX files. List of tuples 186 | # (source start file, target name, title, author, documentclass [howto/manual]). 187 | latex_documents = [ 188 | ('index', 'fast5_api.tex', u'fast5_api Documentation', 189 | u'Kevin Dolan, Forrest Brennen', 'manual'), 190 | ] 191 | 192 | # The name of an image file (relative to this directory) to place at the top of 193 | # the title page. 194 | #latex_logo = None 195 | 196 | # For "manual" documents, if this is true, then toplevel headings are parts, 197 | # not chapters. 198 | #latex_use_parts = False 199 | 200 | # If true, show page references after internal links. 201 | #latex_show_pagerefs = False 202 | 203 | # If true, show URL addresses after external links. 204 | #latex_show_urls = False 205 | 206 | # Documents to append as an appendix to all manuals. 207 | #latex_appendices = [] 208 | 209 | # If false, no module index is generated. 210 | #latex_domain_indices = True 211 | 212 | 213 | # -- Options for manual page output -------------------------------------------- 214 | 215 | # One entry per manual page. List of tuples 216 | # (source start file, name, description, authors, manual section). 217 | man_pages = [ 218 | ('index', 'ont_fast5_api', u'ont_fast5_api Documentation', 219 | [u'Kevin Dolan, Forrest Brennen'], 1) 220 | ] 221 | 222 | # If true, show URL addresses after external links. 223 | #man_show_urls = False 224 | 225 | 226 | # -- Options for Texinfo output ------------------------------------------------ 227 | 228 | # Grouping the document tree into Texinfo files. List of tuples 229 | # (source start file, target name, title, author, 230 | # dir menu entry, description, category) 231 | texinfo_documents = [ 232 | ('index', 'ont_fast5_api', u'ont_fast5_api Documentation', 233 | u'Kevin Dolan, Forrest Brennen', 'fast5_api', 'One line description of project.', 234 | 'Miscellaneous'), 235 | ] 236 | 237 | # Documents to append as an appendix to all manuals. 238 | #texinfo_appendices = [] 239 | 240 | # If false, no module index is generated. 241 | #texinfo_domain_indices = True 242 | 243 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 244 | #texinfo_show_urls = 'footnote' 245 | 246 | # Included to display docstrings from class __init__() functions. 247 | autoclass_content = "both" 248 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. ont_fast5_api documentation master file, created by 2 | sphinx-quickstart on Fri Nov 21 09:32:46 2014. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. include:: ../../README.rst 7 | 8 | Contents: 9 | 10 | .. 
toctree:: 11 | :maxdepth: 4 12 | :glob: 13 | 14 | ont_fast5_api 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` -------------------------------------------------------------------------------- /img/ONT_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/img/ONT_logo.png -------------------------------------------------------------------------------- /ont_fast5_api/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '4.1.3' 2 | __version_info__ = tuple([int(num) for num in __version__.split('.')]) 3 | CURRENT_FAST5_VERSION = 2.0 4 | 5 | import sys 6 | if sys.version_info < (3,): 7 | raise ImportError( 8 | """ont-fast5-api requires Python 3.7 9 | 10 | Somehow you have ended up running this on Python 2, which reached its end of 11 | life in 2019. Apologies! To avoid this issue, either: 12 | 13 | - Upgrade to Python 3, or 14 | 15 | - Download an older ont-fast5-api version: 16 | 17 | $ pip install 'ont-fast5-api<3.0' 18 | 19 | Note that you will be missing features and bug fixes by running older versions 20 | of ont-fast5-api. 21 | 22 | """) 23 | 24 | # Set up a default NullHandler in case we don't end up using another one 25 | # Taken from http://docs.python-guide.org/en/latest/writing/logging/ 26 | import logging 27 | logging.getLogger(__name__).addHandler(logging.NullHandler()) 28 | 29 | from ont_fast5_api.compression_settings import register_plugin 30 | register_plugin() 31 | -------------------------------------------------------------------------------- /ont_fast5_api/analysis_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/ont_fast5_api/analysis_tools/__init__.py -------------------------------------------------------------------------------- /ont_fast5_api/analysis_tools/alignment.py: -------------------------------------------------------------------------------- 1 | """ Helper class for working with alignment type analyses. 2 | """ 3 | import numpy as np 4 | 5 | from ont_fast5_api.analysis_tools.base_tool import BaseTool 6 | from ont_fast5_api.fast5_file import Fast5File 7 | from ont_fast5_api.analysis_tools.segmentation import SegmentationTools 8 | from ont_fast5_api.fast5_read import Fast5Read 9 | 10 | 11 | class AlignmentTools(BaseTool): 12 | """ Provides helper methods specific to alignment analyses. 13 | """ 14 | 15 | def __init__(self, source, mode='r', group_name=None, meta=None, config=None): 16 | """ Create a new alignment tools object. 17 | 18 | :param source: Either an open Fast5File object, or a filename 19 | of a fast5 file. 20 | :param mode: The open mode (r or r+). Only if a filename is used 21 | for the source argument. 22 | :param group_name: The specific alignment analysis instance 23 | you are interested in. 24 | :param meta: Metadata for a new alignment analysis. 25 | :param config: Configuration data for a new alignment analysis. 26 | 27 | To create a new alignment analysis, provide a group name that 28 | does not already exist, and an optional dictionary with the metadata. 29 | The following fields are recommended, as a minimum: 30 | 31 | * name - The name of the basecall software used. 
32 | * time_stamp - The time at which the analysis was performed. 33 | 34 | If the group name already exists, the "meta" parameter is ignored. If 35 | the specified group has a "component" attribute, and its value is not 36 | "alignment", an exception will be thrown. 37 | """ 38 | if isinstance(source, Fast5Read): 39 | self.handle = source 40 | self.close_handle_when_done = False 41 | elif isinstance(source, str): 42 | self.handle = Fast5File(source, mode) 43 | self.close_handle_when_done = True 44 | else: 45 | raise Exception('Unrecognized type for argument "source".') 46 | if group_name is None: 47 | group_name = self.handle.get_latest_analysis('Alignment') 48 | if group_name is None: 49 | raise Exception('No Alignment analysis group found in file.') 50 | self.group_name = group_name 51 | attrs = self.handle.get_analysis_attributes(group_name) 52 | if attrs is None: 53 | if meta is None: 54 | meta = {} 55 | self.handle.add_analysis('alignment', group_name, meta, config) 56 | attrs = self.handle.get_analysis_attributes(group_name) 57 | if ('component' in attrs 58 | and attrs['component'] not in ['alignment', 59 | 'calibration_strand']): 60 | self.close() 61 | raise Exception('Analysis does not appear to be an alignment component.') 62 | 63 | def get_results(self): 64 | """ Get details about the alignments that have been performed. 65 | 66 | :return: A dict of dicts. 67 | 68 | The keys of the top level are 'template', 'complement' and '2d'. 69 | Each of these dicts contains the following fields: 70 | 71 | * status: Can be 'no data', 'no match found', or 'match found'. 72 | * direction: Can be 'forward', 'reverse'. 73 | * ref_name: Name of reference. 74 | * ref_span: Section of reference aligned to, as a tuple (start, end). 75 | * seq_span: Section of the called sequence that aligned, as a tuple (start, end). 76 | * seq_len: Total length of the called sequence. 77 | * num_aligned: Number of bases that aligned to bases in the reference. 78 | * num_correct: Number of aligned bases that match the reference. 79 | * num_deletions: Number of bases in the aligned section of the 80 | reference that are not aligned to bases in the called sequence. 81 | * num_insertions: Number of bases in the aligned section of the called 82 | sequence that are not aligned to bases in the reference. 83 | * identity: The fraction of aligned bases that are correct (num_correct / 84 | num_aligned). 85 | * accuracy: The overall basecall accuracy, according to the alignment. 86 | (num_correct / (num_aligned + num_deletions + num_insertions)). 87 | 88 | Note that if the status field is not 'match found', then all the other 89 | fields will be absent. 90 | """ 91 | summary = self.handle.get_summary_data(self.group_name) 92 | results = {'template': {'status': 'no data'}, 93 | 'complement': {'status': 'no data'}, 94 | '2d': {'status': 'no data'}} 95 | if 'genome_mapping_template' in summary: 96 | results['template'] = self._get_results(summary['genome_mapping_template']) 97 | if 'genome_mapping_complement' in summary: 98 | results['complement'] = self._get_results(summary['genome_mapping_complement']) 99 | if 'genome_mapping_2d' in summary: 100 | results['2d'] = self._get_results(summary['genome_mapping_2d']) 101 | return results 102 | 103 | def get_alignment_data(self, section): 104 | """ Get the alignment SAM and Fasta, if present. 105 | 106 | :param section: Can be 'template', 'complement', or '2d'. 107 | :return: A tuple containing the SAM and the section of the reference 108 | aligned to (both as strings). 
Returns None if no alignment is 109 | present for that section. 110 | """ 111 | subgroup = '{}/Aligned_{}'.format(self.group_name, section) 112 | sam = self.handle.get_analysis_dataset(subgroup, 'SAM') 113 | fasta = self.handle.get_analysis_dataset(subgroup, 'Fasta') 114 | if sam is None or fasta is None: 115 | return None 116 | sequence = fasta.split('\n')[1] 117 | return sam, sequence 118 | 119 | def add_alignment_data(self, section, sam, sequence): 120 | """ Add the SAM and Fasta alignment data for a section. 121 | 122 | :param section: Can be 'template', 'complement', or '2d'. 123 | :param sam: A string containing the SAM contents. 124 | :param sequence: A string containing the section of the 125 | reference the basecall aligned to. 126 | """ 127 | subgroup = 'Aligned_{}'.format(section) 128 | if not subgroup in self.handle.handle['Analyses/{}'.format(self.group_name)]: 129 | self.handle.add_analysis_subgroup(self.group_name, subgroup) 130 | sam_arr = np.array(sam, dtype=str) 131 | self.handle.add_analysis_dataset('{}/{}'.format(self.group_name, subgroup), 'SAM', sam_arr) 132 | fasta_arr = np.array('>{}\n{}\n'.format(section, sequence), dtype=str) 133 | self.handle.add_analysis_dataset('{}/{}'.format(self.group_name, subgroup), 'Fasta', fasta_arr) 134 | 135 | def calculate_speed(self, section, alignment_results=None): 136 | """ Calculate speed using alignment information. 137 | 138 | :param section: The section (template or complement) we're calculating 139 | speed for. 140 | :param alignment_results: Optional dictionary of the alignment summary, 141 | so that speed can be calculated without having to write the summary 142 | out to the fast5 file first. 143 | :return: Speed in bases per second or zero if the speed could not be 144 | calculated. 145 | 146 | The only reliable way we have of finding out how many bases have gone through the pore is by 147 | looking at how much of the reference the sequence aligned to. This takes that information and 148 | uses it to calculate speed in reference-bases-per-second. 
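As a worked illustration of this calculation (all numbers hypothetical): if the alignment spans ref_len = 4500 reference bases, the aligned part of the call is seq_len = 4800 of total_len = 5000 events, the segmentation duration is 30000 samples and the sampling rate is 4000 samples/s, then the duration is normalized to 30000 * 4800 / 5000 = 28800 samples and the reported speed is 4000 * 4500 / 28800 = 625 reference bases per second.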
149 | """ 150 | speed = 0.0 151 | if alignment_results: 152 | results = self._get_results(alignment_results) 153 | else: 154 | results = self.get_results()[section] 155 | if results['status'] != 'match found': 156 | return 0.0 157 | ref_span = results['ref_span'] 158 | ref_len = ref_span[1] - ref_span[0] 159 | seq_span = results['seq_span'] 160 | seq_len = seq_span[1] - seq_span[0] 161 | total_len = results['seq_len'] 162 | 163 | sample_rate = self.handle.get_channel_info()['sampling_rate'] 164 | 165 | # We need the duration from the segmentation results 166 | chain = self.handle.get_chain(self.group_name) 167 | if chain is not None: 168 | segmentation_group = dict(chain).get('segmentation') 169 | else: 170 | segmentation_group = None 171 | duration = 0 172 | if segmentation_group is not None: 173 | with SegmentationTools(self.handle, group_name=segmentation_group) as seg: 174 | summary = seg.get_results() 175 | if summary is not None: 176 | duration = summary['duration_{}'.format(section)] 177 | if duration == 0: 178 | return 0.0 179 | 180 | normalized_duration = duration * seq_len / float(total_len) 181 | speed = sample_rate * ref_len / normalized_duration 182 | return speed 183 | 184 | ########################## 185 | # 186 | # Private methods below 187 | # 188 | ########################## 189 | 190 | def _get_results(self, summary): 191 | results = {'status': 'no data'} 192 | ref_name = summary['genome'] 193 | if ref_name == 'no_match': 194 | results['status'] = 'no match found' 195 | return results 196 | results['status'] = 'match found' 197 | results['direction'] = 'forward' 198 | if ref_name.endswith('_rc'): 199 | ref_name = ref_name[:-3] 200 | results['direction'] = 'reverse' 201 | results['ref_name'] = ref_name 202 | results['ref_span'] = (summary['genome_start'], summary['genome_end']) 203 | results['seq_span'] = (summary['strand_start'], summary['strand_end']) 204 | results['seq_len'] = summary['num_events'] 205 | results.update({key: summary[key] for key in ['num_aligned', 'num_correct', 'num_insertions', 206 | 'num_deletions', 'identity', 'accuracy']}) 207 | return results 208 | -------------------------------------------------------------------------------- /ont_fast5_api/analysis_tools/base_tool.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from abc import abstractmethod 3 | 4 | from ont_fast5_api.fast5_file import Fast5File, Fast5FileTypeError 5 | from ont_fast5_api.fast5_read import Fast5Read 6 | 7 | 8 | class BaseTool(object): 9 | @property 10 | def group_id(self): 11 | raise NotImplementedError("BaseTool does not have a group_id") 12 | 13 | @property 14 | def analysis_id(self): 15 | raise NotImplementedError("BaseTool does not have a analysis_id") 16 | 17 | def __init__(self, source, mode='r', group_name=None, meta=None, config=None): 18 | """ Create a new analysis_tools object. 19 | 20 | :param source: Either an open Fast5File object, or a filename 21 | of a fast5 file. 22 | :param mode: The open mode (r or r+). Only if a filename is used 23 | for the source argument. 24 | :param group_name: The specific analysis instance you are interested in. 25 | :param meta: Metadata for a new analysis. 26 | :param config: Configuration data for a new analysis. 27 | 28 | To create a new analysis group, provide a group name that 29 | does not already exist, and an optional dictionary with the metadata. 30 | The following fields are recommended, as a minimum: 31 | 32 | * name - The name of the software used. 
33 | * time_stamp - The time at which the analysis was performed. 34 | 35 | If the group name already exists, the "meta" parameter is ignored. If 36 | the specified group has a "component" attribute, and its value does not 37 | match self.analysis_id, an exception will be thrown. 38 | """ 39 | if isinstance(source, Fast5Read): 40 | self.filename = source.filename # Useful for debugging purposes 41 | self.handle = source 42 | self.close_handle_when_done = False 43 | elif isinstance(source, str): 44 | self.filename = source # Useful for debugging purposes 45 | try: 46 | self.handle = Fast5File(source, mode) 47 | except Fast5FileTypeError : 48 | raise NotImplementedError("AnalysisTools do not support accessing MultiReadFast5 files by filepath") 49 | self.close_handle_when_done = True 50 | else: 51 | raise KeyError('Unrecognized type for argument "source": {}'.format(source)) 52 | if group_name is None: 53 | group_name = self.handle.get_latest_analysis(self.group_id) 54 | if group_name is None: 55 | raise KeyError('No group: {} found in file: {}'.format(group_name, self.filename)) 56 | self.group_name = group_name 57 | attrs = self.handle.get_analysis_attributes(group_name) 58 | 59 | if attrs is None: 60 | self.handle.add_analysis(self.analysis_id, group_name, meta, config) 61 | attrs = self.handle.get_analysis_attributes(group_name) 62 | if 'component' in attrs and attrs['component'] != self.analysis_id: 63 | raise ValueError('Component {} is not {}'.format(attrs.get('component'), self.analysis_id)) 64 | 65 | def __enter__(self): 66 | return self 67 | 68 | def __exit__(self, exception_type, exception_value, traceback): 69 | self.close() 70 | return False 71 | 72 | def close(self): 73 | """ Closes the object. 74 | """ 75 | if self.handle and self.close_handle_when_done: 76 | self.handle.close() 77 | -------------------------------------------------------------------------------- /ont_fast5_api/analysis_tools/basecall_1d.py: -------------------------------------------------------------------------------- 1 | """ Helper class for working with 1D basecall type analyses. 2 | """ 3 | import numpy as np 4 | 5 | from ont_fast5_api.analysis_tools.base_tool import BaseTool 6 | 7 | 8 | class Basecall1DTools(BaseTool): 9 | """ Provides helper methods specific to 1D basecall analyses. 10 | """ 11 | group_id = 'Basecall_1D' 12 | analysis_id = 'basecall_1d' 13 | 14 | 15 | def get_event_data(self, section): 16 | """ Return either the template or complement event data, if present. 17 | 18 | :param section: Either template or complement. 19 | :return: Event data table. 20 | """ 21 | event_group = '{}/BaseCalled_{}'.format(self.group_name, section) 22 | data = self.handle.get_analysis_dataset(event_group, 'Events') 23 | return data 24 | 25 | def add_event_data(self, section, data): 26 | """ Add template or complement basecalled event data. 27 | 28 | :param section: Either template or complement. 29 | :param data: Event data table to be written. 30 | """ 31 | event_group = 'BaseCalled_{}'.format(section) 32 | if not event_group in self.handle.handle['Analyses/{}'.format(self.group_name)]: 33 | self.handle.add_analysis_subgroup(self.group_name, event_group) 34 | self.handle.add_analysis_dataset('{}/{}'.format(self.group_name, event_group), 'Events', data) 35 | 36 | def get_called_sequence(self, section, fastq=False): 37 | """ Return either the called sequence data, if present. 38 | 39 | :param section: ['template', 'complement' or '2D'] 40 | :param fastq: If True, return a single, multiline fastq string. 
If 41 | False, return a tuple of (name, sequence, qstring). 42 | :return: Either the fastq string or the (name, sequence, qstring) tuple. 43 | """ 44 | 45 | event_group = '{}/BaseCalled_{}'.format(self.group_name, section) 46 | data = self.handle.get_analysis_dataset(event_group, 'Fastq') 47 | if data is None: 48 | raise KeyError("No fastq data in: {} {}".format(event_group, self.filename)) 49 | if fastq: 50 | return data 51 | name, sequence, _, qstring = data.strip().split('\n') 52 | name = name[1:] 53 | return name, sequence, qstring 54 | 55 | def add_called_sequence(self, section, name, sequence, qstring): 56 | """ Add basecalled sequence data 57 | 58 | :param section: ['template', 'complement' or '2D'] 59 | :param name: The record ID to use for the fastq. 60 | :param sequence: The called sequence. 61 | :param qstring: The quality string. 62 | """ 63 | event_group = 'BaseCalled_{}'.format(section) 64 | if not event_group in self.handle.handle['Analyses/{}'.format(self.group_name)]: 65 | self.handle.add_analysis_subgroup(self.group_name, event_group) 66 | fastq_text = '@{}\n{}\n+\n{}\n'.format(name, sequence, qstring) 67 | fastq_arr = np.array(fastq_text, dtype=str) 68 | self.handle.add_analysis_dataset('{}/{}'.format(self.group_name, event_group), 'Fastq', fastq_arr) 69 | -------------------------------------------------------------------------------- /ont_fast5_api/analysis_tools/basecall_2d.py: -------------------------------------------------------------------------------- 1 | """ Helper class for working with 2D basecall type analyses. 2 | """ 3 | import warnings 4 | from ont_fast5_api.analysis_tools.basecall_1d import Basecall1DTools 5 | 6 | 7 | class Basecall2DTools(Basecall1DTools): 8 | """ Provides helper methods specific to 2D basecall analyses. 9 | """ 10 | 11 | group_id = 'Basecall_2D' 12 | analysis_id = 'basecall_2d' 13 | 14 | def get_prior_alignment(self): 15 | """ Return the prior alignment that was used for 2D basecalling. 16 | 17 | :return: Alignment data table. 18 | """ 19 | data_group = '{}/HairpinAlign'.format(self.group_name) 20 | data = self.handle.get_analysis_dataset(data_group, 'Alignment') 21 | return data 22 | 23 | def get_2d_call_alignment(self): 24 | """ Return the alignment and model_states from the 2D basecall. 25 | 26 | :return: Alignment data table. 27 | """ 28 | data_group = '{}/BaseCalled_2D'.format(self.group_name) 29 | data = self.handle.get_analysis_dataset(data_group, 'Alignment') 30 | return data 31 | 32 | def add_prior_alignment(self, data): 33 | """ Add template or complement basecalled event data. 34 | 35 | :param data: Alignment table to be written. 36 | """ 37 | path = 'Analyses/{}'.format(self.group_name) 38 | if 'HairpinAlign' not in self.handle.handle[path]: 39 | self.handle.add_analysis_subgroup(self.group_name, 'HairpinAlign') 40 | 41 | path = '{}/HairpinAlign'.format(self.group_name) 42 | self.handle.add_analysis_dataset(path, 'Alignment', data) 43 | 44 | def add_2d_call_alignment(self, data): 45 | """ Add the alignment and model_state data table.. 46 | 47 | :param data: Alignment and model_state table to be written. 
48 | """ 49 | path = 'Analyses/{}'.format(self.group_name) 50 | if 'BaseCalled_2D' not in self.handle.handle[path]: 51 | self.handle.add_analysis_subgroup(self.group_name, 'BaseCalled_2D') 52 | 53 | path = '{}/BaseCalled_2D'.format(self.group_name) 54 | self.handle.add_analysis_dataset(path, 'Alignment', data) 55 | 56 | def get_called_sequence(self, section=None, fastq=False): 57 | """ Return either the called sequence data, if present. 58 | :param section: ['template', 'complement' or '2D'] 59 | :param fastq: If True, return a single, multiline fastq string. If 60 | False, return a tuple of (name, sequence, qstring). 61 | :return: Either the fastq string or the (name, sequence, qstring) tuple. 62 | """ 63 | if section != "2D": 64 | warnings.warn("Basecall2DTools.get_called_sequence() should specify section='2D'", DeprecationWarning) 65 | # Backwards compatibility to 0.3.3, if no "2D" section, bump args by 1 and pass to super 66 | if section is None: 67 | # We assume that a named arg or no-arg was given 68 | return super(Basecall2DTools, self).get_called_sequence("2D", fastq) 69 | # We assume that a single unnamed arg was given for fastq 70 | return super(Basecall2DTools, self).get_called_sequence("2D", section) 71 | return super(Basecall2DTools, self).get_called_sequence(section, fastq) 72 | -------------------------------------------------------------------------------- /ont_fast5_api/analysis_tools/event_detection.py: -------------------------------------------------------------------------------- 1 | """ Helper class for working with event detection type analyses. 2 | """ 3 | import numpy as np 4 | 5 | from ont_fast5_api.analysis_tools.base_tool import BaseTool 6 | 7 | 8 | class EventDetectionTools(BaseTool): 9 | """ Provides helper methods specific to event detection analyses. 10 | """ 11 | 12 | group_id = 'EventDetection' 13 | analysis_id = 'event_detection' 14 | 15 | def set_event_data(self, data, read_attrs): 16 | """ Set event data with the specified attributes. 17 | 18 | :param data: Event data table. 19 | :param read_attrs: Attributes to put on the read group. This must include 20 | the read_number, which must refer to a read present in the object. The 21 | attributes should not include the standard read attributes: 22 | 23 | * read_id 24 | * start_time 25 | * duration 26 | * start_mux 27 | 28 | Those will be pulled from the read information already present in the 29 | object for the specified read.
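A minimal usage sketch (the file name, group name, read number and the ``events`` table below are hypothetical; the file must already contain that read and be opened writable)::

    with EventDetectionTools('read.fast5', mode='r+', group_name='EventDetection_000') as evdet:
        evdet.set_event_data(events, {'read_number': 42})

Only read_number is required in read_attrs; the standard attributes listed above are filled in automatically from the read information.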
30 | """ 31 | if self.handle.mode == 'r': 32 | raise Exception('File is not open for writing.') 33 | read_number = read_attrs['read_number'] 34 | read_group = '{}/Reads/Read_{}'.format(self.group_name, read_number) 35 | read_info = self.handle.status.read_info 36 | read_number_map = self.handle.status.read_number_map 37 | index = read_number_map.get(read_number) 38 | if index is None: 39 | raise Exception('Cannot add event detection data for a read that does not exist.') 40 | info = read_info[index] 41 | read_attrs.update({'read_id': info.read_id, 42 | 'start_time': info.start_time, 43 | 'duration': info.duration, 44 | 'start_mux': info.start_mux, 45 | 'median_before': info.median_before}) 46 | attrs = self.handle.get_analysis_attributes(read_group) 47 | if attrs is None: 48 | self.handle.add_analysis_subgroup(self.group_name, 'Reads/Read_{}'.format(read_number), 49 | attrs=read_attrs) 50 | self.handle.add_analysis_dataset(read_group, 'Events', data) 51 | else: 52 | raise Exception('Event detection data already exists for this analysis and read.') 53 | 54 | def get_event_data(self, read_number=None, time_in_seconds=False): 55 | """ Get event data for the specified (or only) read. 56 | 57 | :param read_number: The read number to grab event data for. If this 58 | is None, and there is only one read, it will grab event data for 59 | that read. 60 | :param time_in_seconds: If True, this will convert (if necessary) the 61 | start and length fields from samples to seconds. If they are already 62 | in seconds, this option has no effect. 63 | :return: A tuple containing the event data, and the read attributes. 64 | """ 65 | read_info = self.handle.status.read_info 66 | if read_number is None: 67 | if len(read_info) != 1: 68 | raise Exception('Must specify a read number if there is not exactly 1 read.') 69 | read_number = read_info[0].read_number 70 | else: 71 | read_numbers = [info.read_number for info in read_info] 72 | if read_number not in read_numbers: 73 | raise Exception('Specified read does not exist.') 74 | group = '{}/Reads/Read_{}'.format(self.group_name, read_number) 75 | attrs = self.handle.get_analysis_attributes(group) 76 | dataset = self.handle.get_analysis_dataset(group, 'Events', skip_decoding=True) 77 | if dataset is None: 78 | raise Exception('Read number {} has no event data.'.format(read_number)) 79 | if time_in_seconds and dataset['start'].dtype.kind in ['i', 'u']: 80 | channel_info = self.handle.get_channel_info() 81 | sample_size = 1.0 / channel_info['sampling_rate'] 82 | descr = [(x[0], 'float64') if x[0] in ('start', 'length') else x 83 | for x in dataset.dtype.descr] 84 | data = dataset.astype(np.dtype(descr))[()] 85 | data['start'] *= sample_size 86 | data['length'] *= sample_size 87 | else: 88 | data = dataset[()] 89 | return data, attrs 90 | 91 | def has_event_data(self, read_number=None): 92 | """ Find out if the specified (or only) read has event data. 93 | 94 | :param read_number: The read number to check for event data. If this 95 | is ``None``, and there is only one read, it will check that read. 96 | :returns: True if event data exists for the read number. 
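A short sketch of the intended read pattern (read number hypothetical, ``evdet`` an open EventDetectionTools instance as above)::

    if evdet.has_event_data(read_number=42):
        events, attrs = evdet.get_event_data(read_number=42, time_in_seconds=True)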
97 | """ 98 | read_info = self.handle.status.read_info 99 | if read_number is None: 100 | if len(read_info) != 1: 101 | raise Exception('Must specify a read number if there is not exactly 1 read.') 102 | read_number = read_info[0].read_number 103 | else: 104 | read_numbers = [info.read_number for info in read_info] 105 | if read_number not in read_numbers: 106 | raise Exception('Specified read does not exist.') 107 | group = '{}/Reads/Read_{}'.format(self.group_name, read_number) 108 | dataset = self.handle.get_analysis_dataset(group, 'Events', skip_decoding=True) 109 | return dataset is not None 110 | 111 | ########################## 112 | # 113 | # Private methods below 114 | # 115 | ########################## 116 | 117 | def _new_analysis(self, meta, config): 118 | if self.handle.mode == 'r': 119 | raise Exception('Cannot create new event detection group. File is not open for writing.') 120 | self.handle.add_analysis('event_detection', self.group_name, meta, config) 121 | self.handle.add_analysis_subgroup(self.group_name, 'Reads') 122 | -------------------------------------------------------------------------------- /ont_fast5_api/analysis_tools/segmentation.py: -------------------------------------------------------------------------------- 1 | """ Helper class for working with segmentation type analyses. 2 | """ 3 | import numpy as np 4 | 5 | from ont_fast5_api.analysis_tools.base_tool import BaseTool 6 | from ont_fast5_api.analysis_tools.event_detection import EventDetectionTools 7 | 8 | 9 | class SegmentationTools(BaseTool): 10 | """ Provides helper methods specific to segmentation analyses. 11 | """ 12 | group_id = 'Segmentation' 13 | analysis_id = 'segmentation' 14 | 15 | def get_results(self): 16 | """ Returns the segmentation summary data. 17 | 18 | This data is normalized, to eliminate differences in what is stored 19 | for different types of segmentation analyses. 20 | 21 | The following fields are output: 22 | 23 | * has_template - True if the segmentation found template data. 24 | * has_complement - True if the segmentation found complement data. 25 | * first_sample_template - The first sample of the template data in 26 | the raw data. Only present if has_template is True. 27 | * duration_template - The duration (in samples) of the template 28 | data. Only present if has_template is True. 29 | * first_sample_complement - The first sample of the complement data 30 | in the raw data. Only present if has_complement is True. 31 | * duration_complement - The duration (in samples) of the complement 32 | data. Only present if has_complement is True. 
33 | 34 | """ 35 | summary = self._get_summary_data() 36 | if summary is None: 37 | results = {'has_template': False, 38 | 'has_complement': False} 39 | else: 40 | results = {} 41 | if 'has_template' in summary: 42 | results['has_template'] = bool(summary['has_template']) 43 | else: 44 | results['has_template'] = True if summary['num_temp'] > 0 else False 45 | if 'has_complement' in summary: 46 | results['has_complement'] = bool(summary['has_complement']) 47 | else: 48 | results['has_complement'] = True if summary['num_comp'] > 0 else False 49 | need_raw_info = False 50 | if results['has_template']: 51 | if 'start_index_temp' in summary: 52 | summary['start_event_template'] = summary['start_index_temp'] 53 | summary['end_event_template'] = summary['end_index_temp'] 54 | if 'first_sample_template' not in summary: 55 | need_raw_info = True 56 | if results['has_complement']: 57 | if 'start_index_comp' in summary: 58 | summary['start_event_complement'] = summary['start_index_comp'] 59 | summary['end_event_complement'] = summary['end_index_comp'] 60 | if 'first_sample_complement' not in summary: 61 | need_raw_info = True 62 | if need_raw_info: 63 | self._get_raw_info(summary) 64 | if results['has_template']: 65 | results['first_sample_template'] = summary['first_sample_template'] 66 | results['duration_template'] = summary['duration_template'] 67 | if 'start_event_template' in summary: 68 | results['start_event_template'] = summary['start_event_template'] 69 | results['end_event_template'] = summary['end_event_template'] 70 | if results['has_complement']: 71 | results['first_sample_complement'] = summary['first_sample_complement'] 72 | results['duration_complement'] = summary['duration_complement'] 73 | if 'start_event_complement' in summary: 74 | results['start_event_complement'] = summary['start_event_complement'] 75 | results['end_event_complement'] = summary['end_event_complement'] 76 | return results 77 | 78 | def get_event_data(self, section, time_in_seconds=False): 79 | """ Get the template or complement event data. 80 | 81 | :param section: Either template, complement, or both. 82 | :param time_in_seconds: Return the start and length fields 83 | in seconds, rather than samples. 84 | :return: The event dataset for the section. If section=both 85 | then it returns a tuple with both sections. Returns None 86 | if the section does not exist. 87 | """ 88 | if section not in ['template', 'complement', 'both']: 89 | raise Exception('Unrecognized section: {} Expected: "template", "complement" or "both"'.format(section)) 90 | results = self.get_results() 91 | if results is None: 92 | return None, None if section is 'both' else None 93 | if section == 'both': 94 | sections = ['template', 'complement'] 95 | else: 96 | sections = [section] 97 | evdet_group, _ = self._find_event_data() 98 | with EventDetectionTools(self.handle, group_name=evdet_group) as evdet: 99 | event_data, _ = evdet.get_event_data(time_in_seconds=time_in_seconds) 100 | datasets = [None, None] 101 | for n, this_section in enumerate(sections): 102 | if not results['has_{}'.format(this_section)]: 103 | continue 104 | ev1 = results['start_event_{}'.format(this_section)] 105 | ev2 = results['end_event_{}'.format(this_section)] 106 | datasets[n] = event_data[ev1:ev2] 107 | if section == 'both': 108 | return tuple(datasets) 109 | return datasets[0] 110 | 111 | def get_raw_data(self, section, scale=False): 112 | """ Get the template or complement raw data. 113 | 114 | :param section: Either template, complement, or both. 
115 | :param scale: Scale the raw data to pA. 116 | :return: The raw data for the section. If section=both 117 | then it returns a tuple with both sections. Returns None 118 | if the section does not exist. 119 | """ 120 | results = self.get_results() 121 | datasets = [None, None] 122 | if section == 'both': 123 | sections = ['template', 'complement'] 124 | else: 125 | sections = [section] 126 | for n, this_section in enumerate(sections): 127 | if not results['has_{}'.format(this_section)]: 128 | continue 129 | start = results['first_sample_{}'.format(this_section)] 130 | dur = results['duration_{}'.format(this_section)] 131 | datasets[n] = self.handle.get_raw_data(start=start, end=start+dur, scale=scale) 132 | if section == 'both': 133 | return tuple(datasets) 134 | return datasets[0] 135 | 136 | 137 | ########################## 138 | # 139 | # Private methods below 140 | # 141 | ########################## 142 | 143 | def _get_summary_data(self): 144 | summary = self.handle.get_summary_data(self.group_name) 145 | if summary is None: 146 | return None 147 | if 'segmentation' in summary: 148 | results = summary['segmentation'] 149 | elif 'split_hairpin' in summary: 150 | results = summary['split_hairpin'] 151 | else: 152 | results = None 153 | return results 154 | 155 | def _find_event_data(self): 156 | attrs = self.handle.get_analysis_attributes(self.group_name) 157 | evdet_group = attrs.get('event_detection') 158 | if evdet_group is None: 159 | evdet_group = self.handle.get_latest_analysis('EventDetection') 160 | else: 161 | evdet_group = evdet_group[9:] 162 | if evdet_group is None: 163 | return None 164 | # We directly use the Fast5Read interface here, rather than the 165 | # EventDetectionTools one, because we don't want to load the entire 166 | # event table into memory. 167 | read_info = self.handle.status.read_info[0] # We assume only one read. 168 | read_number = read_info.read_number 169 | event_table_group = '{}/Reads/Read_{}'.format(evdet_group, read_number) 170 | dataset = self.handle.get_analysis_dataset(event_table_group, 'Events', skip_decoding=True) 171 | return evdet_group, dataset 172 | 173 | def _get_raw_info(self, summary): 174 | _, dataset = self._find_event_data() 175 | read_info = self.handle.status.read_info[0] # We assume only one read. 
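# A minimal usage sketch for the SegmentationTools helpers above (not taken from
# the library docs): the file path is hypothetical, and passing group_name to the
# constructor is assumed to follow the same pattern as the other analysis tools
# in this package.
from ont_fast5_api.fast5_file import Fast5File
from ont_fast5_api.analysis_tools.segmentation import SegmentationTools

with Fast5File("read0.fast5", mode="r") as f5:
    with SegmentationTools(f5, group_name="Segmentation_000") as seg:
        results = seg.get_results()
        if results["has_template"]:
            # Slice just the template portion out of the raw signal, scaled to pA.
            template_raw = seg.get_raw_data("template", scale=True)
            print(results["first_sample_template"], results["duration_template"])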
176 | if dataset is None: 177 | summary['first_sample_template'] = None 178 | summary['duration_template'] = None 179 | summary['first_sample_complement'] = None 180 | summary['duration_complement'] = None 181 | return 182 | if summary.get('start_event_template', -1) >= 0: 183 | ev1 = summary['start_event_template'] 184 | ev2 = summary['end_event_template'] 185 | summary['first_sample_template'] = dataset[ev1]['start'] - read_info.start_time 186 | end = dataset[ev2-1]['start'] + dataset[ev2-1]['length'] - read_info.start_time 187 | summary['duration_template'] = end - summary['first_sample_template'] 188 | if summary.get('start_event_complement', -1) >= 0: 189 | ev1 = summary['start_event_complement'] 190 | ev2 = summary['end_event_complement'] 191 | summary['first_sample_complement'] = dataset[ev1]['start'] - read_info.start_time 192 | end = dataset[ev2-1]['start'] + dataset[ev2-1]['length'] - read_info.start_time 193 | summary['duration_complement'] = end - summary['first_sample_complement'] 194 | -------------------------------------------------------------------------------- /ont_fast5_api/compression_settings.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | 3 | 4 | def register_plugin(): 5 | plugin_path = pkg_resources.resource_filename('ont_fast5_api', 'vbz_plugin') 6 | try: 7 | from h5py import h5pl 8 | h5pl.prepend(bytes(plugin_path, 'UTF-8')) 9 | except (ImportError, AttributeError): 10 | # We don't have the plugin library in h5py<2.10 so we fall back on an environment variable 11 | import os 12 | os.environ['HDF5_PLUGIN_PATH'] = plugin_path 13 | return plugin_path 14 | 15 | 16 | class AbstractCompression: 17 | compression = "AbstractCompression" 18 | compression_opts = () 19 | shuffle = False 20 | scaleoffset = False 21 | fletcher32 = False 22 | 23 | def __repr__(self): 24 | return self.compression 25 | 26 | @property 27 | def filter_settings(self): 28 | return {} 29 | 30 | 31 | class VbzCompressionV1Alpha(AbstractCompression): 32 | def __init__(self): 33 | self.compression = 32020 # https://portal.hdfgroup.org/display/support/Registered+Filters 34 | self.compression_opts = (1, 2, 1, 1) # VBZ_VERSION, VBZ_PACKING, VBZ_ZIG_ZAG, VBZ_ZSTD_COMPRESSION 35 | 36 | def __repr__(self): 37 | return "vbz_v1.alpha" 38 | 39 | @property 40 | def filter_settings(self): 41 | return {str(self.compression): self.compression_opts} 42 | 43 | 44 | class VbzCompression(AbstractCompression): 45 | def __init__(self): 46 | self.compression = 32020 # https://portal.hdfgroup.org/display/support/Registered+Filters 47 | self.compression_opts = (0, 2, 1, 1) # VBZ_VERSION, VBZ_PACKING, VBZ_ZIG_ZAG, VBZ_ZSTD_COMPRESSION 48 | 49 | def __repr__(self): 50 | return "vbz" 51 | 52 | @property 53 | def filter_settings(self): 54 | return {str(self.compression): self.compression_opts} 55 | 56 | 57 | class GzipCompression(AbstractCompression): 58 | def __init__(self): 59 | self.compression = "gzip" 60 | self.compression_opts = 1 61 | 62 | @property 63 | def filter_settings(self): 64 | return {str(self.compression): self.compression_opts} 65 | 66 | 67 | VBZ_ERROR_MESSAGE = "Failed to read compressed raw data. 
" \ 68 | "VBZ compression filter (id=32020) may be missing from expected path: '{}'" 69 | 70 | 71 | def raise_missing_vbz_error_read(err): 72 | if str(VBZ.compression) in str(err): 73 | raise IOError(VBZ_ERROR_MESSAGE.format(register_plugin())) from err 74 | # If we don't see anything relating to VBZ just raise the existing error without additional info 75 | raise 76 | 77 | 78 | def raise_missing_vbz_error_write(err): 79 | if type(err) is OSError and "Can't read data" in str(err): 80 | raise IOError(VBZ_ERROR_MESSAGE.format(register_plugin())) from err 81 | # If we don't see anything relating to VBZ just raise the existing error without additional info 82 | raise 83 | 84 | 85 | VBZ_ALPHA = VbzCompressionV1Alpha() 86 | VBZ = VbzCompression() 87 | GZIP = GzipCompression() 88 | 89 | COMPRESSION_MAP = {str(comp): comp for comp in (VBZ_ALPHA, VBZ, GZIP)} 90 | -------------------------------------------------------------------------------- /ont_fast5_api/conversion_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/ont_fast5_api/conversion_tools/__init__.py -------------------------------------------------------------------------------- /ont_fast5_api/conversion_tools/check_file_compression.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from ont_fast5_api.compression_settings import COMPRESSION_MAP 4 | from ont_fast5_api.conversion_tools.conversion_utils import yield_fast5_files 5 | from ont_fast5_api.fast5_interface import get_fast5_file 6 | 7 | 8 | def check_read_compression(read): 9 | """ 10 | Check the compresion type on the raw data of a read 11 | :param read: Fast5Read object 12 | :return: AbstractCompression object 13 | """ 14 | detected_compression = read.raw_compression_filters 15 | for compression in COMPRESSION_MAP.values(): 16 | if compression.filter_settings == detected_compression: 17 | return compression 18 | return detected_compression 19 | 20 | 21 | def check_compression(input_path, recursive, follow_symlinks, check_all_reads): 22 | """ 23 | Check the compression type of the raw data in files in a folder 24 | :param input_path: 25 | :param recursive: 26 | :param follow_symlinks: 27 | :param check_all_reads: bool - check all reads in a file or just the first 28 | :return: (Compression, read_id, file_path) 29 | """ 30 | for input_file in yield_fast5_files(input_path, recursive, follow_symlinks): 31 | with get_fast5_file(input_file, 'r') as f5: 32 | for read in f5.get_reads(): 33 | compression = check_read_compression(read) 34 | yield (compression, read.read_id, input_file) 35 | if not check_all_reads: 36 | break 37 | 38 | 39 | def main(): 40 | parser = ArgumentParser("Tool for checking the compression type of raw data in fast5 files") 41 | parser.add_argument('-i', '--input_path', required=True, 42 | help="Path to Fast5 file or directory of Fast5 files") 43 | parser.add_argument('--check_all_reads', action='store_true', required=False, default=False, 44 | help="Check all reads in a file individually (default: check only the first read)") 45 | parser.add_argument('-r', '--recursive', action='store_true', required=False, default=False, 46 | help="Search recursively through folders for MultiRead fast5 files") 47 | parser.add_argument('--ignore_symlinks', action='store_true', 48 | help="Ignore symlinks when searching recursively for fast5 files") 49 | 
parser.add_argument('--file_list', required=False, 50 | help="File containing names of files to search in") 51 | args = parser.parse_args() 52 | compression_results = check_compression(args.input_path, args.recursive, not args.ignore_symlinks, 53 | args.check_all_reads) 54 | for result in compression_results: 55 | print(result) 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /ont_fast5_api/conversion_tools/compress_fast5.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | from argparse import ArgumentParser, ArgumentError 5 | from multiprocessing.pool import Pool 6 | 7 | from ont_fast5_api import __version__ 8 | from ont_fast5_api.compression_settings import COMPRESSION_MAP 9 | from ont_fast5_api.conversion_tools.conversion_utils import get_fast5_file_list, get_progress_bar 10 | from ont_fast5_api.fast5_file import Fast5File, EmptyFast5 11 | from ont_fast5_api.fast5_read import copy_attributes 12 | from ont_fast5_api.fast5_interface import is_multi_read 13 | from ont_fast5_api.multi_fast5 import MultiFast5File 14 | from ont_fast5_api.static_data import OPTIONAL_READ_GROUPS 15 | 16 | 17 | def compress_batch(input_folder, output_folder, target_compression, recursive=True, threads=1, follow_symlinks=True, 18 | in_place=False, sanitize=False): 19 | # We require an absolute input path to we can replicate the data structure relative to it later on 20 | input_folder = os.path.abspath(input_folder) 21 | 22 | file_list = get_fast5_file_list(input_folder, recursive, follow_symlinks=follow_symlinks) 23 | if len(file_list) == 0: 24 | raise ValueError("No input fast5 files found in '{}'. Recursive={}".format(input_folder, recursive)) 25 | 26 | # Set up the process pool and the progressbar 27 | pool = Pool(min(threads, len(file_list))) 28 | pbar = get_progress_bar(len(file_list)) 29 | 30 | def update(result): 31 | if in_place and result is not None: 32 | input_path, output_path = result 33 | shutil.move(output_path, input_path) 34 | pbar.update(pbar.currval + 1) 35 | 36 | for input_file in file_list: 37 | input_path = os.path.join(input_folder, input_file) 38 | if in_place: 39 | output_path = os.path.join(input_path + ".tmp.compressed") 40 | else: 41 | output_path = os.path.join(output_folder, os.path.relpath(input_path, input_folder)) 42 | 43 | pool.apply_async(func=compress_file, 44 | args=(input_path, output_path, target_compression, sanitize), 45 | callback=update) 46 | 47 | # Tear down the process pool and pbar. We can't use contextmanagers since we need to close() then join() 48 | pool.close() 49 | pool.join() 50 | pbar.finish() 51 | 52 | 53 | def compress_file(input_file, output_file, target_compression, sanitize=False): 54 | try: 55 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 56 | if is_multi_read(input_file): 57 | with MultiFast5File(input_file, 'r') as input_f5, MultiFast5File(output_file, 'a') as output_f5: 58 | for read in input_f5.get_reads(): 59 | output_f5.add_existing_read(read, target_compression, sanitize=sanitize) 60 | else: 61 | with Fast5File(input_file, 'r') as input_f5, \ 62 | EmptyFast5(output_file, 'a') as output_f5: 63 | compress_single_read(output_f5, input_f5, target_compression, sanitize=sanitize) 64 | except Exception as e: 65 | # Error raised in Pool.async will be lost so we explicitly print them. 
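# compress_batch above can also be driven directly from Python instead of the
# compress_fast5 entry point. A minimal sketch; the folder names are hypothetical.
from ont_fast5_api.compression_settings import VBZ
from ont_fast5_api.conversion_tools.compress_fast5 import compress_batch

# Re-compress every fast5 found under the input folder to VBZ, reproducing the
# same relative layout under the output folder.
compress_batch(input_folder="fast5_gzip", output_folder="fast5_vbz",
               target_compression=VBZ, recursive=True, threads=4)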
66 | logging.exception(e) 67 | raise 68 | return (input_file, output_file) 69 | 70 | 71 | def compress_single_read(output_f5, read_to_copy, target_compression, sanitize=False): 72 | read_id = read_to_copy.get_read_id() 73 | raw_dataset_name = read_to_copy.raw_dataset_name 74 | raw_group_name = read_to_copy.raw_dataset_group_name 75 | read_name = "read_" + read_id 76 | # Recreating the status object is painful, but doesn't actually interact with the file so we can just reference it. 77 | output_f5.status = read_to_copy.status 78 | 79 | if str(target_compression) in read_to_copy.raw_compression_filters: 80 | # If we have the right compression then no need for doing anything fancy 81 | output_f5.handle.copy(read_to_copy.handle, read_name) 82 | else: 83 | copy_attributes(read_to_copy.handle.attrs, output_f5.handle) 84 | for subgroup in read_to_copy.handle: 85 | if subgroup not in raw_dataset_name: 86 | if sanitize and subgroup in OPTIONAL_READ_GROUPS: 87 | # skip optional groups when sanitizing 88 | continue 89 | output_f5.handle.copy(read_to_copy.handle[subgroup], subgroup) 90 | else: 91 | raw_attrs = read_to_copy.handle[raw_group_name].attrs 92 | raw_data = read_to_copy.handle[raw_dataset_name] 93 | output_f5.add_raw_data(raw_data, raw_attrs, compression=target_compression) 94 | 95 | 96 | def main(): 97 | parser = ArgumentParser("Tool for changing the compression of Fast5 files") 98 | parser.add_argument('-i', '--input_path', required=True, 99 | help='Folder containing fast5 files') 100 | 101 | output_group = parser.add_mutually_exclusive_group(required=True) 102 | save_arg = output_group.add_argument('-s', '--save_path', default=None, 103 | help="Folder to output fast5 read files to") 104 | output_group.add_argument('--in_place', action='store_true', 105 | help='Replace the old files with new files in place') 106 | 107 | parser.add_argument('-c', '--compression', required=True, choices=list(COMPRESSION_MAP.keys()), 108 | help="Target output compression type") 109 | parser.add_argument('--sanitize', action='store_true', 110 | help="Clean output files of optional groups and datasets (e.g. 
'Analyses')") 111 | parser.add_argument('-t', '--threads', type=int, default=1, required=False, 112 | help="Maximum number of threads to use") 113 | parser.add_argument('--recursive', action='store_true', 114 | help="Search recursively through folders for single_read fast5 files") 115 | parser.add_argument('--ignore_symlinks', action='store_true', 116 | help="Ignore symlinks when searching recursively for fast5 files") 117 | parser.add_argument('-v', '--version', action='version', version=__version__) 118 | args = parser.parse_args() 119 | 120 | if args.input_path == args.save_path: 121 | raise ArgumentError(save_arg, "--input_path and --save_path must be different locations, or use --in_place") 122 | if args.sanitize and args.save_path is None: 123 | raise ArgumentError(save_arg, "--save_path must be given if using --sanitize") 124 | 125 | compress_batch(input_folder=args.input_path, 126 | output_folder=args.save_path, 127 | target_compression=COMPRESSION_MAP[args.compression], 128 | threads=args.threads, 129 | recursive=args.recursive, 130 | follow_symlinks=not args.ignore_symlinks, 131 | in_place=args.in_place, 132 | sanitize=args.sanitize) 133 | 134 | 135 | if __name__ == '__main__': 136 | main() 137 | -------------------------------------------------------------------------------- /ont_fast5_api/conversion_tools/demux_fast5.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script for binning fast5 reads into separate directories based on column value in summary file 3 | Inteded for demultiplexing reads using barcoding summary file. 4 | """ 5 | from pathlib import Path 6 | from typing import Union, Dict, Set, List 7 | from multiprocessing import Pool 8 | import logging 9 | from csv import reader 10 | from collections import defaultdict 11 | from time import sleep 12 | from math import ceil 13 | from argparse import ArgumentParser 14 | 15 | from ont_fast5_api.compression_settings import COMPRESSION_MAP 16 | from ont_fast5_api.conversion_tools.conversion_utils import ( 17 | get_fast5_file_list, 18 | get_progress_bar, 19 | Fast5FilterWorker, 20 | READS_PER_FILE, 21 | FILENAME_BASE, 22 | ProgressBar, 23 | ) 24 | 25 | DEMULTIPLEX_COLUMN = "barcode_arrangement" 26 | READ_ID_COLUMN = "read_id" 27 | 28 | 29 | class Fast5Demux: 30 | """ 31 | Bin reads from directory of fast5 files according to demultiplex_column in sequencing_summary path 32 | :param input_dir: Path to input Fast5 file or directory of Fast5 files 33 | :param output_dir: Path to output directory 34 | :param summary_file: Path to TSV summary file 35 | :param demultiplex_column: str name of column with demultiplex values 36 | :param read_id_column: str name of column with read ids 37 | :param filename_base: str prefix for output Fast5 files 38 | :param batch_size: int maximum number of reads per output file 39 | :param threads: int maximum number of worker processes 40 | :param recursive: bool flag to search recursively through input_dir for Fast5 files 41 | :param follow_symlinks: bool flag to follow symlinks in input_dir 42 | :param target_compression: str compression type in output Fast5 files 43 | """ 44 | 45 | def __init__( 46 | self, 47 | input_dir: Path, 48 | output_dir: Path, 49 | summary_file: Path, 50 | demultiplex_column: str, 51 | read_id_column: str = READ_ID_COLUMN, 52 | filename_base: str = FILENAME_BASE, 53 | batch_size: int = READS_PER_FILE, 54 | threads: int = 1, 55 | recursive: bool = False, 56 | follow_symlinks: bool = True, 57 | target_compression: Union[str, 
None] = None, 58 | ): 59 | self.input_dir = input_dir 60 | self.output_dir = output_dir 61 | self.summary = summary_file 62 | self.demultiplex_column = demultiplex_column 63 | self.read_id_column = read_id_column 64 | self.filename_base = filename_base 65 | self.batch_size = batch_size 66 | self.threads = threads 67 | self.recursive = recursive 68 | self.follow_symlinks = follow_symlinks 69 | self.target_compression = target_compression 70 | 71 | self.read_sets: Dict[str, Set[str]] = {} 72 | self.input_fast5s: List[Path] = [] 73 | self.max_threads: int = 0 74 | self.workers: List = [] 75 | self.progressbar: Union[ProgressBar, None] = None 76 | self.logger: logging.Logger = logging.getLogger(self.__class__.__name__) 77 | 78 | def create_output_dirs(self) -> None: 79 | """ 80 | In output directory create a subdirectory per demux category 81 | :return: 82 | """ 83 | self.output_dir.mkdir(parents=True, exist_ok=True) 84 | for demux in self.read_sets: 85 | out_dir = self.output_dir / demux 86 | out_dir.mkdir(exist_ok=True) 87 | 88 | def run_batch(self) -> None: 89 | """ 90 | Run workers in pool or sequentially 91 | Starts multiprocessing pool if max_threads allows it 92 | :return: 93 | """ 94 | self.workers_setup() 95 | 96 | if self.max_threads > 1: 97 | with Pool(self.max_threads) as pool: 98 | for worker in self.workers: 99 | worker.run_batch(pool=pool) 100 | while any(worker.tasks for worker in self.workers): 101 | sleep(1) 102 | 103 | pool.join() 104 | pool.close() 105 | else: 106 | for worker in self.workers: 107 | worker.run_batch(pool=None) 108 | 109 | self.progressbar.finish() 110 | 111 | def workers_setup(self) -> None: 112 | """ 113 | Parse input summary and input file list to determine amount of work 114 | Create output directories and initialise workers 115 | :return: 116 | """ 117 | self.read_sets = self.parse_summary_demultiplex() 118 | self.input_fast5s = get_fast5_file_list( 119 | input_path=self.input_dir, 120 | recursive=self.recursive, 121 | follow_symlinks=self.follow_symlinks, 122 | ) 123 | self.max_threads = self.calculate_max_threads() 124 | # progressbar length is total numbers of reads to be extracted plus total number of files to be read 125 | total_progress = sum(len(item) for item in self.read_sets.values()) + ( 126 | len(self.input_fast5s) * len(self.read_sets) 127 | ) 128 | self.progressbar = get_progress_bar(num_reads=total_progress) 129 | self.create_output_dirs() 130 | for demux in sorted(self.read_sets): 131 | self.workers.append( 132 | Fast5FilterWorker( 133 | input_file_list=self.input_fast5s, 134 | output_dir=self.output_dir / demux, 135 | read_set=self.read_sets[demux], 136 | progressbar=self.progressbar, 137 | logger=self.logger, 138 | filename_base=self.filename_base, 139 | batch_size=self.batch_size, 140 | target_compression=self.target_compression, 141 | ) 142 | ) 143 | 144 | def report(self) -> None: 145 | """ 146 | Log summary of work done 147 | :return: 148 | """ 149 | total_reads = 0 150 | for idx, _ in enumerate(sorted(self.read_sets)): 151 | worker = self.workers[idx] 152 | for file, reads in worker.out_files.items(): 153 | total_reads += len(reads) 154 | 155 | self.logger.info("{} reads extracted".format(total_reads)) 156 | 157 | # report reads not found 158 | reads_to_extract = sum(len(item) for item in self.read_sets.values()) 159 | if reads_to_extract > total_reads: 160 | self.logger.warning( 161 | "{} reads not found!".format(reads_to_extract - total_reads) 162 | ) 163 | 164 | def calculate_max_threads(self) -> int: 165 | """ 166 | Calculate 
max number of workers based on number of output files, input files and threads argument 167 | :return: int 168 | """ 169 | max_inputs_per_worker = len(self.input_fast5s) 170 | total_outputs = 0 171 | for read_set in self.read_sets.values(): 172 | outputs = int(ceil(len(read_set) / float(self.batch_size))) 173 | total_outputs += min(outputs, max_inputs_per_worker) 174 | 175 | return min(self.threads, total_outputs) 176 | 177 | def parse_summary_demultiplex(self) -> Dict[str, Set[str]]: 178 | """ 179 | Open a TSV file and parse read_id and demultiplex columns into dict {demultiplex: read_id_set} 180 | :return: 181 | """ 182 | read_sets = defaultdict(set) 183 | with open(str(self.summary), "r") as fh: 184 | read_list_tsv = reader(fh, delimiter="\t") 185 | header = next(read_list_tsv) 186 | 187 | if self.read_id_column in header: 188 | read_id_col_idx = header.index(self.read_id_column) 189 | else: 190 | raise ValueError( 191 | "No '{}' read_id column in header: {}".format( 192 | self.read_id_column, header 193 | ) 194 | ) 195 | 196 | if self.demultiplex_column in header: 197 | demultiplex_col_idx = header.index(self.demultiplex_column) 198 | else: 199 | raise ValueError( 200 | "No '{}' demultiplex column in header: {}".format( 201 | self.demultiplex_column, header 202 | ) 203 | ) 204 | 205 | for line in read_list_tsv: 206 | read_id = line[read_id_col_idx] 207 | demux = line[demultiplex_col_idx] 208 | read_sets[demux].add(read_id) 209 | 210 | return read_sets 211 | 212 | 213 | def create_arg_parser(): 214 | parser = ArgumentParser( 215 | "Tool for binning reads from a multi_read_fast5_file by column value in summary file" 216 | ) 217 | parser.add_argument( 218 | "-i", 219 | "--input", 220 | required=True, 221 | type=Path, 222 | help="Path to Fast5 file or directory of Fast5 files", 223 | ) 224 | parser.add_argument( 225 | "-s", 226 | "--save_path", 227 | required=True, 228 | type=Path, 229 | help="Directory to output MultiRead subset to", 230 | ) 231 | parser.add_argument( 232 | "-l", 233 | "--summary_file", 234 | required=True, 235 | type=Path, 236 | help="TSV file containing read_id column (sequencing_summary.txt file)", 237 | ) 238 | parser.add_argument( 239 | "-f", 240 | "--filename_base", 241 | default="batch", 242 | required=False, 243 | help="Root of output filename, default='{}' -> '{}0.fast5'".format( 244 | FILENAME_BASE, FILENAME_BASE 245 | ), 246 | ) 247 | parser.add_argument( 248 | "-n", 249 | "--batch_size", 250 | type=int, 251 | default=READS_PER_FILE, 252 | required=False, 253 | help="Number of reads per multi-read file (default {})".format(READS_PER_FILE), 254 | ) 255 | parser.add_argument( 256 | "-t", 257 | "--threads", 258 | type=int, 259 | default=1, 260 | required=False, 261 | help="Maximum number of parallel processes to use (default 1)", 262 | ) 263 | parser.add_argument( 264 | "-r", 265 | "--recursive", 266 | action="store_true", 267 | required=False, 268 | default=False, 269 | help="Flag to search recursively through input directory for MultiRead fast5 files", 270 | ) 271 | parser.add_argument( 272 | "--ignore_symlinks", 273 | action="store_true", 274 | help="Ignore symlinks when searching recursively for fast5 files", 275 | ) 276 | parser.add_argument( 277 | "-c", 278 | "--compression", 279 | required=False, 280 | default=None, 281 | choices=list(COMPRESSION_MAP.keys()) + [None], 282 | help="Target output compression type. 
If omitted - don't change compression type", 283 | ) 284 | parser.add_argument( 285 | "--demultiplex_column", 286 | type=str, 287 | default=DEMULTIPLEX_COLUMN, 288 | required=False, 289 | help="Name of column for demultiplexing in summary file (default '{}'".format( 290 | DEMULTIPLEX_COLUMN 291 | ), 292 | ) 293 | parser.add_argument( 294 | "--read_id_column", 295 | type=str, 296 | default=READ_ID_COLUMN, 297 | required=False, 298 | help="Name of read_id column in summary file (default '{}'".format( 299 | READ_ID_COLUMN 300 | ), 301 | ) 302 | return parser 303 | 304 | 305 | def main(): 306 | parser = create_arg_parser() 307 | args = parser.parse_args() 308 | if args.compression is not None: 309 | args.compression = COMPRESSION_MAP[args.compression] 310 | 311 | demux = Fast5Demux( 312 | input_dir=args.input, 313 | output_dir=args.save_path, 314 | summary_file=args.summary_file, 315 | demultiplex_column=args.demultiplex_column, 316 | read_id_column=args.read_id_column, 317 | filename_base=args.filename_base, 318 | batch_size=args.batch_size, 319 | threads=args.threads, 320 | recursive=args.recursive, 321 | follow_symlinks=not args.ignore_symlinks, 322 | target_compression=args.compression, 323 | ) 324 | demux.run_batch() 325 | demux.report() 326 | 327 | 328 | if __name__ == "__main__": 329 | main() 330 | -------------------------------------------------------------------------------- /ont_fast5_api/conversion_tools/fast5_subset.py: -------------------------------------------------------------------------------- 1 | """Filter Fast5 files based on read_id list 2 | """ 3 | import csv 4 | import logging 5 | from argparse import ArgumentParser 6 | from math import ceil 7 | from multiprocessing import Pool 8 | from os import makedirs, path 9 | from pathlib import Path 10 | from time import sleep 11 | 12 | from ont_fast5_api.compression_settings import COMPRESSION_MAP 13 | from ont_fast5_api.conversion_tools.conversion_utils import get_fast5_file_list, get_progress_bar, Fast5FilterWorker 14 | from ont_fast5_api.conversion_tools.conversion_utils import READS_PER_FILE, FILENAME_BASE 15 | 16 | logging.basicConfig(level=logging.DEBUG) 17 | 18 | 19 | class Fast5Filter: 20 | """ 21 | Extract reads listed read_list_file from fast5 files in input_folder, write to multi-fast5 files in 22 | output_folder 23 | """ 24 | 25 | def __init__(self, input_folder, output_folder, read_list_file, filename_base=FILENAME_BASE, 26 | batch_size=READS_PER_FILE, threads=1, recursive=False, file_list_file=None, follow_symlinks=True, 27 | target_compression=None): 28 | assert path.isdir(input_folder) 29 | assert path.isfile(read_list_file) 30 | assert isinstance(filename_base, str) 31 | assert isinstance(batch_size, int) 32 | assert isinstance(threads, int) 33 | assert isinstance(recursive, bool) 34 | self.logger = logging.getLogger(self.__class__.__name__) 35 | 36 | self.read_set = parse_summary_file(read_list_file) 37 | self.input_f5s = get_fast5_file_list(str(input_folder), recursive, follow_symlinks=follow_symlinks) 38 | makedirs(output_folder, exist_ok=True) 39 | 40 | if len(self.read_set) < 1: 41 | raise ValueError("No reads in read list file {}".format(read_list_file)) 42 | 43 | if len(self.input_f5s) < 1: 44 | raise ValueError( 45 | "No input fast5 files found in {}. 
Recursion is set to {}".format(str(input_folder), recursive)) 46 | 47 | if batch_size < 1: 48 | raise ValueError("Batch size (--batch_size) must be a positive integer, not {}".format(batch_size)) 49 | 50 | if threads < 1: 51 | raise ValueError("Max number of threads (--threads) must be a positive integer, not {}".format(threads)) 52 | 53 | if file_list_file: 54 | file_set = parse_summary_file(file_list_file) 55 | for file in file_set: 56 | assert path.exists(file), "{} from file list doesn't exist".format(file) 57 | self.input_f5s = list(file_set.intersection(self.input_f5s)) 58 | 59 | # determine max number of workers 60 | num_outputs = int(ceil(len(self.read_set) / float(batch_size))) 61 | self.num_workers = min(threads, num_outputs, len(self.input_f5s)) 62 | 63 | # progressbar total is number of reads in read_set plus number of input files 64 | # (to see progress while scanning files that don't have any relevant reads) 65 | self.pbar = get_progress_bar(len(self.read_set) + len(self.input_f5s)) 66 | 67 | self.worker = Fast5FilterWorker( 68 | input_file_list=self.input_f5s, 69 | output_dir=Path(output_folder), 70 | logger=self.logger, 71 | progressbar=self.pbar, 72 | read_set=self.read_set, 73 | filename_base=filename_base, 74 | batch_size=batch_size, 75 | target_compression=target_compression 76 | ) 77 | 78 | def run_batch(self): 79 | 80 | if self.num_workers == 1: 81 | self.worker.run_batch(pool=None) 82 | else: 83 | with Pool(self.num_workers) as pool: 84 | self.worker.run_batch(pool=pool) 85 | 86 | while self.worker.tasks: 87 | sleep(1) 88 | 89 | pool.close() 90 | pool.join() 91 | 92 | self.pbar.finish() 93 | self.logger.info("{} reads extracted".format(sum(len(v) for v in self.worker.out_files.values()))) 94 | 95 | # report reads not found 96 | if len(self.worker.read_set) > 0: 97 | self.logger.warning("{} reads not found!".format(len(self.worker.read_set))) 98 | 99 | 100 | def parse_summary_file(read_list_file): 101 | """ 102 | Opens a text file and returns set of read_ids 103 | Expects either a single column file where every line is read_id or 104 | multi-column Tab-separated CSV, that contains a column read_id 105 | :param read_list_file: path to file 106 | :return: set 107 | """ 108 | reads = set() 109 | with open(str(read_list_file), 'r') as fh: 110 | read_list_tsv = csv.reader(fh, delimiter='\t') 111 | header = next(read_list_tsv) 112 | 113 | if "read_id" in header: 114 | col_idx = header.index("read_id") 115 | else: 116 | if len(header) == 1: 117 | reads.add(header[0].strip()) 118 | col_idx = 0 119 | else: 120 | raise TypeError("multi-column file without 'read_id' column") 121 | 122 | for line in read_list_tsv: 123 | reads.add(line[col_idx].strip()) 124 | if len(reads) < 1: 125 | raise ValueError("No reads in read list file {}".format(read_list_file)) 126 | return reads 127 | 128 | 129 | def main(): 130 | parser = ArgumentParser("Tool for extracting reads from a multi_read_fast5_file by read_id") 131 | parser.add_argument('-i', '--input', required=True, 132 | help="Path to Fast5 file or directory of Fast5 files") 133 | parser.add_argument('-s', '--save_path', required=True, 134 | help="Folder to output MultiRead subset to") 135 | parser.add_argument('-l', '--read_id_list', required=True, 136 | help="File containing list of read ids to extract (or sequencing_summary.txt file)") 137 | parser.add_argument('-f', '--filename_base', default=FILENAME_BASE, required=False, 138 | help="Root of output filename, default='{}' -> '{}0.fast5'".format(FILENAME_BASE, FILENAME_BASE)) 139 | 
parser.add_argument('-n', '--batch_size', type=int, default=READS_PER_FILE, required=False, 140 | help="Number of reads per multi-read file (default {}".format(READS_PER_FILE)) 141 | parser.add_argument('-t', '--threads', type=int, default=1, required=False, 142 | help="Maximum number of threads to use") 143 | parser.add_argument('-r', '--recursive', action='store_true', required=False, default=False, 144 | help="Search recursively through folders for MultiRead fast5 files") 145 | parser.add_argument('--ignore_symlinks', action='store_true', 146 | help="Ignore symlinks when searching recursively for fast5 files") 147 | parser.add_argument('-c', '--compression', required=False, default=None, 148 | choices=list(COMPRESSION_MAP.keys()) + [None], help="Target output compression type") 149 | parser.add_argument('--file_list', required=False, 150 | help="File containing names of files to search in") 151 | args = parser.parse_args() 152 | 153 | if args.compression is not None: 154 | args.compression = COMPRESSION_MAP[args.compression] 155 | 156 | multifilter = Fast5Filter(input_folder=args.input, 157 | output_folder=args.save_path, 158 | filename_base=args.filename_base, 159 | read_list_file=args.read_id_list, 160 | batch_size=args.batch_size, 161 | threads=args.threads, 162 | recursive=args.recursive, 163 | file_list_file=args.file_list, 164 | follow_symlinks=not args.ignore_symlinks, 165 | target_compression=args.compression) 166 | 167 | multifilter.run_batch() 168 | 169 | 170 | if __name__ == '__main__': 171 | main() 172 | -------------------------------------------------------------------------------- /ont_fast5_api/conversion_tools/multi_to_single_fast5.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from multiprocessing import Pool 3 | import logging 4 | import os 5 | 6 | from ont_fast5_api import __version__ 7 | from ont_fast5_api.conversion_tools.conversion_utils import get_fast5_file_list, get_progress_bar 8 | from ont_fast5_api.fast5_file import EmptyFast5, Fast5FileTypeError 9 | from ont_fast5_api.fast5_interface import check_file_type, MULTI_READ 10 | from ont_fast5_api.multi_fast5 import MultiFast5File 11 | 12 | logging.basicConfig(level=logging.INFO) 13 | logger = logging.getLogger(__name__) 14 | exc_info = False 15 | 16 | 17 | def batch_convert_multi_files_to_single(input_path, output_folder, threads, recursive, follow_symlinks): 18 | pool = Pool(threads) 19 | file_list = get_fast5_file_list(input_path, recursive, follow_symlinks=follow_symlinks) 20 | pbar = get_progress_bar(len(file_list)) 21 | 22 | def update(result): 23 | input_file = result[0] 24 | with open(os.path.join(output_folder, "filename_mapping.txt"), 'a') as output_table: 25 | for filename in result[1]: 26 | output_table.write("{}\t{}\n".format(input_file, filename)) 27 | pbar.update(pbar.currval + 1) 28 | 29 | if not os.path.exists(output_folder): 30 | os.makedirs(output_folder) 31 | 32 | results_array = [] 33 | for batch_num, filename in enumerate(file_list): 34 | results_array.append(pool.apply_async(convert_multi_to_single, 35 | args=(filename, output_folder, 36 | str(batch_num)), 37 | callback=update)) 38 | 39 | pool.close() 40 | pool.join() 41 | pbar.finish() 42 | 43 | 44 | def convert_multi_to_single(input_file, output_folder, subfolder): 45 | output_files = () 46 | try: 47 | output_files = try_multi_to_single_conversion(input_file, output_folder, subfolder) 48 | except Exception as e: 49 | logger.error("{}\n\tFailed to copy files 
from: {}" 50 | "".format(e, input_file), exc_info=exc_info) 51 | return input_file, output_files 52 | 53 | 54 | def try_multi_to_single_conversion(input_file, output_folder, subfolder): 55 | output_files = [] 56 | with MultiFast5File(input_file, 'r') as multi_f5: 57 | file_type = check_file_type(multi_f5) 58 | if file_type != MULTI_READ: 59 | raise Fast5FileTypeError("Could not convert Multi->Single for file type '{}' with path '{}'" 60 | "".format(file_type, input_file)) 61 | for read in multi_f5.get_reads(): 62 | try: 63 | output_file = os.path.join(output_folder, subfolder, "{}.fast5".format(read.read_id)) 64 | create_single_f5(output_file, read) 65 | output_files.append(os.path.basename(output_file)) 66 | except Exception as e: 67 | logger.error("{}\n\tFailed to copy read '{}' from {}" 68 | "".format(str(e), read.read_id, input_file), exc_info=exc_info) 69 | return output_files 70 | 71 | 72 | def create_single_f5(output_file, read): 73 | if not os.path.exists(os.path.dirname(output_file)): 74 | os.makedirs(os.path.dirname(output_file)) 75 | with EmptyFast5(output_file, 'w') as single_f5: 76 | for group in read.handle: 77 | if group == "Raw": 78 | read_number = read.handle["Raw"].attrs["read_number"] 79 | single_f5.handle.copy(read.handle[group], "Raw/Reads/Read_{}".format(read_number)) 80 | elif group in ("channel_id", "context_tags", "tracking_id"): 81 | if "UniqueGlobalKey" not in single_f5.handle: 82 | single_f5.handle.create_group("UniqueGlobalKey") 83 | single_f5.handle.copy(read.handle[group], "UniqueGlobalKey/{}".format(group)) 84 | else: 85 | single_f5.handle.copy(read.handle[group], group) 86 | 87 | 88 | def main(): 89 | parser = ArgumentParser("") 90 | parser.add_argument('-i', '--input_path', required=True, 91 | help="MultiRead fast5 file or path to directory of MultiRead files") 92 | parser.add_argument('-s', '--save_path', required=True, 93 | help="Folder to output SingleRead fast5 files to") 94 | parser.add_argument('--recursive', action='store_true', 95 | help="Search recursively through folders for MultiRead fast5 files") 96 | parser.add_argument('--ignore_symlinks', action='store_true', 97 | help="Ignore symlinks when searching recursively for fast5 files") 98 | parser.add_argument('-t', '--threads', type=int, default=1, required=False, 99 | help="Number of threads to use") 100 | parser.add_argument('-v', '--version', action='version', version=__version__) 101 | args = parser.parse_args() 102 | 103 | batch_convert_multi_files_to_single(args.input_path, args.save_path, args.threads, 104 | args.recursive, follow_symlinks=not args.ignore_symlinks) 105 | 106 | 107 | if __name__ == '__main__': 108 | main() 109 | -------------------------------------------------------------------------------- /ont_fast5_api/conversion_tools/single_to_multi_fast5.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from argparse import ArgumentParser 4 | from multiprocessing import Pool 5 | 6 | from ont_fast5_api import __version__ 7 | from ont_fast5_api.compression_settings import COMPRESSION_MAP 8 | from ont_fast5_api.conversion_tools.conversion_utils import get_fast5_file_list, batcher, get_progress_bar 9 | from ont_fast5_api.fast5_file import Fast5File, Fast5FileTypeError 10 | from ont_fast5_api.multi_fast5 import MultiFast5File 11 | 12 | logging.basicConfig(level=logging.INFO) 13 | logger = logging.getLogger(__name__) 14 | exc_info = False 15 | 16 | 17 | def batch_convert_single_to_multi(input_path, output_folder, 
filename_base, batch_size, 18 | threads, recursive, follow_symlinks, target_compression): 19 | pool = Pool(threads) 20 | file_list = get_fast5_file_list(input_path, recursive, follow_symlinks) 21 | pbar = get_progress_bar(int((len(file_list) + batch_size - 1) / batch_size)) 22 | 23 | def update(result): 24 | output_file = result[1] 25 | with open(os.path.join(output_folder, "filename_mapping.txt"), 'a') as output_table: 26 | for filename in result[0]: 27 | output_table.write("{}\t{}\n".format(filename, output_file)) 28 | pbar.update(pbar.currval + 1) 29 | 30 | results_array = [] 31 | os.makedirs(output_folder, exist_ok=True) 32 | for batch_num, batch in enumerate(batcher(file_list, batch_size)): 33 | output_file = os.path.join(output_folder, "{}_{}.fast5".format(filename_base, batch_num)) 34 | results_array.append(pool.apply_async(create_multi_read_file, 35 | args=(batch, output_file, target_compression), 36 | callback=update)) 37 | 38 | pool.close() 39 | pool.join() 40 | pbar.finish() 41 | 42 | 43 | def create_multi_read_file(input_files, output_file, target_compression): 44 | results = [] 45 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 46 | if os.path.exists(output_file): 47 | logger.info("FileExists - appending new reads to existing file: {}".format(output_file)) 48 | try: 49 | with MultiFast5File(output_file, 'a') as multi_f5: 50 | for filename in input_files: 51 | try: 52 | with Fast5File(filename, 'r') as f5_input: 53 | read = f5_input.get_read(f5_input.read_id) 54 | multi_f5.add_existing_read(read, target_compression=target_compression) 55 | results.append(os.path.basename(filename)) 56 | except Fast5FileTypeError as e: 57 | logger.error("{}: Cannot input MultiRead files to single_to_multi: '{}'" 58 | "".format(e, filename), exc_info=exc_info) 59 | raise 60 | except Exception as e: 61 | logger.error("{}\n\tFailed to add single read file: '{}' to '{}'" 62 | "".format(e, filename, output_file), exc_info=exc_info) 63 | 64 | except Fast5FileTypeError: 65 | raise 66 | except Exception as e: 67 | logger.error("{}\n\tFailed to write to MultiRead file: {}" 68 | "".format(e, output_file), exc_info=exc_info) 69 | return results, output_file 70 | 71 | 72 | def main(): 73 | parser = ArgumentParser("") 74 | parser.add_argument('-i', '--input_path', required=True, 75 | help='Folder containing single read fast5 files') 76 | parser.add_argument('-s', '--save_path', required=True, 77 | help="Folder to output multi read files to") 78 | parser.add_argument('-f', '--filename_base', default='batch', required=False, 79 | help="Root of output filename, default='batch' -> 'batch_0.fast5'") 80 | parser.add_argument('-n', '--batch_size', type=int, default=4000, required=False, 81 | help="Number of reads per multi-read file") 82 | parser.add_argument('-t', '--threads', type=int, default=1, required=False, 83 | help="Number of threads to use") 84 | parser.add_argument('--recursive', action='store_true', 85 | help="Search recursively through folders for single_read fast5 files") 86 | parser.add_argument('--ignore_symlinks', action='store_true', 87 | help="Ignore symlinks when searching recursively for fast5 files") 88 | parser.add_argument('-c', '--compression', required=False, default=None, 89 | choices=list(COMPRESSION_MAP.keys()) + [None], help="Target output compression type") 90 | parser.add_argument('-v', '--version', action='version', version=__version__) 91 | args = parser.parse_args() 92 | 93 | if args.compression is not None: 94 | args.compression = COMPRESSION_MAP[args.compression] 95 | 
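# batch_convert_single_to_multi can likewise be called directly from Python rather
# than via the single_to_multi_fast5 entry point. A short sketch; the paths are
# hypothetical.
from ont_fast5_api.conversion_tools.single_to_multi_fast5 import batch_convert_single_to_multi

# Bundle single-read files into multi-read files of up to 4000 reads each,
# named batch_0.fast5, batch_1.fast5, ... in the output folder.
batch_convert_single_to_multi("single_reads/", "multi_reads/",
                              filename_base="batch", batch_size=4000,
                              threads=2, recursive=True,
                              follow_symlinks=True, target_compression=None)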
96 | batch_convert_single_to_multi(args.input_path, 97 | args.save_path, 98 | args.filename_base, 99 | args.batch_size, 100 | args.threads, 101 | args.recursive, 102 | follow_symlinks=not args.ignore_symlinks, 103 | target_compression=args.compression) 104 | 105 | 106 | if __name__ == '__main__': 107 | main() 108 | -------------------------------------------------------------------------------- /ont_fast5_api/data_sanitisation.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | 4 | 5 | def _clean(value): 6 | """ Convert numpy numeric types to their python equivalents. """ 7 | if isinstance(value, np.ndarray): 8 | if value.dtype.kind == 'S': 9 | return np.char.decode(value).tolist() 10 | else: 11 | return value.tolist() 12 | elif type(value).__module__ == np.__name__: 13 | # h5py==2.8.0 on windows sometimes fails to cast this from an np.float64 to a python.float 14 | # We have to let the user do this themselves, since casting here could be dangerous 15 | # https://github.com/h5py/h5py/issues/1051 16 | conversion = value.item() # np.asscalar(value) was deprecated in v1.16 17 | if isinstance(conversion, bytes): 18 | conversion = conversion.decode() 19 | return conversion 20 | elif isinstance(value, bytes): 21 | return value.decode() 22 | else: 23 | return value 24 | 25 | 26 | def _sanitize_data_for_writing(data): 27 | # To make the interface more user friendly we encode python strings as byte-strings when writing datasets 28 | if isinstance(data, str): 29 | # Plain python-strings can be encoded trivially 30 | return data.encode() 31 | elif isinstance(data, np.ndarray) and data.dtype.kind == np.dtype(np.unicode_): 32 | # If the array is all of one type, unicode-string, we can encode with numpy 33 | return data.astype('S') 34 | elif isinstance(data, np.ndarray) and len(data.dtype) > 1: 35 | # If the array is of mixed types we have to set the encoding column by column 36 | encoded_dtypes = [] 37 | for field_name in data.dtype.names: 38 | field_dtype, field_byte_index = data.dtype.fields[field_name] 39 | if field_dtype.kind == 'U': 40 | str_len = field_dtype.itemsize // field_dtype.alignment 41 | field_dtype = np.dtype("|S{}".format(str_len)) 42 | encoded_dtypes.append((field_name, field_dtype)) 43 | return data.astype(encoded_dtypes) 44 | 45 | return data 46 | 47 | 48 | def _sanitize_data_for_reading(data): 49 | # To make the interface more user friendly we decode byte-strings into unicode strings when reading datasets 50 | if isinstance(data, h5py.Dataset): 51 | data = data[()] 52 | 53 | if isinstance(data, bytes): 54 | # Plain byte-strings can be decoded trivially 55 | return data.decode() 56 | elif isinstance(data, np.ndarray) and data.dtype.kind == 'S': 57 | # If the array is all of one type, byte-string, we can decode with numpy 58 | return np.char.decode(data) 59 | elif isinstance(data, np.ndarray) and len(data.dtype) > 1: 60 | # If the array is of mixed types we have to decode column by column 61 | decoded_dtypes = [] 62 | for field_name in data.dtype.names: 63 | field_dtype, field_byte_index = data.dtype.fields[field_name] 64 | if field_dtype.kind == 'S': 65 | field_dtype = np.dtype("=3', 26 | 'numpy>=1.16', 27 | 'packaging', 28 | 'progressbar33>=2.3.1', 29 | 'setuptools'] 30 | 31 | setup(name=__pkg_name__.replace("_", "-"), 32 | author='Oxford Nanopore Technologies, Limited', 33 | description='Oxford Nanopore Technologies fast5 API software', 34 | license='MPL 2.0', 35 | long_description=documentation, 36 | 
version=get_version(), 37 | url='https://github.com/nanoporetech/{}'.format(__pkg_name__), 38 | install_requires=installation_requirements, 39 | packages=find_packages(), 40 | package_data={__pkg_name__: ['vbz_plugin/*.so', 'vbz_plugin/*.dylib', 'vbz_plugin/*.dll']}, 41 | python_requires='>=3.7', 42 | entry_points={'console_scripts': [ 43 | "multi_to_single_fast5={}.conversion_tools.multi_to_single_fast5:main".format(__pkg_name__), 44 | "single_to_multi_fast5={}.conversion_tools.single_to_multi_fast5:main".format(__pkg_name__), 45 | "fast5_subset={}.conversion_tools.fast5_subset:main".format(__pkg_name__), 46 | "compress_fast5={}.conversion_tools.compress_fast5:main".format(__pkg_name__), 47 | "check_compression={}.conversion_tools.check_file_compression:main".format(__pkg_name__), 48 | "demux_fast5={}.conversion_tools.demux_fast5:main".format(__pkg_name__), 49 | ]}, 50 | classifiers=[ 51 | 'Development Status :: 5 - Production/Stable', 52 | 'Environment :: Console', 53 | 'Intended Audience :: Developers', 54 | 'Intended Audience :: Science/Research', 55 | 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)', 56 | 'Natural Language :: English', 57 | 'Operating System :: Microsoft :: Windows', 58 | 'Operating System :: POSIX :: Linux', 59 | 'Operating System :: MacOS', 60 | 'Programming Language :: Python :: 3 :: Only', 61 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 62 | ], 63 | keywords='fast5 nanopore') 64 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/__init__.py -------------------------------------------------------------------------------- /test/data/basecall_2d_file_v1.0.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/basecall_2d_file_v1.0.fast5 -------------------------------------------------------------------------------- /test/data/hardlink/single_reads/00031f3e-415c-4ab5-9c16-fb6fe45ff519.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/00031f3e-415c-4ab5-9c16-fb6fe45ff519.fast5 -------------------------------------------------------------------------------- /test/data/hardlink/single_reads/000c0b4e-46c2-4fb5-9b17-d7031eefb975.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/000c0b4e-46c2-4fb5-9b17-d7031eefb975.fast5 -------------------------------------------------------------------------------- /test/data/hardlink/single_reads/000ebd63-3e1a-4499-9ded-26af3225a022.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/000ebd63-3e1a-4499-9ded-26af3225a022.fast5 -------------------------------------------------------------------------------- /test/data/hardlink/single_reads/002ad0e4-c6bb-4eff-a30f-5fec01475ab8.fast5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/002ad0e4-c6bb-4eff-a30f-5fec01475ab8.fast5 -------------------------------------------------------------------------------- /test/data/hardlink/single_reads/002b0891-03bf-4622-ae66-ae6984890ed4.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/002b0891-03bf-4622-ae66-ae6984890ed4.fast5 -------------------------------------------------------------------------------- /test/data/hardlink/single_reads/0048058c-ecb4-4a0f-b283-9a128bd598c5.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/0048058c-ecb4-4a0f-b283-9a128bd598c5.fast5 -------------------------------------------------------------------------------- /test/data/hardlink/single_reads/004a87b0-c9f6-4237-b4d6-466ab979aee2.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/004a87b0-c9f6-4237-b4d6-466ab979aee2.fast5 -------------------------------------------------------------------------------- /test/data/hardlink/single_reads/0059d270-3238-4413-b38b-f588e28326df.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/0059d270-3238-4413-b38b-f588e28326df.fast5 -------------------------------------------------------------------------------- /test/data/hardlink/unlinked/batch0.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/unlinked/batch0.fast5 -------------------------------------------------------------------------------- /test/data/multi_read/batch_0.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/multi_read/batch_0.fast5 -------------------------------------------------------------------------------- /test/data/multi_read_analyses/batch_0.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/multi_read_analyses/batch_0.fast5 -------------------------------------------------------------------------------- /test/data/read_file_v0.6_raw.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/read_file_v0.6_raw.fast5 -------------------------------------------------------------------------------- /test/data/read_file_v0.6_single.fast5: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/read_file_v0.6_single.fast5 -------------------------------------------------------------------------------- /test/data/read_file_v1.0_single.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/read_file_v1.0_single.fast5 -------------------------------------------------------------------------------- /test/data/rle_basecall_table/rle_example.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/rle_basecall_table/rle_example.fast5 -------------------------------------------------------------------------------- /test/data/single_read_analyses/read.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_read_analyses/read.fast5 -------------------------------------------------------------------------------- /test/data/single_reads/fe85b517-62ee-4a33-8767-41cab5d5ab39.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_reads/fe85b517-62ee-4a33-8767-41cab5d5ab39.fast5 -------------------------------------------------------------------------------- /test/data/single_reads/fe8a3026-d1f4-46b3-8daa-e610f27acde1.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_reads/fe8a3026-d1f4-46b3-8daa-e610f27acde1.fast5 -------------------------------------------------------------------------------- /test/data/single_reads/fe9374ee-b86a-4ca4-81dc-ac06e3297728.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_reads/fe9374ee-b86a-4ca4-81dc-ac06e3297728.fast5 -------------------------------------------------------------------------------- /test/data/single_reads/read0.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_reads/read0.fast5 -------------------------------------------------------------------------------- /test/data/summaries/two_barcode_summary.txt: -------------------------------------------------------------------------------- 1 | read_id barcode_arrangement 2 | fe85b517-62ee-4a33-8767-41cab5d5ab39 barcode01 3 | fe9374ee-b86a-4ca4-81dc-ac06e3297728 barcode02 4 | fe849dd3-63bc-4044-8910-14e1686273bb barcode02 5 | fe8a3026-d1f4-46b3-8daa-e610f27acde1 barcode01 -------------------------------------------------------------------------------- /test/data/telemetry_test.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/telemetry_test.fast5 
-------------------------------------------------------------------------------- /test/data/vbz_reads/vbz_reads.fast5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/vbz_reads/vbz_reads.fast5 -------------------------------------------------------------------------------- /test/helpers.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from tempfile import TemporaryDirectory, _get_candidate_names 4 | import unittest 5 | 6 | test_data = os.path.join(os.path.dirname(__file__), 'data') 7 | 8 | 9 | def disable_logging(test_func): 10 | def do_test(self, *args, **kwargs): 11 | logging.disable(logging.CRITICAL) 12 | test_func(self, *args, **kwargs) 13 | 14 | return do_test 15 | 16 | 17 | class TestFast5ApiHelper(unittest.TestCase): 18 | 19 | def setUp(self): 20 | self._tmp_dir = TemporaryDirectory() 21 | self.save_path = self._tmp_dir.name 22 | 23 | def tearDown(self): 24 | self._tmp_dir.cleanup() 25 | 26 | def generate_temp_filename(self): 27 | return os.path.join(self.save_path, next(_get_candidate_names())) 28 | -------------------------------------------------------------------------------- /test/test_alignment_tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from ont_fast5_api.fast5_file import Fast5File 4 | from ont_fast5_api.analysis_tools.alignment import AlignmentTools 5 | from test.helpers import TestFast5ApiHelper 6 | 7 | 8 | class TestAlignmentTools(TestFast5ApiHelper): 9 | 10 | def test_001_put_and_retrieve(self): 11 | fname = self.generate_temp_filename() 12 | summary_temp = {'genome': 'Lambda', 13 | 'genome_start': 100, 14 | 'genome_end': 200, 15 | 'strand_start': 1, 16 | 'strand_end': 101, 17 | 'num_events': 125, 18 | 'num_aligned': 92, 19 | 'num_correct': 87, 20 | 'num_insertions': 8, 21 | 'num_deletions': 8, 22 | 'identity': 0.9457, 23 | 'accuracy': 0.8056} 24 | summary_comp = {'genome': 'Lambda_rc', 25 | 'genome_start': 100, 26 | 'genome_end': 200, 27 | 'strand_start': 0, 28 | 'strand_end': 96, 29 | 'num_events': 120, 30 | 'num_aligned': 90, 31 | 'num_correct': 88, 32 | 'num_insertions': 6, 33 | 'num_deletions': 10, 34 | 'identity': 0.9778, 35 | 'accuracy': 0.8302} 36 | summary_2d = {'genome': 'Lambda', 37 | 'genome_start': 100, 38 | 'genome_end': 200, 39 | 'strand_start': 0, 40 | 'strand_end': 100, 41 | 'num_events': 125, 42 | 'num_aligned': 98, 43 | 'num_correct': 96, 44 | 'num_insertions': 4, 45 | 'num_deletions': 4, 46 | 'identity': 0.9796, 47 | 'accuracy': 0.9057} 48 | sam1 = 'Dummy string for template SAM.' 49 | sam2 = 'Dummy string for complement SAM.' 50 | sam3 = 'Dummy string for 2D SAM.' 
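# The identity and accuracy values in the dummy summaries above are internally
# consistent with the usual definitions; the formulas below are inferred from
# these numbers rather than quoted from the library, but they check out exactly.
# identity ~= num_correct / num_aligned
# accuracy ~= num_correct / (num_aligned + num_insertions + num_deletions)
assert round(87 / 92, 4) == 0.9457             # template identity
assert round(87 / (92 + 8 + 8), 4) == 0.8056   # template accuracy
assert round(88 / 90, 4) == 0.9778             # complement identity
assert round(88 / (90 + 6 + 10), 4) == 0.8302  # complement accuracy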
51 | sequence1 = ''.join(np.random.choice(['A', 'C', 'G', 'T'], 100)) 52 | bc = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'} 53 | sequence2 = ''.join([bc[letter] for letter in sequence1[::-1]]) 54 | with Fast5File(fname, mode='w') as fh: 55 | fh.add_channel_info({'channel_number': 1, 56 | 'sampling_rate': 4000, 57 | 'digitisation': 8192, 58 | 'range': 819.2, 59 | 'offset': 0}) 60 | fh.add_read(12, 'unique_snowflake', 12345, 4000, 0, 120.75) 61 | attrs = {'name': 'test', 'version': 0, 'time_stamp': 'just now', 'component': 'segmentation'} 62 | fh.add_analysis('segmentation', 'Segmentation_000', attrs) 63 | seg_data = {'has_template': 1, 64 | 'has_complement': 1, 65 | 'first_sample_template': 0, 66 | 'duration_template': 2000, 67 | 'first_sample_complement': 2000, 68 | 'duration_complement': 2000} 69 | fh.set_summary_data('Segmentation_000', 'segmentation', seg_data) 70 | attrs['component'] = 'alignment' 71 | attrs['segmentation'] = 'Analyses/Segmentation_000' 72 | fh.add_analysis('alignment', 'Alignment_000', attrs) 73 | fh.set_summary_data('Alignment_000', 'genome_mapping_template', summary_temp) 74 | fh.set_summary_data('Alignment_000', 'genome_mapping_complement', summary_comp) 75 | fh.set_summary_data('Alignment_000', 'genome_mapping_2d', summary_2d) 76 | with AlignmentTools(fh, group_name='Alignment_000') as align: 77 | align.add_alignment_data('template', sam1, sequence1) 78 | align.add_alignment_data('complement', sam2, sequence2) 79 | align.add_alignment_data('2d', sam3, sequence1) 80 | with Fast5File(fname, mode='r') as fh: 81 | with AlignmentTools(fh, group_name='Alignment_000') as align: 82 | sam, seq = align.get_alignment_data('template') 83 | self.assertEqual(sam1, sam) 84 | self.assertEqual(sequence1, seq) 85 | sam, seq = align.get_alignment_data('complement') 86 | self.assertEqual(sam2, sam) 87 | self.assertEqual(sequence2, seq) 88 | sam, seq = align.get_alignment_data('2d') 89 | self.assertEqual(sam3, sam) 90 | self.assertEqual(sequence1, seq) 91 | results = align.get_results() 92 | speed_temp = align.calculate_speed('template') 93 | speed_comp = align.calculate_speed('complement') 94 | # Make sure we can calculate speed using only what's in the 95 | # summary 96 | summary = fh.get_summary_data('Alignment_000') 97 | template_summary = summary['genome_mapping_template'] 98 | summary_speed_temp = align.calculate_speed('template', 99 | template_summary) 100 | self.assertEqual(250, speed_temp) 101 | self.assertEqual(250, speed_comp) 102 | self.assertEqual(speed_temp, summary_speed_temp) 103 | self.assertDictEqual({'status': 'match found', 104 | 'direction': 'forward', 105 | 'ref_name': 'Lambda', 106 | 'ref_span': (100, 200), 107 | 'seq_span': (1, 101), 108 | 'seq_len': 125, 109 | 'num_aligned': 92, 110 | 'num_correct': 87, 111 | 'num_insertions': 8, 112 | 'num_deletions': 8, 113 | 'identity': 0.9457, 114 | 'accuracy': 0.8056}, results['template']) 115 | self.assertDictEqual({'status': 'match found', 116 | 'direction': 'reverse', 117 | 'ref_name': 'Lambda', 118 | 'ref_span': (100, 200), 119 | 'seq_span': (0, 96), 120 | 'seq_len': 120, 121 | 'num_aligned': 90, 122 | 'num_correct': 88, 123 | 'num_insertions': 6, 124 | 'num_deletions': 10, 125 | 'identity': 0.9778, 126 | 'accuracy': 0.8302}, results['complement']) 127 | self.assertDictEqual({'status': 'match found', 128 | 'direction': 'forward', 129 | 'ref_name': 'Lambda', 130 | 'ref_span': (100, 200), 131 | 'seq_span': (0, 100), 132 | 'seq_len': 125, 133 | 'num_aligned': 98, 134 | 'num_correct': 96, 135 | 'num_insertions': 4, 136 | 
'num_deletions': 4, 137 | 'identity': 0.9796, 138 | 'accuracy': 0.9057}, results['2d']) 139 | -------------------------------------------------------------------------------- /test/test_basecall_1d_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ont_fast5_api.fast5_file import Fast5File 3 | from ont_fast5_api.analysis_tools.basecall_1d import Basecall1DTools 4 | from test.helpers import TestFast5ApiHelper 5 | 6 | 7 | class TestBasecall1DTools(TestFast5ApiHelper): 8 | 9 | def test_001_put_and_retrieve(self): 10 | fname = self.generate_temp_filename() 11 | dtypes = [('mean', float), 12 | ('start', float), 13 | ('stdv', float), 14 | ('length', float), 15 | ('called_state', ' None: 13 | super().setUp() 14 | 15 | # Known good read_ids from test_data/multi_read/batch_0.fast5 16 | self.read_id_set = {'fe849dd3-63bc-4044-8910-14e1686273bb', 17 | 'fe85b517-62ee-4a33-8767-41cab5d5ab39'} 18 | self.read_id_list = ['fe849dd3-63bc-4044-8910-14e1686273bb', 19 | 'fe85b517-62ee-4a33-8767-41cab5d5ab39'] 20 | self.fast5_path = test_data + "/multi_read/batch_0.fast5" 21 | 22 | 23 | def test_yield_fast5_files_from_fast5_file(self): 24 | f5_gen = yield_fast5_files(self.fast5_path, recursive=False) 25 | f5_path = next(f5_gen) 26 | self.assertTrue(Path(f5_path).is_file(), "Filepath is not a file") 27 | self.assertTrue(f5_path.endswith('.fast5'), "Filepath does not end with fast5 extension") 28 | self.assertTrue(Path(f5_path).absolute() == Path(self.fast5_path).absolute(), 29 | "Direct path did not return itself") 30 | 31 | def test_yield_fast5_files_from_dir(self): 32 | f5_gen = yield_fast5_files(test_data, recursive=False) 33 | 34 | for f5_path in f5_gen: 35 | self.assertTrue(Path(f5_path).is_file(), "Filepath is not a file") 36 | self.assertTrue(f5_path.endswith('.fast5'), "Filepath does not end with fast5 extension") 37 | 38 | def test_yield_fast5_reads_from_fast5_file(self): 39 | f5_read_gen = yield_fast5_reads(self.fast5_path, recursive=False) 40 | read_id, read_data = next(f5_read_gen) 41 | self.assertTrue(read_id is not None, "read_id is None") 42 | self.assertTrue(isinstance(read_data, Fast5Read), "Return is not Fast5Read instance") 43 | 44 | def test_yield_fast5_reads_from_dir(self): 45 | f5_read_gen = yield_fast5_reads(test_data, recursive=False) 46 | read_id, read_data = next(f5_read_gen) 47 | self.assertTrue(read_id is not None, "read_id is None") 48 | self.assertTrue(isinstance(read_data, Fast5Read), "Return is not Fast5Read instance") 49 | 50 | def test_yield_fast5_reads_with_set(self): 51 | f5_read_gen = yield_fast5_reads(self.fast5_path, 52 | recursive=False, 53 | read_ids=self.read_id_set) 54 | f5_reads = list(f5_read_gen) 55 | self.assertTrue(len(f5_reads) == len(self.read_id_set)) 56 | 57 | for read_id, read_data in f5_reads: 58 | self.assertTrue(read_id in self.read_id_set, "A read_id is not a member of read_ids") 59 | self.assertTrue(isinstance(read_data, Fast5Read), "Return is not Fast5Read instance") 60 | 61 | def test_yield_fast5_reads_with_list(self): 62 | f5_read_gen = yield_fast5_reads(self.fast5_path, 63 | recursive=False, 64 | read_ids=self.read_id_set) 65 | f5_reads = list(f5_read_gen) 66 | self.assertTrue(len(f5_reads) == len(self.read_id_list)) 67 | 68 | for read_id, read_data in f5_reads: 69 | self.assertTrue(read_id in self.read_id_set, "A read_id is not a member of read_id_list") 70 | self.assertTrue(isinstance(read_data, Fast5Read), "Return is not Fast5Read instance") 71 | 72 | def 
test_yield_fast5_reads_set_versus_list_equality(self): 73 | f5_read_gen_by_id_set = yield_fast5_reads(self.fast5_path, 74 | recursive=False, 75 | read_ids=self.read_id_set) 76 | 77 | f5_read_gen_by_id_list = yield_fast5_reads(self.fast5_path, 78 | recursive=False, 79 | read_ids=self.read_id_list) 80 | 81 | # Consume the generators into sets 82 | ids_by_set = set(rid for rid, _ in f5_read_gen_by_id_set) 83 | ids_by_list = set(rid for rid, _ in f5_read_gen_by_id_list) 84 | self.assertTrue(ids_by_list == ids_by_set, 'Ids differ when using read_id list versus set') 85 | 86 | 87 | def test_yield_fast5_reads_with_empty_set(self): 88 | f5_read_gen = yield_fast5_reads(self.fast5_path, 89 | recursive=False, 90 | read_ids=set([])) 91 | 92 | self.assertTrue(len(list(f5_read_gen)) != 0, "Empty read_ids resulted in zero returned reads") 93 | 94 | def test_yield_fast5_reads_with_garbage_set(self): 95 | f5_read_gen = yield_fast5_reads(self.fast5_path, 96 | recursive=False, 97 | read_ids={'_g4rbag£_'}) 98 | f5_reads = list(f5_read_gen) 99 | self.assertTrue(len(f5_reads) == 0, "Garbage read_ids returned non-zero reads") 100 | 101 | def test_yield_fast5_reads_type_error(self): 102 | with self.assertRaisesRegex(TypeError, 'read_ids'): 103 | f5_read_gen = yield_fast5_reads(self.fast5_path, 104 | recursive=False, 105 | read_ids=int(1)) 106 | next(f5_read_gen) -------------------------------------------------------------------------------- /test/test_fast5_converter.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import os 4 | import h5py 5 | import numpy 6 | 7 | from ont_fast5_api.conversion_tools.multi_to_single_fast5 import convert_multi_to_single, try_multi_to_single_conversion 8 | from ont_fast5_api.conversion_tools.single_to_multi_fast5 import batch_convert_single_to_multi, get_fast5_file_list, \ 9 | create_multi_read_file 10 | from ont_fast5_api.multi_fast5 import MultiFast5File 11 | from ont_fast5_api.fast5_file import Fast5FileTypeError, Fast5File 12 | from test.helpers import TestFast5ApiHelper, test_data, disable_logging 13 | 14 | 15 | class TestFast5Converter(TestFast5ApiHelper): 16 | 17 | @patch('ont_fast5_api.conversion_tools.single_to_multi_fast5.get_progress_bar') 18 | def test_single_to_multi(self, mock_pbar): 19 | input_folder = os.path.join(test_data, "single_reads") 20 | batch_size = 3 21 | file_count = len(os.listdir(input_folder)) 22 | batch_convert_single_to_multi(input_folder, self.save_path, filename_base="batch", batch_size=batch_size, 23 | threads=1, recursive=False, follow_symlinks=False, target_compression=None) 24 | 25 | expected_output_reads = {"filename_mapping.txt": 0, 26 | "batch_0.fast5": batch_size, 27 | "batch_1.fast5": file_count % batch_size} 28 | self.assertEqual(sorted(os.listdir(self.save_path)), sorted(list(expected_output_reads.keys()))) 29 | for file, read_count in expected_output_reads.items(): 30 | if read_count > 0: 31 | with h5py.File(os.path.join(self.save_path, file), 'r') as f5: 32 | self.assertEqual(len(f5), read_count) 33 | 34 | def test_multi_to_single(self): 35 | input_file = os.path.join(test_data, "multi_read", "batch_0.fast5") 36 | with MultiFast5File(input_file, 'r') as f5: 37 | read_count = len(f5.handle) 38 | expected_files = sorted([os.path.join(self.save_path, "{}", i + '.fast5') for i in f5.get_read_ids()]) 39 | 40 | subfolder = '0' 41 | convert_multi_to_single(input_file, self.save_path, subfolder) 42 | 43 | out_files = sorted(get_fast5_file_list(self.save_path, 
recursive=True, follow_symlinks=True)) 44 | self.assertEqual(len(out_files), read_count) 45 | self.assertEqual(out_files, [f.format(subfolder) for f in expected_files]) 46 | 47 | @disable_logging 48 | def test_single_to_multi_incorrect_types(self): 49 | input_files = [os.path.join(test_data, "multi_read", "batch_0.fast5")] 50 | with self.assertRaises(Fast5FileTypeError): 51 | create_multi_read_file(input_files, self.generate_temp_filename(), target_compression=None) 52 | 53 | def test_multi_to_single_incorrect_types(self): 54 | input_folder = os.path.join(test_data, "single_reads") 55 | input_file = os.path.join(input_folder, os.listdir(input_folder)[0]) 56 | with self.assertRaises(Fast5FileTypeError): 57 | try_multi_to_single_conversion(input_file, self.save_path, subfolder='0') 58 | 59 | def test_add_read_to_multi(self): 60 | with Fast5File(os.path.join(test_data, "single_reads", "read0.fast5"), 'r') as single_fast5, \ 61 | MultiFast5File(self.generate_temp_filename(), 'w') as multi_out: 62 | multi_out.add_existing_read(single_fast5) 63 | expected_raw = single_fast5.get_raw_data() 64 | actual_raw = multi_out.get_read(single_fast5.get_read_id()).get_raw_data() 65 | self.assertTrue(numpy.array_equal(actual_raw, expected_raw)) 66 | -------------------------------------------------------------------------------- /test/test_fast5_interface.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from ont_fast5_api.fast5_file import Fast5File 5 | from ont_fast5_api.fast5_interface import get_fast5_file, check_file_type, MULTI_READ, SINGLE_READ 6 | from ont_fast5_api.multi_fast5 import MultiFast5File 7 | from test.helpers import test_data 8 | 9 | 10 | 11 | class TestFast5Interface(unittest.TestCase): 12 | 13 | def test_correct_type(self): 14 | single_read_path = os.path.join(test_data, "single_reads", "read0.fast5") 15 | single_read_id = Fast5File(single_read_path).get_read_id() 16 | with get_fast5_file(single_read_path) as f5: 17 | self.assertTrue(isinstance(f5, Fast5File)) 18 | self.assertEqual(check_file_type(f5), SINGLE_READ) 19 | self.assertEqual(len(f5.get_read_ids()), 1) 20 | self.assertEqual(single_read_id, f5.get_read_ids()[0]) 21 | self.get_raw(f5) 22 | 23 | multi_read_path = os.path.join(test_data, "multi_read", "batch_0.fast5") 24 | with get_fast5_file(multi_read_path) as f5: 25 | self.assertTrue(isinstance(f5, MultiFast5File)) 26 | self.assertEqual(check_file_type(f5), MULTI_READ) 27 | self.assertTrue(len(f5.get_read_ids()) >= 1) 28 | self.get_raw(f5) 29 | 30 | def get_raw(self, f5): 31 | # Test we can get raw data using the same method for single and multi 32 | raw_data = f5.get_read(f5.get_read_ids()[0]).get_raw_data() 33 | self.assertTrue(len(raw_data) >= 0) 34 | -------------------------------------------------------------------------------- /test/test_fast5_subset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy 3 | from unittest.mock import patch 4 | from pathlib import Path 5 | 6 | from ont_fast5_api.compression_settings import VBZ 7 | from ont_fast5_api.conversion_tools.fast5_subset import Fast5Filter 8 | from ont_fast5_api.conversion_tools.conversion_utils import Fast5FilterWorker, extract_selected_reads, read_generator 9 | from ont_fast5_api.multi_fast5 import MultiFast5File 10 | from ont_fast5_api.fast5_file import Fast5File 11 | from test.helpers import TestFast5ApiHelper, test_data 12 | 13 | 14 | class 
TestFast5Subset(TestFast5ApiHelper): 15 | input_multif5_path = Path(test_data) / "multi_read" / "batch_0.fast5" 16 | read_set = {"fe85b517-62ee-4a33-8767-41cab5d5ab39", "fe9374ee-b86a-4ca4-81dc-ac06e3297728"} 17 | 18 | def test_read_generator(self): 19 | count = 0 20 | for read_id, read in read_generator(input_file=self.input_multif5_path, read_set=self.read_set): 21 | self.assertIn(read_id, self.read_set) 22 | count += 1 23 | 24 | self.assertEqual(len(self.read_set), count) 25 | 26 | def _create_read_list_file(self, read_ids): 27 | output_path = os.path.join(self.save_path, 'read_list.txt') 28 | with open(output_path, 'w') as fh: 29 | for read_id in read_ids: 30 | fh.write(read_id + "\n") 31 | return output_path 32 | 33 | @patch('ont_fast5_api.conversion_tools.fast5_subset.logging') 34 | @patch('ont_fast5_api.conversion_tools.fast5_subset.get_progress_bar') 35 | def test_subset_from_single(self, mock_log, mock_pbar): 36 | input_path = os.path.join(test_data, "single_reads") 37 | read_list = self._create_read_list_file(self.read_set) 38 | f5_filter = Fast5Filter(input_folder=input_path, 39 | output_folder=self.save_path, 40 | read_list_file=read_list) 41 | f5_filter.run_batch() 42 | 43 | count = 0 44 | with MultiFast5File(os.path.join(self.save_path, 'batch0.fast5'), 'r') as output_f5: 45 | for input_file in os.listdir(input_path): 46 | with Fast5File(os.path.join(input_path, input_file), 'r') as input_f5: 47 | read_id = input_f5.get_read_id() 48 | if read_id in self.read_set: 49 | read_in = input_f5.get_read(read_id) 50 | read_out = output_f5.get_read(read_id) 51 | self.assertTrue(numpy.array_equal(read_in.get_raw_data(), read_out.get_raw_data())) 52 | count += 1 53 | self.assertEqual(len(self.read_set), count) 54 | 55 | @patch('ont_fast5_api.conversion_tools.fast5_subset.logging') 56 | @patch('ont_fast5_api.conversion_tools.fast5_subset.get_progress_bar') 57 | def test_subset_from_multi(self, mock_log, mock_pbar): 58 | read_list = self._create_read_list_file(self.read_set) 59 | f5_filter = Fast5Filter(input_folder=os.path.dirname(self.input_multif5_path), 60 | output_folder=self.save_path, 61 | read_list_file=read_list) 62 | f5_filter.run_batch() 63 | with MultiFast5File(self.input_multif5_path, 'r') as input_f5, \ 64 | MultiFast5File(os.path.join(self.save_path, 'batch0.fast5'), 'r') as output_f5: 65 | self.assertEqual(len(self.read_set), len(output_f5.get_read_ids())) 66 | for read_id in self.read_set: 67 | read_in = input_f5.get_read(read_id) 68 | read_out = output_f5.get_read(read_id) 69 | self.assertTrue(numpy.array_equal(read_in.get_raw_data(), read_out.get_raw_data())) 70 | 71 | def test_extract_selected_reads(self): 72 | # three test for count below, equaling and above number of read in input file 73 | for count in (1, 2, 3): 74 | temp_file_name = self.generate_temp_filename() 75 | found_reads, output_file, input_file = extract_selected_reads(input_file=self.input_multif5_path, 76 | output_file=temp_file_name, 77 | count=count, read_set=self.read_set) 78 | if count < len(self.read_set): 79 | self.assertTrue(found_reads.issubset(self.read_set)) 80 | self.assertEqual(input_file, self.input_multif5_path) 81 | elif count == len(self.read_set): 82 | self.assertEqual(found_reads, self.read_set) 83 | self.assertEqual(input_file, self.input_multif5_path) 84 | elif count >= len(self.read_set): 85 | self.assertEqual(found_reads, self.read_set) 86 | self.assertIsNone(input_file) 87 | 88 | self.assertEqual(output_file, temp_file_name) 89 | # verify that resulting output file is a legal 
MultiFast5 with desired reads in it 90 | with MultiFast5File(output_file) as multi_file: 91 | readlist = multi_file.get_read_ids() 92 | self.assertTrue(set(readlist).issubset(self.read_set)) 93 | 94 | @patch('ont_fast5_api.conversion_tools.conversion_utils.ProgressBar') 95 | @patch('ont_fast5_api.conversion_tools.fast5_subset.logging') 96 | def test_selector_args_generator(self, mock_pbar, mock_logger): 97 | single_reads = os.path.join(test_data, "single_reads") 98 | self.assertTrue(os.path.isdir(single_reads), msg=single_reads) 99 | 100 | input_f5s = list(Path(single_reads).glob('*.fast5')) 101 | batch_size = 1 102 | 103 | f = Fast5FilterWorker( 104 | input_file_list=input_f5s, 105 | output_dir=Path(self.save_path), 106 | read_set=self.read_set, 107 | batch_size=batch_size, 108 | filename_base="batch", 109 | target_compression=VBZ, 110 | progressbar=mock_pbar, 111 | logger=mock_logger 112 | ) 113 | 114 | args_combos = list(f._args_generator()) 115 | # there should be two tuples of arguments 116 | self.assertEqual(len(args_combos), len(self.read_set) / batch_size) 117 | 118 | num_files_queued = len(f.input_f5s) # should be 0 119 | self.assertEqual(num_files_queued, (len(input_f5s) - len(args_combos)), msg=f.input_f5s) 120 | self.assertEqual(len(f.available_out_files), 0) 121 | 122 | # "exhaust" an input file and put output file back on queue 123 | input_file, output_file, reads, count, compression = args_combos[0] 124 | f._update_file_lists(reads={}, in_file=None, out_file=output_file) 125 | self.assertEqual(len(f.input_f5s), num_files_queued) 126 | self.assertEqual(len(f.available_out_files), 1) 127 | self.assertEqual(compression, VBZ) 128 | 129 | # this results in another args tuple generated 130 | new_args_combos = list(f._args_generator()) 131 | self.assertEqual(len(new_args_combos), 1, msg=len(new_args_combos)) -------------------------------------------------------------------------------- /test/test_hardlink_metadata.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from ont_fast5_api.compression_settings import VBZ 4 | from ont_fast5_api.conversion_tools.compress_fast5 import compress_file 5 | from ont_fast5_api.conversion_tools.conversion_utils import extract_selected_reads 6 | from ont_fast5_api.fast5_interface import get_fast5_file 7 | from ont_fast5_api.multi_fast5 import MultiFast5File 8 | from ont_fast5_api.static_data import HARDLINK_GROUPS 9 | from test.helpers import TestFast5ApiHelper, test_data 10 | 11 | 12 | class TestHardlinkMetaData(TestFast5ApiHelper): 13 | read_subset = {'00031f3e-415c-4ab5-9c16-fb6fe45ff519', 14 | "000c0b4e-46c2-4fb5-9b17-d7031eefb975", 15 | '000ebd63-3e1a-4499-9ded-26af3225a022', 16 | '002ad0e4-c6bb-4eff-a30f-5fec01475ab8', 17 | '0059d270-3238-4413-b38b-f588e28326df'} 18 | 19 | def test_create_read(self): 20 | input_path = os.path.join(test_data, 'hardlink', 'unlinked', 'batch0.fast5') 21 | output_path = self.generate_temp_filename() 22 | compress_file(input_path, output_path, target_compression=VBZ) 23 | new_read_id = "123456789abcdef" 24 | with MultiFast5File(output_path, 'a') as f5: 25 | # Test we can hardlink to existing metadata when creating an new empty read 26 | run_id = list(f5.run_id_map.keys())[0] 27 | master_read_id = f5.run_id_map[run_id] 28 | f5.create_empty_read(new_read_id, run_id) 29 | for group in HARDLINK_GROUPS: 30 | self.assertTrue(self.is_read_hardlinked(f5, new_read_id, master_read_id, group)) 31 | 32 | # Test we don't explode if there is no metadata 33 | 
f5.create_empty_read(new_read_id[::-1], "not an existing run_id") 34 | 35 | def test_hardlink_multi_compression(self): 36 | input_path = os.path.join(test_data, 'hardlink', 'unlinked', 'batch0.fast5') 37 | output_path = self.generate_temp_filename() 38 | 39 | self.assertFalse(self.is_file_hardlinked(input_path)) 40 | compress_file(input_path, output_path, target_compression=VBZ) 41 | self.assertTrue(self.is_file_hardlinked(output_path)) 42 | 43 | def test_hardlink_subset(self): 44 | input_path = os.path.join(test_data, 'hardlink', 'unlinked', 'batch0.fast5') 45 | output_path = self.generate_temp_filename() 46 | 47 | self.assertFalse(self.is_file_hardlinked(input_path)) 48 | extract_selected_reads(input_path, output_path, self.read_subset, count=len(self.read_subset)) 49 | self.assertTrue(self.is_file_hardlinked(output_path)) 50 | 51 | def test_hardlink_subset_single_reads(self): 52 | input_path = os.path.join(test_data, 'hardlink', 'single_reads') 53 | output_path = self.generate_temp_filename() 54 | 55 | for single_read_file in os.listdir(input_path): 56 | extract_selected_reads(os.path.join(input_path, single_read_file), output_path, self.read_subset, count=1) 57 | self.assertTrue(self.is_file_hardlinked(output_path)) 58 | 59 | def test_hardlink_single_to_multi(self): 60 | input_folder = os.path.join(test_data, 'hardlink', 'single_reads') 61 | input_files = [os.path.join(input_folder, file) for file in os.listdir(input_folder)] 62 | output_path = self.generate_temp_filename() 63 | 64 | with MultiFast5File(output_path, 'a') as multi_f5: 65 | for input_file in input_files: 66 | with get_fast5_file(input_file, 'r') as f5_file: 67 | for read in f5_file.get_reads(): 68 | multi_f5.add_existing_read(read) 69 | 70 | with MultiFast5File(output_path, 'r') as multi_f5: 71 | self.assertEqual(len(input_files), len(multi_f5.get_read_ids())) 72 | self.assertTrue(self.is_file_hardlinked(output_path)) 73 | 74 | def is_file_hardlinked(self, input_path): 75 | file_hardlinked = True 76 | with MultiFast5File(input_path, 'r') as f5_file: 77 | for read in f5_file.get_reads(): 78 | master_read_id = f5_file.run_id_map[read.get_run_id()] 79 | for group in HARDLINK_GROUPS: 80 | file_hardlinked &= self.is_read_hardlinked(f5_file, read.read_id, master_read_id, group) 81 | return file_hardlinked 82 | 83 | def is_read_hardlinked(self, f5_handle, read_id1, read_id2, group): 84 | if read_id1 == read_id2: 85 | return True 86 | group1 = f5_handle.get_read(read_id1).handle[group] 87 | group2 = f5_handle.get_read(read_id2).handle[group] 88 | return group1 == group2 89 | -------------------------------------------------------------------------------- /test/test_multi_fast5.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import os 3 | import random 4 | 5 | from ont_fast5_api.fast5_file import Fast5File 6 | from ont_fast5_api.fast5_read import Fast5Read 7 | from ont_fast5_api.multi_fast5 import MultiFast5File 8 | from test.helpers import TestFast5ApiHelper 9 | 10 | hexdigits = "0123456789abcdef" 11 | run_id = "123abc" 12 | 13 | 14 | class TestMultiFast5(TestFast5ApiHelper): 15 | 16 | def create_multi_file(self, read_ids): 17 | filename = self.generate_temp_filename() 18 | # driver=None is the default, but adding this in here makes sure we 19 | # preserve the constructor argument. 
20 | with MultiFast5File(filename, 'w', driver=None) as multi_f5: 21 | for read_id in read_ids: 22 | multi_f5.create_empty_read(read_id, run_id) 23 | return filename 24 | 25 | def test_read_interface(self): 26 | read_ids = generate_read_ids(6) 27 | f5_file = self.create_multi_file(read_ids) 28 | 29 | with MultiFast5File(f5_file, 'a') as multi_f5: 30 | # Check we have the read_ids we expect 31 | self.assertEqual(sorted(read_ids), sorted(multi_f5.get_read_ids())) 32 | 33 | # Try and add another read with the same read_id and expect error 34 | with self.assertRaises(ValueError): 35 | multi_f5.create_empty_read(read_ids[0], run_id) 36 | 37 | # Test we can get a read from the file and it has the interface we expect 38 | read_0 = multi_f5.get_read(read_ids[0]) 39 | self.assertTrue(isinstance(read_0, Fast5Read)) 40 | 41 | # Test we cannot get a read which doesn't exit 42 | with self.assertRaises(KeyError): 43 | multi_f5.get_read("0123") 44 | 45 | def test_raw_data(self): 46 | f5_file = self.create_multi_file(generate_read_ids(4)) 47 | data = list(range(10)) 48 | raw_attrs = { 49 | "duration": 1, 50 | "median_before": 2.5, 51 | "read_id": "abcd", 52 | "read_number": 8, 53 | "start_mux": 2, 54 | "start_time": 99 55 | } 56 | with MultiFast5File(f5_file, 'a') as multi_f5: 57 | read0 = multi_f5.get_read(multi_f5.get_read_ids()[0]) 58 | read0.add_raw_data(data, attrs=raw_attrs) 59 | output_data = read0.get_raw_data() 60 | numpy.testing.assert_array_equal(output_data, data) 61 | 62 | def test_channel_info(self): 63 | f5_file = self.create_multi_file(generate_read_ids(4)) 64 | channel_info = { 65 | "digitisation": 2048, 66 | "offset": -119.5, 67 | "range": 74.2, 68 | "sampling_rate": 4000, 69 | "channel_number": "72" 70 | } 71 | # Fast5File explicitly casts the channel number on reading 72 | expected_out = channel_info.copy() 73 | expected_out['channel_number'] = int(channel_info['channel_number']) 74 | with MultiFast5File(f5_file, 'a') as multi_f5: 75 | read0 = multi_f5.get_read(multi_f5.get_read_ids()[0]) 76 | read0.add_channel_info(channel_info) 77 | output_data = read0.get_channel_info() 78 | self.assertEqual(output_data, expected_out) 79 | 80 | def test_tracking_id(self): 81 | f5_file = self.create_multi_file(generate_read_ids(4)) 82 | tracking_id = { 83 | "asic_id_eeprom": "some string", 84 | "device_id": "some string", 85 | "exp_script_name": "some string", 86 | "exp_script_purpose": "some string", 87 | "exp_start_time": "some string", 88 | "flow_cell_id": "some string", 89 | "hostname": "some string", 90 | "protocol_run_id": "some string", 91 | "protocols_version": "some string", 92 | "run_id": "some string", 93 | "version": "some string", 94 | } 95 | 96 | with MultiFast5File(f5_file, 'a') as multi_f5: 97 | read0 = multi_f5.get_read(multi_f5.get_read_ids()[0]) 98 | read0.add_tracking_id(tracking_id) 99 | output_data = read0.get_tracking_id() 100 | self.assertEqual(output_data, tracking_id) 101 | 102 | def test_add_analysis(self): 103 | f5_file = self.create_multi_file(generate_read_ids(4)) 104 | group = "Test" 105 | component = "test_component" 106 | attrs = {"attribute": 1} 107 | 108 | # Fast5File.add_analysis includes the component name in the analysis attributes 109 | expected_attributes = attrs.copy() 110 | expected_attributes['component'] = component 111 | with MultiFast5File(f5_file, 'a') as multi_f5: 112 | read0 = multi_f5.get_read(multi_f5.get_read_ids()[0]) 113 | self.assertEqual(read0.list_analyses(), []) 114 | read0.add_analysis(component, group, attrs) 115 | 
self.assertEqual(read0.list_analyses(), [(component, group)]) 116 | self.assertEqual(read0.get_analysis_attributes(group), expected_attributes) 117 | 118 | 119 | def generate_read_ids(num_ids, id_len=8): 120 | return ["".join(random.choice(hexdigits) for _ in range(id_len)) for _ in range(num_ids)] 121 | -------------------------------------------------------------------------------- /test/test_segmentation_tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | from ont_fast5_api.fast5_file import Fast5File 5 | from ont_fast5_api.analysis_tools.event_detection import EventDetectionTools 6 | from ont_fast5_api.analysis_tools.segmentation import SegmentationTools 7 | from test.helpers import TestFast5ApiHelper 8 | 9 | 10 | class TestSegmentationTools(TestFast5ApiHelper): 11 | 12 | def test_001_raw_only(self): 13 | fname = self.generate_temp_filename() 14 | with Fast5File(fname, mode='w') as fh: 15 | fh.add_channel_info({'channel_number': 1, 16 | 'sampling_rate': 4000, 17 | 'digitisation': 8192, 18 | 'range': 819.2, 19 | 'offset': 0}) 20 | fh.add_read(12, 'unique_snowflake', 12345, 1000, 0, 120.75) 21 | raw = np.empty(1000, dtype=np.int16) 22 | raw[:] = range(1000) 23 | fh.add_raw_data(raw) 24 | attrs = {'name': 'test', 'version': 0, 'time_stamp': 'just now'} 25 | fh.add_analysis('segmentation', 'Segmentation_000', attrs) 26 | segment_data = {'has_template': 1, 27 | 'has_complement': 1, 28 | 'first_sample_template': 10, 29 | 'duration_template': 470, 30 | 'first_sample_complement': 520, 31 | 'duration_complement': 460} 32 | fh.set_summary_data('Segmentation_000', 'segmentation', segment_data) 33 | with SegmentationTools(fh, group_name='Segmentation_000') as segment: 34 | results = segment.get_results() 35 | self.assertDictEqual({'has_template': True, 36 | 'has_complement': True, 37 | 'first_sample_template': 10, 38 | 'duration_template': 470, 39 | 'first_sample_complement': 520, 40 | 'duration_complement': 460}, results) 41 | temp_raw = segment.get_raw_data('template', scale=False) 42 | np.testing.assert_array_equal(temp_raw, raw[10:480]) 43 | comp_raw = segment.get_raw_data('complement', scale=False) 44 | np.testing.assert_array_equal(comp_raw, raw[520:980]) 45 | temp_raw, comp_raw = segment.get_raw_data('both', scale=False) 46 | np.testing.assert_array_equal(temp_raw, raw[10:480]) 47 | np.testing.assert_array_equal(comp_raw, raw[520:980]) 48 | temp_raw, comp_raw = segment.get_raw_data('both', scale=True) 49 | scaled_temp = raw[10:480] * 0.1 50 | scaled_comp = raw[520:980] * 0.1 51 | np.testing.assert_array_almost_equal(temp_raw, scaled_temp, decimal=5) 52 | np.testing.assert_array_almost_equal(comp_raw, scaled_comp, decimal=5) 53 | 54 | def test_002_events_only(self): 55 | fname = self.generate_temp_filename() 56 | with Fast5File(fname, mode='w') as fh: 57 | fh.add_channel_info({'channel_number': 1, 58 | 'sampling_rate': 4000, 59 | 'digitisation': 8192, 60 | 'range': 819.2, 61 | 'offset': 0}) 62 | fh.add_read(12, 'unique_snowflake', 10000, 1000, 0, 120.75) 63 | with EventDetectionTools(fh, group_name='EventDetection_000', meta={'name': 'test'}) as evdet: 64 | data = np.zeros(100, dtype=[('start', int), ('length', int), ('mean', float), ('stdv', float)]) 65 | data['start'][2] = 10010 66 | data['start'][46] = 10470 67 | data['length'][46] = 10 68 | data['start'][53] = 10520 69 | data['start'][97] = 10960 70 | data['length'][97] = 20 71 | read_attrs = {'read_number': 12} 72 | evdet.set_event_data(data, read_attrs) 
73 | attrs = {'name': 'test', 'version': 0, 'time_stamp': 'just now', 74 | 'event_detection': 'Analyses/EventDetection_000'} 75 | fh.add_analysis('segmentation', 'Segmentation_000', attrs) 76 | segment_data = {'has_template': 1, 77 | 'has_complement': 1, 78 | 'start_event_template': 2, 79 | 'end_event_template': 47, 80 | 'start_event_complement': 53, 81 | 'end_event_complement': 98} 82 | fh.set_summary_data('Segmentation_000', 'segmentation', segment_data) 83 | with SegmentationTools(fh, group_name='Segmentation_000') as segment: 84 | results = segment.get_results() 85 | self.assertDictEqual({'has_template': True, 86 | 'has_complement': True, 87 | 'start_event_template': 2, 88 | 'end_event_template': 47, 89 | 'start_event_complement': 53, 90 | 'end_event_complement': 98, 91 | 'first_sample_template': 10, 92 | 'duration_template': 470, 93 | 'first_sample_complement': 520, 94 | 'duration_complement': 460}, results) 95 | --------------------------------------------------------------------------------
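A short orientation note, not part of the repository: the tests above all rely on the same public read-access pattern (get_fast5_file opens both single-read and multi-read files, get_reads yields Fast5Read objects, and get_raw_data returns the signal). Below is a minimal sketch of that pattern, using only calls already exercised in test_fast5_interface.py and test_hardlink_metadata.py; the input path is a placeholder pointing at the bundled multi-read test data.

    from ont_fast5_api.fast5_interface import get_fast5_file

    # Placeholder path: any single-read or multi-read fast5, e.g. the bundled test file.
    fast5_path = "test/data/multi_read/batch_0.fast5"

    with get_fast5_file(fast5_path, 'r') as f5:
        # get_reads() works for both Fast5File and MultiFast5File handles.
        for read in f5.get_reads():
            raw = read.get_raw_data()
            print(read.read_id, len(raw))

When specific reads are wanted, the same handle also offers get_read_ids() and get_read(read_id), which is the lookup style used in test_fast5_subset.py and test_multi_fast5.py.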