├── .pydevproject
├── CHANGELOG.md
├── LICENSE.md
├── MANIFEST.in
├── README.rst
├── docs
│   ├── Makefile
│   └── source
│       ├── conf.py
│       └── index.rst
├── img
│   └── ONT_logo.png
├── ont_fast5_api
│   ├── __init__.py
│   ├── analysis_tools
│   │   ├── __init__.py
│   │   ├── alignment.py
│   │   ├── base_tool.py
│   │   ├── basecall_1d.py
│   │   ├── basecall_2d.py
│   │   ├── event_detection.py
│   │   └── segmentation.py
│   ├── compression_settings.py
│   ├── conversion_tools
│   │   ├── __init__.py
│   │   ├── check_file_compression.py
│   │   ├── compress_fast5.py
│   │   ├── conversion_utils.py
│   │   ├── demux_fast5.py
│   │   ├── fast5_subset.py
│   │   ├── multi_to_single_fast5.py
│   │   └── single_to_multi_fast5.py
│   ├── data_sanitisation.py
│   ├── fast5_file.py
│   ├── fast5_info.py
│   ├── fast5_interface.py
│   ├── fast5_read.py
│   ├── helpers.py
│   ├── multi_fast5.py
│   ├── static_data.py
│   └── vbz_plugin
│       ├── libvbz_hdf_plugin.dylib
│       ├── libvbz_hdf_plugin_aarch64.so
│       ├── libvbz_hdf_plugin_m1.dylib
│       ├── libvbz_hdf_plugin_x86_64.so
│       └── vbz_hdf_plugin.dll
├── setup.py
└── test
    ├── __init__.py
    ├── data
    │   ├── basecall_2d_file_v1.0.fast5
    │   ├── hardlink
    │   │   ├── single_reads
    │   │   │   ├── 00031f3e-415c-4ab5-9c16-fb6fe45ff519.fast5
    │   │   │   ├── 000c0b4e-46c2-4fb5-9b17-d7031eefb975.fast5
    │   │   │   ├── 000ebd63-3e1a-4499-9ded-26af3225a022.fast5
    │   │   │   ├── 002ad0e4-c6bb-4eff-a30f-5fec01475ab8.fast5
    │   │   │   ├── 002b0891-03bf-4622-ae66-ae6984890ed4.fast5
    │   │   │   ├── 0048058c-ecb4-4a0f-b283-9a128bd598c5.fast5
    │   │   │   ├── 004a87b0-c9f6-4237-b4d6-466ab979aee2.fast5
    │   │   │   └── 0059d270-3238-4413-b38b-f588e28326df.fast5
    │   │   └── unlinked
    │   │       └── batch0.fast5
    │   ├── multi_read
    │   │   └── batch_0.fast5
    │   ├── multi_read_analyses
    │   │   └── batch_0.fast5
    │   ├── read_file_v0.6_raw.fast5
    │   ├── read_file_v0.6_single.fast5
    │   ├── read_file_v1.0_single.fast5
    │   ├── rle_basecall_table
    │   │   └── rle_example.fast5
    │   ├── single_read_analyses
    │   │   └── read.fast5
    │   ├── single_reads
    │   │   ├── fe85b517-62ee-4a33-8767-41cab5d5ab39.fast5
    │   │   ├── fe8a3026-d1f4-46b3-8daa-e610f27acde1.fast5
    │   │   ├── fe9374ee-b86a-4ca4-81dc-ac06e3297728.fast5
    │   │   └── read0.fast5
    │   ├── summaries
    │   │   └── two_barcode_summary.txt
    │   ├── telemetry_test.fast5
    │   └── vbz_reads
    │       └── vbz_reads.fast5
    ├── helpers.py
    ├── test_alignment_tools.py
    ├── test_basecall_1d_tools.py
    ├── test_basecall_2d_tools.py
    ├── test_check_compression.py
    ├── test_compress_fast5.py
    ├── test_compression_settings.py
    ├── test_data_sanitisation.py
    ├── test_demux_fast5.py
    ├── test_event_detection_tools.py
    ├── test_fast5_conversion_utils.py
    ├── test_fast5_converter.py
    ├── test_fast5_file.py
    ├── test_fast5_interface.py
    ├── test_fast5_subset.py
    ├── test_hardlink_metadata.py
    ├── test_multi_fast5.py
    └── test_segmentation_tools.py
/.pydevproject:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <?eclipse-pydev version="1.0"?>
3 | <pydev_project>
4 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
5 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
6 | </pydev_project>
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | All notable changes and fixes to ont_fast5_api will be documented here
3 |
4 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
5 | This project (aspires to) adhere to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
6 |
7 | ## [4.1.3]
8 |
9 | ### Added
10 | - Support for python up to 3.12
11 |
12 | ## [4.1.2]
13 |
14 | ### Added
15 | - Support for h5py>=3.9
16 | ### Changed
17 | - Renamed VBZ compression settings to make it clearer which version is used in production
18 | ### Removed
19 | - Support for python3.6
20 |
21 | ## [4.1.1]
22 |
23 | ### Fixed
24 | - Compatibility with numpy==1.24 unicode type
25 |
26 | ### Changed
27 | - Updated Windows VBZ Plugin dll
28 |
29 | ## [4.1.0]
30 |
31 | ### Added
32 | - Support for fast5_api on macOS-M1
33 |
34 | ## [4.0.2]
35 |
36 | ### Fixed
37 | - Fixed Fast5Read import error
38 |
39 | ## [4.0.1]
40 |
41 | ### Changed
42 | - Fixed unresolved reference in `compress_fast5.py`
43 | - Fixed issue with `compress_fast5.py` not retaining enumeration metadata for the end_reason attribute
44 | - Increased minimum h5py version to 2.10
45 |
46 | ## [4.0.0]
47 |
48 | ### Added
49 | - Script `demux_fast5` for demultiplexing fast5 reads based on column in summary file, e.g. for barcoded experiments
50 |
51 | ### Removed
52 | - Removed deb builds which are no longer supported
53 | - Python3.5 support
54 |
55 | ## [3.3.0] 2021-02-17
56 |
57 | ### Added
58 | - Added `yield_fast5_reads` to conversion_tools.
59 |
60 | ## [3.2.0] 2021-01-28
61 |
62 | ### Changed
63 | - Dropped support for older h5py/numpy versions, min now h5py>=2.8, numpy>=1.16
64 | - fast5_subset now displays errors (but continues processing) when it encounters input fast5 files it can't read.
65 |
66 | ### Added
67 | - Add support for explicitly specifying file drivers when loading
68 | multi-read fast5 files.
69 |
70 | ## [3.1.6] 2020-08-20
71 | ### Added
72 | - `compress_fast5` now has a `--sanitize` option to remove optional groups.
73 |
74 | ### Fixed
75 | - Correctly handle the case where h5pl can be imported but doesn't have the prepend() function available.
76 |
77 | ## [3.1.5] 2020-06-15
78 | ### Added
79 | - Added explicit requirements and checks to prevent running on Python 2.
80 |
81 | ## [3.1.4] 2020-06-12
82 | ### Fixed
83 | - Compression now works in `single_to_multi`.
84 |
85 | ## [3.1.3] 2020-05-28
86 | ### Fixed
87 | - Compression argument in `fast5_subset` and `single_to_multi` failed if not set
88 |
89 | ## [3.1.2] 2020-05-04
90 | ### Fixed
91 | - Compression argument in `fast5_subset` and `single_to_multi` was parsed incorrectly
92 |
93 | ## [3.1.1] 2020-04-03
94 | ### Fixed
95 | - Argument list for `fast5_subset` and `single_to_multi` had a syntax error
96 |
97 | ## [3.1.0] 2020-04-02
98 | ### Added
99 | - Hardlinking of metadata to prevent duplication and reduce filesize
100 | - Ability to enable compression when using `fast5_subset` and `single_to_multi`
101 | ### Fixed
102 | - `fast5_subset` thread pool could sometimes close before all tasks were completed
103 | - `fast5_subset` will create output directory if it doesn't exist
104 |
105 | ## [3.0.2] 2020-03-17
106 | ### Fixed
107 | - Comparison of file_versions could throw an error
108 |
109 | ## [3.0.1] 2020-01-29
110 | ### Fixed
111 | - Basecall1DTools could not load data from a Fast5Read
112 |
113 | ## [3.0.0] 2020-01-20
114 | ### Removed
115 | - python2 compatibility
116 | ### Fixed
117 | - minor documentation errors: https://github.com/nanoporetech/ont_fast5_api/issues/28
118 |
119 | ## [2.1.0] 2019-12-16
120 | ### Added
121 | - Script to check the compression type of fast5 files in a folder
122 | - `compress_fast5` can now be used `--in_place`
123 | ### Fixed
124 | - Reading arrays with padded strings now succeeds (on h5py>2.7)
125 | - Compatibility bugs with h5py==2.6 now raises appropriate errors
126 | - Fast5File now has attribute read_id to match documentation
127 | ### Changed
128 | - Now use standard settings for gzip compression (gzip=1, shuffle=None)
129 | - Inverted dependency between `Fast5File` and `Fast5Read` so `Fast5Read` is now the primary object
130 |
131 | ## [2.0.1] 2019-11-28
132 | ### Added
133 | - Option to `--ignore_symlinks` in fast5 conversion scripts
134 | - Explicit check of file_type for determining single/multi-read files
135 | ### Fixed
136 | - `fast5_subset` with single read fast5s was failing
137 | - unit test data now cleaned up properly
138 |
139 | ## [2.0.0] 2019-11-19
140 | ### Added
141 | - Compatibility for VBZ compressed reads
142 | - `compress_fast5` script for compressing/decompressing fast5 files
143 | - `get_reads()` helper method to more easily loop through reads in a fast5 file
144 | ### Changed
145 | - `Fast5File().get_raw_data()` updated interface to match `Fast5Read` and remove support for legacy files with multiple read numbers in a single `Fast5File`
146 | - Minimum dependency version requirements bumped. Set to Ubuntu16 `apt` python3-package defaults
147 | ### Removed
148 | - Legacy `Fast5Writer` object. `MultiReadFast5` or `EmptyFast5File` are preferred
149 |
150 | ## [1.4.9] 2019-11-01
151 | ### Added
152 | - Check for progressbar2 package and fail early if it's installed.
153 |
154 | ## [1.4.8] 2019-10-22
155 | ### Added
156 | - Support for h5py==2.10 string data type encoding changes
157 | ### Fixed
158 | - Corrected some "for for" typos in argparse help text.
159 |
160 | ## [1.4.7] 2019-07-29
161 | ### Fixed
162 | - Bug in read string and read_id concatenation resulted in broken output file
163 |
164 | ## [1.4.6] 2019-07-03
165 | ### Added
166 | - Updated fast5_subset script to extract also from single-read fast5 files
167 | ### Changed
168 | - Renamed fast5_subset source script from multi_fast5_subset.py to fast5_subset.py
169 |
170 | ## [1.4.5] 2019-07-01
171 | ### Fixed
172 | - Bug in number of processes being 0 when batch size is greater than number of reads (py2)
173 |
174 | ## [1.4.4] 2019-06-18
175 | ### Fixed
176 | - Bug in path name output from pathlib changes
177 |
178 | ## [1.4.3] 2019-06-12
179 | ### Fixed
180 | - Bug with apt-install and pathlib2
181 |
182 | ## [1.4.2] 2019-06-10
183 | ### Fixed
184 | - get_raw_data() now works with scale=True when start,end are None
185 |
186 | ## [1.4.1] 2019-06-06
187 | ### Added
188 | - Useful error message if no input files found
189 | ### Fixed
190 | - filename_mapping output gave incorrect filenames
191 |
192 | ## [1.4.0] 2019-05-29
193 | ### Added
194 | - Script for extracting reads by id from `multi_read` files
195 |
196 | ## [1.3.0] 2019-03-01
197 | ### Fixed
198 | - Bug in output to `filename_mapping.txt`
199 |
200 | ## [1.2.0] 2019-01-11
201 | ### Added
202 | - Multi-threading support for multi<->single conversion for improved performance
203 |
204 | ### Fixed
205 | - Removed incorrect license accidentally added to README
206 |
207 | ## [1.1.1] 2019-01-10
208 | ### Changed
209 | - Minor documentation updates
210 | - Follow symlinks when finding files recursively
211 |
212 | ## [1.1.0] 2019-01-07
213 | ### Added
214 | - Generic single- and multi- read interface via `get_fast5_file`
215 |
216 | ### Fixed
217 | - Incorrect time estimates for single-multi conversion
218 | - Fixed path creation if not exist
219 |
220 | ## [1.0.1] 2018-09-26
221 | ### Added
222 | - Support for multi-read fast5 files
223 | - Conversion tools for single-multi read files
224 |
225 | ### Fixed
226 | - Support for deprecated interface to Basecall2D following 0.4.0, support will end in v1.x.x
227 |
228 |
229 | ## [0.4.0] 2017-07-16 (internal only)
230 | ### Fixed
231 | - Basecall1d and Basecall2d raise consistent KeyError when fastq data missing
232 |
233 | ### Changed
234 | - Interface to Basecall1d and Basecall2d unified for add_sequence() and get_sequence()
235 |
236 |
237 | ## [0.3.3] 2017-06-23
238 | ### Added
239 | - Fast5 file now supports logging via 'Fast5File.add_log()'
240 |
241 | ### Fixed
242 | - Invalid component names no longer checked against LEGACY_COMPONENTS
243 | - Raise KeyError when fastq data missing from Basecall1d
244 | - median_before and start_mux populate correctly with sensible defaults
245 |
246 |
247 | ## [0.3.2] 2017-03-22
248 | ### Added
249 | Major release - changes not logged before this point
250 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # This file tells sdist which additional files to include in the distribution it builds.
2 | # That distribution is used as the base for building the .deb with stdeb, and certain files
3 | # (such as header files and .md files) are not included by default.
4 | # See https://docs.python.org/2/distutils/sourcedist.html#manifest-template
5 |
6 | include README.md
7 | include LICENSE.md
8 | prune test
9 | prune build
10 | prune docs
11 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = build
9 | VERSION ?= unknown
10 |
11 | # Internal variables.
12 | PAPEROPT_a4 = -D latex_paper_size=a4
13 | PAPEROPT_letter = -D latex_paper_size=letter
14 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) -D version=$(VERSION) -D release=$(VERSION) $(SPHINXOPTS) source
15 | # the i18n builder cannot share the environment and doctrees with the others
16 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
17 |
18 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
19 |
20 | help:
21 | @echo "Please use \`make <target>' where <target> is one of"
22 | @echo " api to autogenerate API documentation"
23 | @echo " html to make standalone HTML files"
24 | @echo " dirhtml to make HTML files named index.html in directories"
25 | @echo " singlehtml to make a single large HTML file"
26 | @echo " pickle to make pickle files"
27 | @echo " json to make JSON files"
28 | @echo " htmlhelp to make HTML files and a HTML help project"
29 | @echo " qthelp to make HTML files and a qthelp project"
30 | @echo " devhelp to make HTML files and a Devhelp project"
31 | @echo " epub to make an epub"
32 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
33 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
34 | @echo " text to make text files"
35 | @echo " man to make manual pages"
36 | @echo " texinfo to make Texinfo files"
37 | @echo " info to make Texinfo files and run them through makeinfo"
38 | @echo " gettext to make PO message catalogs"
39 | @echo " changes to make an overview of all changed/added/deprecated items"
40 | @echo " linkcheck to check all external links for integrity"
41 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
42 |
43 | clean:
44 | $(eval NON_INDEX_FILES := $(filter-out source/index.rst, $(wildcard source/*.rst)))
45 | -rm -rf $(BUILDDIR)
46 | mkdir $(BUILDDIR)
47 | ifneq ($(NON_INDEX_FILES),)
48 | rm $(NON_INDEX_FILES)
49 | endif
50 |
51 | api:
52 | $(eval NON_INDEX_FILES := $(filter-out source/index.rst, $(wildcard source/*.rst)))
53 | ifneq ($(NON_INDEX_FILES),)
54 | rm $(NON_INDEX_FILES)
55 | endif
56 | sphinx-apidoc --no-toc -o source/ ..
57 | rm source/test.rst
58 | rm source/setup.rst
59 | @echo
60 | @echo "API gubbins generated in source directory for version $(VERSION)."
61 |
62 | html:
63 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
64 | @echo
65 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
66 |
67 | dirhtml:
68 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
69 | @echo
70 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
71 |
72 | singlehtml:
73 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
74 | @echo
75 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
76 |
77 | pickle:
78 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
79 | @echo
80 | @echo "Build finished; now you can process the pickle files."
81 |
82 | json:
83 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
84 | @echo
85 | @echo "Build finished; now you can process the JSON files."
86 |
87 | htmlhelp:
88 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
89 | @echo
90 | @echo "Build finished; now you can run HTML Help Workshop with the" \
91 | ".hhp project file in $(BUILDDIR)/htmlhelp."
92 |
93 | qthelp:
94 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
95 | @echo
96 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \
97 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
98 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/fast5_api.qhcp"
99 | @echo "To view the help file:"
100 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/fast5_api.qhc"
101 |
102 | devhelp:
103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
104 | @echo
105 | @echo "Build finished."
106 | @echo "To view the help file:"
107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/ont_fast5_api"
108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/ont_fast5_api"
109 | @echo "# devhelp"
110 |
111 | epub:
112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
113 | @echo
114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
115 |
116 | latex:
117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
118 | @echo
119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
121 | "(use \`make latexpdf' here to do that automatically)."
122 |
123 | latexpdf:
124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
125 | @echo "Running LaTeX files through pdflatex..."
126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
128 |
129 | text:
130 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
131 | @echo
132 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
133 |
134 | man:
135 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
136 | @echo
137 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
138 |
139 | texinfo:
140 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
141 | @echo
142 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
143 | @echo "Run \`make' in that directory to run these through makeinfo" \
144 | "(use \`make info' here to do that automatically)."
145 |
146 | info:
147 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
148 | @echo "Running Texinfo files through makeinfo..."
149 | make -C $(BUILDDIR)/texinfo info
150 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
151 |
152 | gettext:
153 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
154 | @echo
155 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
156 |
157 | changes:
158 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
159 | @echo
160 | @echo "The overview file is in $(BUILDDIR)/changes."
161 |
162 | linkcheck:
163 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
164 | @echo
165 | @echo "Link check complete; look for any errors in the above output " \
166 | "or in $(BUILDDIR)/linkcheck/output.txt."
167 |
168 | doctest:
169 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
170 | @echo "Testing of doctests in the sources finished, look at the " \
171 | "results in $(BUILDDIR)/doctest/output.txt."
172 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # fast5_api documentation build configuration file, created by
4 | # sphinx-quickstart on Fri Nov 21 09:32:46 2014.
5 | #
6 | # This file is execfile()d with the current directory set to its containing dir.
7 | #
8 | # Note that not all possible configuration values are present in this
9 | # autogenerated file.
10 | #
11 | # All configuration values have a default; values that are commented out
12 | # serve to show the default.
13 |
14 | import sys, os
15 | sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
16 | sys.path.insert(0, os.path.abspath(os.path.join('..', '..', 'ont_fast5_api')))
17 |
18 | # If extensions (or modules to document with autodoc) are in another directory,
19 | # add these directories to sys.path here. If the directory is relative to the
20 | # documentation root, use os.path.abspath to make it absolute, like shown here.
21 | #sys.path.insert(0, os.path.abspath('.'))
22 |
23 | # -- General configuration -----------------------------------------------------
24 |
25 | # If your documentation needs a minimal Sphinx version, state it here.
26 | #needs_sphinx = '1.0'
27 |
28 | # Add any Sphinx extension module names here, as strings. They can be extensions
29 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
30 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode']
31 |
32 | # Add any paths that contain templates here, relative to this directory.
33 | templates_path = ['_templates']
34 |
35 | # The suffix of source filenames.
36 | source_suffix = '.rst'
37 |
38 | # The encoding of source files.
39 | #source_encoding = 'utf-8-sig'
40 |
41 | # The master toctree document.
42 | master_doc = 'index'
43 |
44 | # General information about the project.
45 | project = u'ont_fast5_api'
46 | copyright = u'2016, Oxford Nanopore Technologies'
47 |
48 | # The version info for the project you're documenting, acts as replacement for
49 | # |version| and |release|, also used in various other places throughout the
50 | # built documents.
51 | #
52 | # The short X.Y version.
53 | version = '1.6.2'
54 | # The full version, including alpha/beta/rc tags.
55 | release = '1.6.2'
56 |
57 | # The language for content autogenerated by Sphinx. Refer to documentation
58 | # for a list of supported languages.
59 | #language = None
60 |
61 | # There are two options for replacing |today|: either, you set today to some
62 | # non-false value, then it is used:
63 | #today = ''
64 | # Else, today_fmt is used as the format for a strftime call.
65 | #today_fmt = '%B %d, %Y'
66 |
67 | # List of patterns, relative to source directory, that match files and
68 | # directories to ignore when looking for source files.
69 | exclude_patterns = []
70 |
71 | # The reST default role (used for this markup: `text`) to use for all documents.
72 | #default_role = None
73 |
74 | # If true, '()' will be appended to :func: etc. cross-reference text.
75 | #add_function_parentheses = True
76 |
77 | # If true, the current module name will be prepended to all description
78 | # unit titles (such as .. function::).
79 | #add_module_names = True
80 |
81 | # If true, sectionauthor and moduleauthor directives will be shown in the
82 | # output. They are ignored by default.
83 | #show_authors = False
84 |
85 | # The name of the Pygments (syntax highlighting) style to use.
86 | pygments_style = 'sphinx'
87 |
88 | # A list of ignored prefixes for module index sorting.
89 | #modindex_common_prefix = []
90 |
91 |
92 | # -- Options for HTML output ---------------------------------------------------
93 |
94 | # The theme to use for HTML and HTML Help pages. See the documentation for
95 | # a list of builtin themes.
96 | html_theme = 'sphinxdoc'
97 |
98 | # Theme options are theme-specific and customize the look and feel of a theme
99 | # further. For a list of options available for each theme, see the
100 | # documentation.
101 | #html_theme_options = {}
102 |
103 | # Add any paths that contain custom themes here, relative to this directory.
104 | #html_theme_path = []
105 |
106 | # The name for this set of Sphinx documents. If None, it defaults to
107 | # "<project> v<release> documentation".
108 | #html_title = None
109 |
110 | # A shorter title for the navigation bar. Default is the same as html_title.
111 | #html_short_title = None
112 |
113 | # The name of an image file (relative to this directory) to place at the top
114 | # of the sidebar.
115 | #html_logo = None
116 |
117 | # The name of an image file (within the static path) to use as favicon of the
118 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
119 | # pixels large.
120 | #html_favicon = None
121 |
122 | # Add any paths that contain custom static files (such as style sheets) here,
123 | # relative to this directory. They are copied after the builtin static files,
124 | # so a file named "default.css" will overwrite the builtin "default.css".
125 | html_static_path = ['_static']
126 |
127 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
128 | # using the given strftime format.
129 | #html_last_updated_fmt = '%b %d, %Y'
130 |
131 | # If true, SmartyPants will be used to convert quotes and dashes to
132 | # typographically correct entities.
133 | #html_use_smartypants = True
134 |
135 | # Custom sidebar templates, maps document names to template names.
136 | #html_sidebars = {}
137 |
138 | # Additional templates that should be rendered to pages, maps page names to
139 | # template names.
140 | #html_additional_pages = {}
141 |
142 | # If false, no module index is generated.
143 | #html_domain_indices = True
144 |
145 | # If false, no index is generated.
146 | #html_use_index = True
147 |
148 | # If true, the index is split into individual pages for each letter.
149 | #html_split_index = False
150 |
151 | # If true, links to the reST sources are added to the pages.
152 | #html_show_sourcelink = True
153 |
154 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
155 | #html_show_sphinx = True
156 |
157 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
158 | #html_show_copyright = True
159 |
160 | # If true, an OpenSearch description file will be output, and all pages will
161 | # contain a <link> tag referring to it. The value of this option must be the
162 | # base URL from which the finished HTML is served.
163 | #html_use_opensearch = ''
164 |
165 | # This is the file name suffix for HTML files (e.g. ".xhtml").
166 | #html_file_suffix = None
167 |
168 | # Output file base name for HTML help builder.
169 | htmlhelp_basename = 'fast5_api_doc'
170 |
171 |
172 | # -- Options for LaTeX output --------------------------------------------------
173 |
174 | latex_elements = {
175 | # The paper size ('letterpaper' or 'a4paper').
176 | #'papersize': 'letterpaper',
177 |
178 | # The font size ('10pt', '11pt' or '12pt').
179 | #'pointsize': '10pt',
180 |
181 | # Additional stuff for the LaTeX preamble.
182 | #'preamble': '',
183 | }
184 |
185 | # Grouping the document tree into LaTeX files. List of tuples
186 | # (source start file, target name, title, author, documentclass [howto/manual]).
187 | latex_documents = [
188 | ('index', 'fast5_api.tex', u'fast5_api Documentation',
189 | u'Kevin Dolan, Forrest Brennen', 'manual'),
190 | ]
191 |
192 | # The name of an image file (relative to this directory) to place at the top of
193 | # the title page.
194 | #latex_logo = None
195 |
196 | # For "manual" documents, if this is true, then toplevel headings are parts,
197 | # not chapters.
198 | #latex_use_parts = False
199 |
200 | # If true, show page references after internal links.
201 | #latex_show_pagerefs = False
202 |
203 | # If true, show URL addresses after external links.
204 | #latex_show_urls = False
205 |
206 | # Documents to append as an appendix to all manuals.
207 | #latex_appendices = []
208 |
209 | # If false, no module index is generated.
210 | #latex_domain_indices = True
211 |
212 |
213 | # -- Options for manual page output --------------------------------------------
214 |
215 | # One entry per manual page. List of tuples
216 | # (source start file, name, description, authors, manual section).
217 | man_pages = [
218 | ('index', 'ont_fast5_api', u'ont_fast5_api Documentation',
219 | [u'Kevin Dolan, Forrest Brennen'], 1)
220 | ]
221 |
222 | # If true, show URL addresses after external links.
223 | #man_show_urls = False
224 |
225 |
226 | # -- Options for Texinfo output ------------------------------------------------
227 |
228 | # Grouping the document tree into Texinfo files. List of tuples
229 | # (source start file, target name, title, author,
230 | # dir menu entry, description, category)
231 | texinfo_documents = [
232 | ('index', 'ont_fast5_api', u'ont_fast5_api Documentation',
233 | u'Kevin Dolan, Forrest Brennen', 'fast5_api', 'One line description of project.',
234 | 'Miscellaneous'),
235 | ]
236 |
237 | # Documents to append as an appendix to all manuals.
238 | #texinfo_appendices = []
239 |
240 | # If false, no module index is generated.
241 | #texinfo_domain_indices = True
242 |
243 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
244 | #texinfo_show_urls = 'footnote'
245 |
246 | # Included to display docstrings from class __init__() functions.
247 | autoclass_content = "both"
248 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. ont_fast5_api documentation master file, created by
2 | sphinx-quickstart on Fri Nov 21 09:32:46 2014.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | .. include:: ../../README.rst
7 |
8 | Contents:
9 |
10 | .. toctree::
11 | :maxdepth: 4
12 | :glob:
13 |
14 | ont_fast5_api
15 |
16 |
17 | Indices and tables
18 | ==================
19 |
20 | * :ref:`genindex`
21 | * :ref:`modindex`
22 | * :ref:`search`
--------------------------------------------------------------------------------
/img/ONT_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/img/ONT_logo.png
--------------------------------------------------------------------------------
/ont_fast5_api/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '4.1.3'
2 | __version_info__ = tuple([int(num) for num in __version__.split('.')])
3 | CURRENT_FAST5_VERSION = 2.0
4 |
5 | import sys
6 | if sys.version_info < (3,):
7 | raise ImportError(
8 | """ont-fast5-api requires Python 3.7
9 |
10 | Somehow you have ended up running this on Python 2, which reached its end of
11 | life in 2020. Apologies! To avoid this issue, either:
12 |
13 | - Upgrade to Python 3, or
14 |
15 | - Download an older ont-fast5-api version:
16 |
17 | $ pip install 'ont-fast5-api<3.0'
18 |
19 | Note that you will be missing features and bug fixes by running older versions
20 | of ont-fast5-api.
21 |
22 | """)
23 |
24 | # Set up a default NullHandler in case we don't end up using another one
25 | # Taken from http://docs.python-guide.org/en/latest/writing/logging/
26 | import logging
27 | logging.getLogger(__name__).addHandler(logging.NullHandler())
28 |
29 | from ont_fast5_api.compression_settings import register_plugin
30 | register_plugin()
31 |
--------------------------------------------------------------------------------
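A minimal usage sketch, not part of the repository: importing the package runs register_plugin() as shown above, so the bundled VBZ HDF5 filter is available before any fast5 file is opened. The prints simply echo the module-level constants.

    import ont_fast5_api

    # The import above has already called register_plugin(), so h5py can
    # decompress VBZ-encoded datasets in any file opened afterwards.
    print(ont_fast5_api.__version__)             # e.g. '4.1.3'
    print(ont_fast5_api.__version_info__)        # e.g. (4, 1, 3)
    print(ont_fast5_api.CURRENT_FAST5_VERSION)   # 2.0
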
/ont_fast5_api/analysis_tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/ont_fast5_api/analysis_tools/__init__.py
--------------------------------------------------------------------------------
/ont_fast5_api/analysis_tools/alignment.py:
--------------------------------------------------------------------------------
1 | """ Helper class for working with alignment type analyses.
2 | """
3 | import numpy as np
4 |
5 | from ont_fast5_api.analysis_tools.base_tool import BaseTool
6 | from ont_fast5_api.fast5_file import Fast5File
7 | from ont_fast5_api.analysis_tools.segmentation import SegmentationTools
8 | from ont_fast5_api.fast5_read import Fast5Read
9 |
10 |
11 | class AlignmentTools(BaseTool):
12 | """ Provides helper methods specific to alignment analyses.
13 | """
14 |
15 | def __init__(self, source, mode='r', group_name=None, meta=None, config=None):
16 | """ Create a new alignment tools object.
17 |
18 | :param source: Either an open Fast5File object, or a filename
19 | of a fast5 file.
20 | :param mode: The open mode (r or r+). Only if a filename is used
21 | for the source argument.
22 | :param group_name: The specific alignment analysis instance
23 | you are interested in.
24 | :param meta: Metadata for a new alignment analysis.
25 | :param config: Configuration data for a new alignment analysis.
26 |
27 | To create a new alignment analysis, provide a group name that
28 | does not already exist, and an optional dictionary with the metadata.
29 | The following fields are recommended, as a minimum:
30 |
31 | * name - The name of the basecall software used.
32 | * time_stamp - The time at which the analysis was performed.
33 |
34 | If the group name already exists, the "meta" parameter is ignored. If
35 | the specified group has a "component" attribute, and its value is not
36 | "alignment", an exception will be thrown.
37 | """
38 | if isinstance(source, Fast5Read):
39 | self.handle = source
40 | self.close_handle_when_done = False
41 | elif isinstance(source, str):
42 | self.handle = Fast5File(source, mode)
43 | self.close_handle_when_done = True
44 | else:
45 | raise Exception('Unrecognized type for argument "source".')
46 | if group_name is None:
47 | group_name = self.handle.get_latest_analysis('Alignment')
48 | if group_name is None:
49 | raise Exception('No Alignment analysis group found in file.')
50 | self.group_name = group_name
51 | attrs = self.handle.get_analysis_attributes(group_name)
52 | if attrs is None:
53 | if meta is None:
54 | meta = {}
55 | self.handle.add_analysis('alignment', group_name, meta, config)
56 | attrs = self.handle.get_analysis_attributes(group_name)
57 | if ('component' in attrs
58 | and attrs['component'] not in ['alignment',
59 | 'calibration_strand']):
60 | self.close()
61 | raise Exception('Analysis does not appear to be an alignment component.')
62 |
63 | def get_results(self):
64 | """ Get details about the alignments that have been performed.
65 |
66 | :return: A dict of dicts.
67 |
68 | The keys of the top level are 'template', 'complement' and '2d'.
69 | Each of these dicts contains the following fields:
70 |
71 | * status: Can be 'no data', 'no match found', or 'match found'.
72 | * direction: Can be 'forward', 'reverse'.
73 | * ref_name: Name of reference.
74 | * ref_span: Section of reference aligned to, as a tuple (start, end).
75 | * seq_span: Section of the called sequence that aligned, as a tuple (start, end).
76 | * seq_len: Total length of the called sequence.
77 | * num_aligned: Number of bases that aligned to bases in the reference.
78 | * num_correct: Number of aligned bases that match the reference.
79 | * num_deletions: Number of bases in the aligned section of the
80 | reference that are not aligned to bases in the called sequence.
81 | * num_insertions: Number of bases in the aligned section of the called
82 | sequence that are not aligned to bases in the reference.
83 | * identity: The fraction of aligned bases that are correct (num_correct /
84 | num_aligned).
85 | * accuracy: The overall basecall accuracy, according to the alignment.
86 | (num_correct / (num_aligned + num_deletions + num_insertions)).
87 |
88 | Note that if the status field is not 'match found', then all the other
89 | fields will be absent.
90 | """
91 | summary = self.handle.get_summary_data(self.group_name)
92 | results = {'template': {'status': 'no data'},
93 | 'complement': {'status': 'no data'},
94 | '2d': {'status': 'no data'}}
95 | if 'genome_mapping_template' in summary:
96 | results['template'] = self._get_results(summary['genome_mapping_template'])
97 | if 'genome_mapping_complement' in summary:
98 | results['complement'] = self._get_results(summary['genome_mapping_complement'])
99 | if 'genome_mapping_2d' in summary:
100 | results['2d'] = self._get_results(summary['genome_mapping_2d'])
101 | return results
102 |
103 | def get_alignment_data(self, section):
104 | """ Get the alignment SAM and Fasta, if present.
105 |
106 | :param section: Can be 'template', 'complement', or '2d'.
107 | :return: A tuple containing the SAM and the section of the reference
108 | aligned to (both as strings). Returns None if no alignment is
109 | present for that section.
110 | """
111 | subgroup = '{}/Aligned_{}'.format(self.group_name, section)
112 | sam = self.handle.get_analysis_dataset(subgroup, 'SAM')
113 | fasta = self.handle.get_analysis_dataset(subgroup, 'Fasta')
114 | if sam is None or fasta is None:
115 | return None
116 | sequence = fasta.split('\n')[1]
117 | return sam, sequence
118 |
119 | def add_alignment_data(self, section, sam, sequence):
120 | """ Add the SAM and Fasta alignment data for a section.
121 |
122 | :param section: Can be 'template', 'complement', or '2d'.
123 | :param sam: A string containing the SAM contents.
124 | :param sequence: A string containing the section of the
125 | reference the basecall aligned to.
126 | """
127 | subgroup = 'Aligned_{}'.format(section)
128 |         if subgroup not in self.handle.handle['Analyses/{}'.format(self.group_name)]:
129 | self.handle.add_analysis_subgroup(self.group_name, subgroup)
130 | sam_arr = np.array(sam, dtype=str)
131 | self.handle.add_analysis_dataset('{}/{}'.format(self.group_name, subgroup), 'SAM', sam_arr)
132 | fasta_arr = np.array('>{}\n{}\n'.format(section, sequence), dtype=str)
133 | self.handle.add_analysis_dataset('{}/{}'.format(self.group_name, subgroup), 'Fasta', fasta_arr)
134 |
135 | def calculate_speed(self, section, alignment_results=None):
136 | """ Calculate speed using alignment information.
137 |
138 | :param section: The section (template or complement) we're calculating
139 | speed for.
140 | :param alignment_results: Optional dictionary of the alignment summary,
141 | so that speed can be calculated without having to write the summary
142 | out to the fast5 file first.
143 | :return: Speed in bases per second or zero if the speed could not be
144 | calculated.
145 |
146 | The only reliable way we have of finding out how many bases have gone through the pore is by
147 | looking at how much of the reference the sequence aligned to. This takes that information and
148 | uses it to calculate speed in reference-bases-per-second.
149 | """
150 | speed = 0.0
151 | if alignment_results:
152 | results = self._get_results(alignment_results)
153 | else:
154 | results = self.get_results()[section]
155 | if results['status'] != 'match found':
156 | return 0.0
157 | ref_span = results['ref_span']
158 | ref_len = ref_span[1] - ref_span[0]
159 | seq_span = results['seq_span']
160 | seq_len = seq_span[1] - seq_span[0]
161 | total_len = results['seq_len']
162 |
163 | sample_rate = self.handle.get_channel_info()['sampling_rate']
164 |
165 | # We need the duration from the segmentation results
166 | chain = self.handle.get_chain(self.group_name)
167 | if chain is not None:
168 | segmentation_group = dict(chain).get('segmentation')
169 | else:
170 | segmentation_group = None
171 | duration = 0
172 | if segmentation_group is not None:
173 | with SegmentationTools(self.handle, group_name=segmentation_group) as seg:
174 | summary = seg.get_results()
175 | if summary is not None:
176 | duration = summary['duration_{}'.format(section)]
177 | if duration == 0:
178 | return 0.0
179 |
180 | normalized_duration = duration * seq_len / float(total_len)
181 | speed = sample_rate * ref_len / normalized_duration
182 | return speed
183 |
184 | ##########################
185 | #
186 | # Private methods below
187 | #
188 | ##########################
189 |
190 | def _get_results(self, summary):
191 | results = {'status': 'no data'}
192 | ref_name = summary['genome']
193 | if ref_name == 'no_match':
194 | results['status'] = 'no match found'
195 | return results
196 | results['status'] = 'match found'
197 | results['direction'] = 'forward'
198 | if ref_name.endswith('_rc'):
199 | ref_name = ref_name[:-3]
200 | results['direction'] = 'reverse'
201 | results['ref_name'] = ref_name
202 | results['ref_span'] = (summary['genome_start'], summary['genome_end'])
203 | results['seq_span'] = (summary['strand_start'], summary['strand_end'])
204 | results['seq_len'] = summary['num_events']
205 | results.update({key: summary[key] for key in ['num_aligned', 'num_correct', 'num_insertions',
206 | 'num_deletions', 'identity', 'accuracy']})
207 | return results
208 |
--------------------------------------------------------------------------------
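A minimal usage sketch for AlignmentTools based on the docstrings above; it is not part of the repository, the filename is hypothetical, and it assumes a single-read fast5 containing an Alignment analysis group with summary data.

    from ont_fast5_api.analysis_tools.alignment import AlignmentTools

    # A Fast5Read object or (as here) a single-read fast5 filename can be passed.
    with AlignmentTools('example_read.fast5', mode='r') as align:  # hypothetical path
        results = align.get_results()            # keyed by 'template', 'complement', '2d'
        template = results['template']
        if template['status'] == 'match found':
            # Speed is estimated in reference-bases per second from the aligned span.
            speed = align.calculate_speed('template')
            print(template['ref_name'], template['accuracy'], speed)
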
/ont_fast5_api/analysis_tools/base_tool.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from abc import abstractmethod
3 |
4 | from ont_fast5_api.fast5_file import Fast5File, Fast5FileTypeError
5 | from ont_fast5_api.fast5_read import Fast5Read
6 |
7 |
8 | class BaseTool(object):
9 | @property
10 | def group_id(self):
11 | raise NotImplementedError("BaseTool does not have a group_id")
12 |
13 | @property
14 | def analysis_id(self):
15 |         raise NotImplementedError("BaseTool does not have an analysis_id")
16 |
17 | def __init__(self, source, mode='r', group_name=None, meta=None, config=None):
18 | """ Create a new analysis_tools object.
19 |
20 | :param source: Either an open Fast5File object, or a filename
21 | of a fast5 file.
22 | :param mode: The open mode (r or r+). Only if a filename is used
23 | for the source argument.
24 | :param group_name: The specific analysis instance you are interested in.
25 | :param meta: Metadata for a new analysis.
26 | :param config: Configuration data for a new analysis.
27 |
28 | To create a new analysis group, provide a group name that
29 | does not already exist, and an optional dictionary with the metadata.
30 | The following fields are recommended, as a minimum:
31 |
32 | * name - The name of the software used.
33 | * time_stamp - The time at which the analysis was performed.
34 |
35 | If the group name already exists, the "meta" parameter is ignored. If
36 | the specified group has a "component" attribute, and its value does not
37 | match self.analysis_id, an exception will be thrown.
38 | """
39 | if isinstance(source, Fast5Read):
40 | self.filename = source.filename # Useful for debugging purposes
41 | self.handle = source
42 | self.close_handle_when_done = False
43 | elif isinstance(source, str):
44 | self.filename = source # Useful for debugging purposes
45 | try:
46 | self.handle = Fast5File(source, mode)
47 |             except Fast5FileTypeError:
48 | raise NotImplementedError("AnalysisTools do not support accessing MultiReadFast5 files by filepath")
49 | self.close_handle_when_done = True
50 | else:
51 | raise KeyError('Unrecognized type for argument "source": {}'.format(source))
52 | if group_name is None:
53 | group_name = self.handle.get_latest_analysis(self.group_id)
54 | if group_name is None:
55 | raise KeyError('No group: {} found in file: {}'.format(group_name, self.filename))
56 | self.group_name = group_name
57 | attrs = self.handle.get_analysis_attributes(group_name)
58 |
59 | if attrs is None:
60 | self.handle.add_analysis(self.analysis_id, group_name, meta, config)
61 | attrs = self.handle.get_analysis_attributes(group_name)
62 | if 'component' in attrs and attrs['component'] != self.analysis_id:
63 | raise ValueError('Component {} is not {}'.format(attrs.get('component'), self.analysis_id))
64 |
65 | def __enter__(self):
66 | return self
67 |
68 | def __exit__(self, exception_type, exception_value, traceback):
69 | self.close()
70 | return False
71 |
72 | def close(self):
73 | """ Closes the object.
74 | """
75 | if self.handle and self.close_handle_when_done:
76 | self.handle.close()
77 |
--------------------------------------------------------------------------------
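An illustrative sketch, not part of the repository, of how concrete tools plug into BaseTool: a subclass only needs to define group_id and analysis_id, exactly as Basecall1DTools and the other helpers below do. The class name, identifiers and filename here are hypothetical.

    from ont_fast5_api.analysis_tools.base_tool import BaseTool

    class ExampleTools(BaseTool):     # hypothetical subclass for illustration
        group_id = 'Example'          # latest 'Example_###' analysis group is looked up
        analysis_id = 'example'       # compared against the group's 'component' attribute

    # Opening by filename only works for single-read fast5 files; multi-read files
    # must be accessed through a Fast5Read object instead.
    with ExampleTools('example_read.fast5', mode='r') as tool:  # hypothetical path
        print(tool.group_name)
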
/ont_fast5_api/analysis_tools/basecall_1d.py:
--------------------------------------------------------------------------------
1 | """ Helper class for working with 1D basecall type analyses.
2 | """
3 | import numpy as np
4 |
5 | from ont_fast5_api.analysis_tools.base_tool import BaseTool
6 |
7 |
8 | class Basecall1DTools(BaseTool):
9 | """ Provides helper methods specific to 1D basecall analyses.
10 | """
11 | group_id = 'Basecall_1D'
12 | analysis_id = 'basecall_1d'
13 |
14 |
15 | def get_event_data(self, section):
16 | """ Return either the template or complement event data, if present.
17 |
18 | :param section: Either template or complement.
19 | :return: Event data table.
20 | """
21 | event_group = '{}/BaseCalled_{}'.format(self.group_name, section)
22 | data = self.handle.get_analysis_dataset(event_group, 'Events')
23 | return data
24 |
25 | def add_event_data(self, section, data):
26 | """ Add template or complement basecalled event data.
27 |
28 | :param section: Either template or complement.
29 | :param data: Event data table to be written.
30 | """
31 | event_group = 'BaseCalled_{}'.format(section)
32 |         if event_group not in self.handle.handle['Analyses/{}'.format(self.group_name)]:
33 | self.handle.add_analysis_subgroup(self.group_name, event_group)
34 | self.handle.add_analysis_dataset('{}/{}'.format(self.group_name, event_group), 'Events', data)
35 |
36 | def get_called_sequence(self, section, fastq=False):
37 |         """ Return the called sequence data, if present.
38 |
39 | :param section: ['template', 'complement' or '2D']
40 | :param fastq: If True, return a single, multiline fastq string. If
41 | False, return a tuple of (name, sequence, qstring).
42 | :return: Either the fastq string or the (name, sequence, qstring) tuple.
43 | """
44 |
45 | event_group = '{}/BaseCalled_{}'.format(self.group_name, section)
46 | data = self.handle.get_analysis_dataset(event_group, 'Fastq')
47 | if data is None:
48 | raise KeyError("No fastq data in: {} {}".format(event_group, self.filename))
49 | if fastq:
50 | return data
51 | name, sequence, _, qstring = data.strip().split('\n')
52 | name = name[1:]
53 | return name, sequence, qstring
54 |
55 | def add_called_sequence(self, section, name, sequence, qstring):
56 | """ Add basecalled sequence data
57 |
58 | :param section: ['template', 'complement' or '2D']
59 | :param name: The record ID to use for the fastq.
60 | :param sequence: The called sequence.
61 | :param qstring: The quality string.
62 | """
63 | event_group = 'BaseCalled_{}'.format(section)
64 |         if event_group not in self.handle.handle['Analyses/{}'.format(self.group_name)]:
65 | self.handle.add_analysis_subgroup(self.group_name, event_group)
66 | fastq_text = '@{}\n{}\n+\n{}\n'.format(name, sequence, qstring)
67 | fastq_arr = np.array(fastq_text, dtype=str)
68 | self.handle.add_analysis_dataset('{}/{}'.format(self.group_name, event_group), 'Fastq', fastq_arr)
69 |
--------------------------------------------------------------------------------
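A minimal usage sketch for Basecall1DTools, not part of the repository; the filename is hypothetical and the file is assumed to hold a Basecall_1D analysis with template results.

    from ont_fast5_api.analysis_tools.basecall_1d import Basecall1DTools

    with Basecall1DTools('example_read.fast5') as basecall:  # hypothetical path
        # Raises KeyError if the section has no Fastq dataset.
        name, sequence, qstring = basecall.get_called_sequence('template')
        events = basecall.get_event_data('template')  # None if no Events dataset
        print(name, len(sequence), events is not None)
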
/ont_fast5_api/analysis_tools/basecall_2d.py:
--------------------------------------------------------------------------------
1 | """ Helper class for working with 2D basecall type analyses.
2 | """
3 | import warnings
4 | from ont_fast5_api.analysis_tools.basecall_1d import Basecall1DTools
5 |
6 |
7 | class Basecall2DTools(Basecall1DTools):
8 | """ Provides helper methods specific to 2D basecall analyses.
9 | """
10 |
11 | group_id = 'Basecall_2D'
12 | analysis_id = 'basecall_2d'
13 |
14 | def get_prior_alignment(self):
15 | """ Return the prior alignment that was used for 2D basecalling.
16 |
17 | :return: Alignment data table.
18 | """
19 | data_group = '{}/HairpinAlign'.format(self.group_name)
20 | data = self.handle.get_analysis_dataset(data_group, 'Alignment')
21 | return data
22 |
23 | def get_2d_call_alignment(self):
24 | """ Return the alignment and model_states from the 2D basecall.
25 |
26 | :return: Alignment data table.
27 | """
28 | data_group = '{}/BaseCalled_2D'.format(self.group_name)
29 | data = self.handle.get_analysis_dataset(data_group, 'Alignment')
30 | return data
31 |
32 | def add_prior_alignment(self, data):
33 |         """ Add the prior alignment data used for 2D basecalling.
34 |
35 | :param data: Alignment table to be written.
36 | """
37 | path = 'Analyses/{}'.format(self.group_name)
38 | if 'HairpinAlign' not in self.handle.handle[path]:
39 | self.handle.add_analysis_subgroup(self.group_name, 'HairpinAlign')
40 |
41 | path = '{}/HairpinAlign'.format(self.group_name)
42 | self.handle.add_analysis_dataset(path, 'Alignment', data)
43 |
44 | def add_2d_call_alignment(self, data):
45 |         """ Add the alignment and model_state data table.
46 |
47 | :param data: Alignment and model_state table to be written.
48 | """
49 | path = 'Analyses/{}'.format(self.group_name)
50 | if 'BaseCalled_2D' not in self.handle.handle[path]:
51 | self.handle.add_analysis_subgroup(self.group_name, 'BaseCalled_2D')
52 |
53 | path = '{}/BaseCalled_2D'.format(self.group_name)
54 | self.handle.add_analysis_dataset(path, 'Alignment', data)
55 |
56 | def get_called_sequence(self, section=None, fastq=False):
57 |         """ Return the called sequence data, if present.
58 | :param section: ['template', 'complement' or '2D']
59 | :param fastq: If True, return a single, multiline fastq string. If
60 | False, return a tuple of (name, sequence, qstring).
61 | :return: Either the fastq string or the (name, sequence, qstring) tuple.
62 | """
63 | if section != "2D":
64 | warnings.warn("Basecall2DTools.get_called_sequence() should specify section='2D'", DeprecationWarning)
65 | # Backwards compatibilty to 0.3.3, if no "2D" section, bump args by 1 and pass to super
66 |             if section is None:
67 | # We assume that a named arg or no-arg was given
68 | return super(Basecall2DTools, self).get_called_sequence("2D", fastq)
69 | # We assume that a single unnamed arg was given for fastq
70 | return super(Basecall2DTools, self).get_called_sequence("2D", section)
71 | return super(Basecall2DTools, self).get_called_sequence(section, fastq)
72 |
--------------------------------------------------------------------------------
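A minimal usage sketch for Basecall2DTools, not part of the repository; the filename is hypothetical and the file is assumed to hold a Basecall_2D analysis. Passing section='2D' explicitly avoids the deprecation path handled above.

    from ont_fast5_api.analysis_tools.basecall_2d import Basecall2DTools

    with Basecall2DTools('example_read.fast5') as basecall_2d:  # hypothetical path
        name, sequence, qstring = basecall_2d.get_called_sequence('2D')
        hairpin_alignment = basecall_2d.get_prior_alignment()  # None if absent
        call_alignment = basecall_2d.get_2d_call_alignment()   # None if absent
        print(name, len(sequence))
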
/ont_fast5_api/analysis_tools/event_detection.py:
--------------------------------------------------------------------------------
1 | """ Helper class for working with event detection type analyses.
2 | """
3 | import numpy as np
4 |
5 | from ont_fast5_api.analysis_tools.base_tool import BaseTool
6 |
7 |
8 | class EventDetectionTools(BaseTool):
9 | """ Provides helper methods specific to event detection analyses.
10 | """
11 |
12 | group_id = 'EventDetection'
13 | analysis_id = 'event_detection'
14 |
15 | def set_event_data(self, data, read_attrs):
16 |         """ Set event data with the specified attributes.
17 |
18 | :param data: Event data table.
19 | :param read_attrs: Attributes to put on the read group. This must include
20 | the read_number, which must refer to a read present in the object. The
21 | attributes should not include the standard read attributes:
22 |
23 | * read_id
24 | * start_time
25 | * duration
26 | * start_mux
27 |
28 | Those will be pulled from the read information already present in the
29 | object for the specified read.
30 | """
31 | if self.handle.mode == 'r':
32 | raise Exception('File is not open for writing.')
33 | read_number = read_attrs['read_number']
34 | read_group = '{}/Reads/Read_{}'.format(self.group_name, read_number)
35 | read_info = self.handle.status.read_info
36 | read_number_map = self.handle.status.read_number_map
37 | index = read_number_map.get(read_number)
38 | if index is None:
39 | raise Exception('Cannot add event detection data for a read that does not exist.')
40 | info = read_info[index]
41 | read_attrs.update({'read_id': info.read_id,
42 | 'start_time': info.start_time,
43 | 'duration': info.duration,
44 | 'start_mux': info.start_mux,
45 | 'median_before': info.median_before})
46 | attrs = self.handle.get_analysis_attributes(read_group)
47 | if attrs is None:
48 | self.handle.add_analysis_subgroup(self.group_name, 'Reads/Read_{}'.format(read_number),
49 | attrs=read_attrs)
50 | self.handle.add_analysis_dataset(read_group, 'Events', data)
51 | else:
52 | raise Exception('Event detection data already exists for this analysis and read.')
53 |
54 | def get_event_data(self, read_number=None, time_in_seconds=False):
55 | """ Get event data for the specified (or only) read.
56 |
57 | :param read_number: The read number to grab event data for. If this
58 | is None, and there is only one read, it will grab event data for
59 | that read.
60 | :param time_in_seconds: If True, this will convert (if necessary) the
61 | start and length fields from samples to seconds. If they are already
62 | in seconds, this option has no effect.
63 | :return: A tuple containing the event data, and the read attributes.
64 | """
65 | read_info = self.handle.status.read_info
66 | if read_number is None:
67 | if len(read_info) != 1:
68 | raise Exception('Must specify a read number if there is not exactly 1 read.')
69 | read_number = read_info[0].read_number
70 | else:
71 | read_numbers = [info.read_number for info in read_info]
72 | if read_number not in read_numbers:
73 | raise Exception('Specified read does not exist.')
74 | group = '{}/Reads/Read_{}'.format(self.group_name, read_number)
75 | attrs = self.handle.get_analysis_attributes(group)
76 | dataset = self.handle.get_analysis_dataset(group, 'Events', skip_decoding=True)
77 | if dataset is None:
78 | raise Exception('Read number {} has no event data.'.format(read_number))
79 | if time_in_seconds and dataset['start'].dtype.kind in ['i', 'u']:
80 | channel_info = self.handle.get_channel_info()
81 | sample_size = 1.0 / channel_info['sampling_rate']
82 | descr = [(x[0], 'float64') if x[0] in ('start', 'length') else x
83 | for x in dataset.dtype.descr]
84 | data = dataset.astype(np.dtype(descr))[()]
85 | data['start'] *= sample_size
86 | data['length'] *= sample_size
87 | else:
88 | data = dataset[()]
89 | return data, attrs
90 |
91 | def has_event_data(self, read_number=None):
92 | """ Find out if the specified (or only) read has event data.
93 |
94 | :param read_number: The read number to check for event data. If this
95 | is ``None``, and there is only one read, it will check that read.
96 | :returns: True if event data exists for the read number.
97 | """
98 | read_info = self.handle.status.read_info
99 | if read_number is None:
100 | if len(read_info) != 1:
101 | raise Exception('Must specify a read number if there is not exactly 1 read.')
102 | read_number = read_info[0].read_number
103 | else:
104 | read_numbers = [info.read_number for info in read_info]
105 | if read_number not in read_numbers:
106 | raise Exception('Specified read does not exist.')
107 | group = '{}/Reads/Read_{}'.format(self.group_name, read_number)
108 | dataset = self.handle.get_analysis_dataset(group, 'Events', skip_decoding=True)
109 | return dataset is not None
110 |
111 | ##########################
112 | #
113 | # Private methods below
114 | #
115 | ##########################
116 |
117 | def _new_analysis(self, meta, config):
118 | if self.handle.mode == 'r':
119 | raise Exception('Cannot create new event detection group. File is not open for writing.')
120 | self.handle.add_analysis('event_detection', self.group_name, meta, config)
121 | self.handle.add_analysis_subgroup(self.group_name, 'Reads')
122 |
--------------------------------------------------------------------------------
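A minimal usage sketch for EventDetectionTools, not part of the repository; the filename is hypothetical and the file is assumed to contain exactly one read with event detection data, so the read number can be omitted.

    from ont_fast5_api.analysis_tools.event_detection import EventDetectionTools

    with EventDetectionTools('example_read.fast5') as event_detect:  # hypothetical path
        if event_detect.has_event_data():
            # start/length are converted from samples to seconds on the fly.
            data, attrs = event_detect.get_event_data(time_in_seconds=True)
            print(len(data), attrs)
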
/ont_fast5_api/analysis_tools/segmentation.py:
--------------------------------------------------------------------------------
1 | """ Helper class for working with segmentation type analyses.
2 | """
3 | import numpy as np
4 |
5 | from ont_fast5_api.analysis_tools.base_tool import BaseTool
6 | from ont_fast5_api.analysis_tools.event_detection import EventDetectionTools
7 |
8 |
9 | class SegmentationTools(BaseTool):
10 | """ Provides helper methods specific to segmentation analyses.
11 | """
12 | group_id = 'Segmentation'
13 | analysis_id = 'segmentation'
14 |
15 | def get_results(self):
16 | """ Returns the segmentation summary data.
17 |
18 | This data is normalized, to eliminate differences in what is stored
19 | for different types of segmentation analyses.
20 |
21 | The following fields are output:
22 |
23 | * has_template - True if the segmentation found template data.
24 | * has_complement - True if the segmentation found complement data.
25 | * first_sample_template - The first sample of the template data in
26 | the raw data. Only present if has_template is True.
27 | * duration_template - The duration (in samples) of the template
28 | data. Only present if has_template is True.
29 | * first_sample_complement - The first sample of the complement data
30 | in the raw data. Only present if has_complement is True.
31 | * duration_complement - The duration (in samples) of the complement
32 | data. Only present if has_complement is True.
33 |
34 | """
35 | summary = self._get_summary_data()
36 | if summary is None:
37 | results = {'has_template': False,
38 | 'has_complement': False}
39 | else:
40 | results = {}
41 | if 'has_template' in summary:
42 | results['has_template'] = bool(summary['has_template'])
43 | else:
44 | results['has_template'] = True if summary['num_temp'] > 0 else False
45 | if 'has_complement' in summary:
46 | results['has_complement'] = bool(summary['has_complement'])
47 | else:
48 | results['has_complement'] = True if summary['num_comp'] > 0 else False
49 | need_raw_info = False
50 | if results['has_template']:
51 | if 'start_index_temp' in summary:
52 | summary['start_event_template'] = summary['start_index_temp']
53 | summary['end_event_template'] = summary['end_index_temp']
54 | if 'first_sample_template' not in summary:
55 | need_raw_info = True
56 | if results['has_complement']:
57 | if 'start_index_comp' in summary:
58 | summary['start_event_complement'] = summary['start_index_comp']
59 | summary['end_event_complement'] = summary['end_index_comp']
60 | if 'first_sample_complement' not in summary:
61 | need_raw_info = True
62 | if need_raw_info:
63 | self._get_raw_info(summary)
64 | if results['has_template']:
65 | results['first_sample_template'] = summary['first_sample_template']
66 | results['duration_template'] = summary['duration_template']
67 | if 'start_event_template' in summary:
68 | results['start_event_template'] = summary['start_event_template']
69 | results['end_event_template'] = summary['end_event_template']
70 | if results['has_complement']:
71 | results['first_sample_complement'] = summary['first_sample_complement']
72 | results['duration_complement'] = summary['duration_complement']
73 | if 'start_event_complement' in summary:
74 | results['start_event_complement'] = summary['start_event_complement']
75 | results['end_event_complement'] = summary['end_event_complement']
76 | return results
77 |
78 | def get_event_data(self, section, time_in_seconds=False):
79 | """ Get the template or complement event data.
80 |
81 | :param section: Either template, complement, or both.
82 | :param time_in_seconds: Return the start and length fields
83 | in seconds, rather than samples.
84 | :return: The event dataset for the section. If section=both
85 | then it returns a tuple with both sections. Returns None
86 | if the section does not exist.
87 | """
88 | if section not in ['template', 'complement', 'both']:
89 | raise Exception('Unrecognized section: {}. Expected: "template", "complement" or "both"'.format(section))
90 | results = self.get_results()
91 | if results is None:
92 | return (None, None) if section == 'both' else None
93 | if section == 'both':
94 | sections = ['template', 'complement']
95 | else:
96 | sections = [section]
97 | evdet_group, _ = self._find_event_data()
98 | with EventDetectionTools(self.handle, group_name=evdet_group) as evdet:
99 | event_data, _ = evdet.get_event_data(time_in_seconds=time_in_seconds)
100 | datasets = [None, None]
101 | for n, this_section in enumerate(sections):
102 | if not results['has_{}'.format(this_section)]:
103 | continue
104 | ev1 = results['start_event_{}'.format(this_section)]
105 | ev2 = results['end_event_{}'.format(this_section)]
106 | datasets[n] = event_data[ev1:ev2]
107 | if section == 'both':
108 | return tuple(datasets)
109 | return datasets[0]
110 |
111 | def get_raw_data(self, section, scale=False):
112 | """ Get the template or complement raw data.
113 |
114 | :param section: Either template, complement, or both.
115 | :param scale: Scale the raw data to pA.
116 | :return: The raw data for the section. If section=both
117 | then it returns a tuple with both sections. Returns None
118 | if the section does not exist.
119 | """
120 | results = self.get_results()
121 | datasets = [None, None]
122 | if section == 'both':
123 | sections = ['template', 'complement']
124 | else:
125 | sections = [section]
126 | for n, this_section in enumerate(sections):
127 | if not results['has_{}'.format(this_section)]:
128 | continue
129 | start = results['first_sample_{}'.format(this_section)]
130 | dur = results['duration_{}'.format(this_section)]
131 | datasets[n] = self.handle.get_raw_data(start=start, end=start+dur, scale=scale)
132 | if section == 'both':
133 | return tuple(datasets)
134 | return datasets[0]
135 |
136 |
137 | ##########################
138 | #
139 | # Private methods below
140 | #
141 | ##########################
142 |
143 | def _get_summary_data(self):
144 | summary = self.handle.get_summary_data(self.group_name)
145 | if summary is None:
146 | return None
147 | if 'segmentation' in summary:
148 | results = summary['segmentation']
149 | elif 'split_hairpin' in summary:
150 | results = summary['split_hairpin']
151 | else:
152 | results = None
153 | return results
154 |
155 | def _find_event_data(self):
156 | attrs = self.handle.get_analysis_attributes(self.group_name)
157 | evdet_group = attrs.get('event_detection')
158 | if evdet_group is None:
159 | evdet_group = self.handle.get_latest_analysis('EventDetection')
160 | else:
161 | evdet_group = evdet_group[9:]  # strip the leading 'Analyses/' prefix
162 | if evdet_group is None:
163 | return None, None
164 | # We directly use the Fast5Read interface here, rather than the
165 | # EventDetectionTools one, because we don't want to load the entire
166 | # event table into memory.
167 | read_info = self.handle.status.read_info[0] # We assume only one read.
168 | read_number = read_info.read_number
169 | event_table_group = '{}/Reads/Read_{}'.format(evdet_group, read_number)
170 | dataset = self.handle.get_analysis_dataset(event_table_group, 'Events', skip_decoding=True)
171 | return evdet_group, dataset
172 |
173 | def _get_raw_info(self, summary):
174 | _, dataset = self._find_event_data()
175 | read_info = self.handle.status.read_info[0] # We assume only one read.
176 | if dataset is None:
177 | summary['first_sample_template'] = None
178 | summary['duration_template'] = None
179 | summary['first_sample_complement'] = None
180 | summary['duration_complement'] = None
181 | return
182 | if summary.get('start_event_template', -1) >= 0:
183 | ev1 = summary['start_event_template']
184 | ev2 = summary['end_event_template']
185 | summary['first_sample_template'] = dataset[ev1]['start'] - read_info.start_time
186 | end = dataset[ev2-1]['start'] + dataset[ev2-1]['length'] - read_info.start_time
187 | summary['duration_template'] = end - summary['first_sample_template']
188 | if summary.get('start_event_complement', -1) >= 0:
189 | ev1 = summary['start_event_complement']
190 | ev2 = summary['end_event_complement']
191 | summary['first_sample_complement'] = dataset[ev1]['start'] - read_info.start_time
192 | end = dataset[ev2-1]['start'] + dataset[ev2-1]['length'] - read_info.start_time
193 | summary['duration_complement'] = end - summary['first_sample_complement']
194 |
--------------------------------------------------------------------------------
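A minimal usage sketch for SegmentationTools, assuming a single-read file at 'single_read.fast5' containing a segmentation analysis group named 'Segmentation_000' (both hypothetical):

    from ont_fast5_api.fast5_file import Fast5File
    from ont_fast5_api.analysis_tools.segmentation import SegmentationTools

    # Hypothetical file path and analysis group name
    with Fast5File("single_read.fast5", mode='r') as f5:
        with SegmentationTools(f5, group_name="Segmentation_000") as seg:
            results = seg.get_results()
            if results['has_template']:
                # Event slice and pA-scaled raw signal for the template section
                template_events = seg.get_event_data('template')
                template_raw = seg.get_raw_data('template', scale=True)
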
/ont_fast5_api/compression_settings.py:
--------------------------------------------------------------------------------
1 | import pkg_resources
2 |
3 |
4 | def register_plugin():
5 | plugin_path = pkg_resources.resource_filename('ont_fast5_api', 'vbz_plugin')
6 | try:
7 | from h5py import h5pl
8 | h5pl.prepend(bytes(plugin_path, 'UTF-8'))
9 | except (ImportError, AttributeError):
10 | # We don't have the plugin library in h5py<2.10 so we fall back on an environment variable
11 | import os
12 | os.environ['HDF5_PLUGIN_PATH'] = plugin_path
13 | return plugin_path
14 |
15 |
16 | class AbstractCompression:
17 | compression = "AbstractCompression"
18 | compression_opts = ()
19 | shuffle = False
20 | scaleoffset = False
21 | fletcher32 = False
22 |
23 | def __repr__(self):
24 | return self.compression
25 |
26 | @property
27 | def filter_settings(self):
28 | return {}
29 |
30 |
31 | class VbzCompressionV1Alpha(AbstractCompression):
32 | def __init__(self):
33 | self.compression = 32020 # https://portal.hdfgroup.org/display/support/Registered+Filters
34 | self.compression_opts = (1, 2, 1, 1) # VBZ_VERSION, VBZ_PACKING, VBZ_ZIG_ZAG, VBZ_ZSTD_COMPRESSION
35 |
36 | def __repr__(self):
37 | return "vbz_v1.alpha"
38 |
39 | @property
40 | def filter_settings(self):
41 | return {str(self.compression): self.compression_opts}
42 |
43 |
44 | class VbzCompression(AbstractCompression):
45 | def __init__(self):
46 | self.compression = 32020 # https://portal.hdfgroup.org/display/support/Registered+Filters
47 | self.compression_opts = (0, 2, 1, 1) # VBZ_VERSION, VBZ_PACKING, VBZ_ZIG_ZAG, VBZ_ZSTD_COMPRESSION
48 |
49 | def __repr__(self):
50 | return "vbz"
51 |
52 | @property
53 | def filter_settings(self):
54 | return {str(self.compression): self.compression_opts}
55 |
56 |
57 | class GzipCompression(AbstractCompression):
58 | def __init__(self):
59 | self.compression = "gzip"
60 | self.compression_opts = 1
61 |
62 | @property
63 | def filter_settings(self):
64 | return {str(self.compression): self.compression_opts}
65 |
66 |
67 | VBZ_ERROR_MESSAGE = "Failed to read compressed raw data. " \
68 | "VBZ compression filter (id=32020) may be missing from expected path: '{}'"
69 |
70 |
71 | def raise_missing_vbz_error_read(err):
72 | if str(VBZ.compression) in str(err):
73 | raise IOError(VBZ_ERROR_MESSAGE.format(register_plugin())) from err
74 | # If we don't see anything relating to VBZ just raise the existing error without additional info
75 | raise
76 |
77 |
78 | def raise_missing_vbz_error_write(err):
79 | if type(err) is OSError and "Can't read data" in str(err):
80 | raise IOError(VBZ_ERROR_MESSAGE.format(register_plugin())) from err
81 | # If we don't see anything relating to VBZ just raise the existing error without additional info
82 | raise
83 |
84 |
85 | VBZ_ALPHA = VbzCompressionV1Alpha()
86 | VBZ = VbzCompression()
87 | GZIP = GzipCompression()
88 |
89 | COMPRESSION_MAP = {str(comp): comp for comp in (VBZ_ALPHA, VBZ, GZIP)}
90 |
--------------------------------------------------------------------------------
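A short sketch of how the compression settings above are typically consumed, using only names defined in this module:

    from ont_fast5_api.compression_settings import VBZ, GZIP, COMPRESSION_MAP, register_plugin

    plugin_path = register_plugin()         # make sure HDF5 can locate the bundled VBZ filter
    print(VBZ)                              # vbz
    print(VBZ.filter_settings)              # {'32020': (0, 2, 1, 1)}
    print(COMPRESSION_MAP["gzip"] is GZIP)  # True
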
/ont_fast5_api/conversion_tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/ont_fast5_api/conversion_tools/__init__.py
--------------------------------------------------------------------------------
/ont_fast5_api/conversion_tools/check_file_compression.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 |
3 | from ont_fast5_api.compression_settings import COMPRESSION_MAP
4 | from ont_fast5_api.conversion_tools.conversion_utils import yield_fast5_files
5 | from ont_fast5_api.fast5_interface import get_fast5_file
6 |
7 |
8 | def check_read_compression(read):
9 | """
10 | Check the compression type on the raw data of a read
11 | :param read: Fast5Read object
12 | :return: AbstractCompression subclass instance, or the raw filter settings if the compression type is not recognised
13 | """
14 | detected_compression = read.raw_compression_filters
15 | for compression in COMPRESSION_MAP.values():
16 | if compression.filter_settings == detected_compression:
17 | return compression
18 | return detected_compression
19 |
20 |
21 | def check_compression(input_path, recursive, follow_symlinks, check_all_reads):
22 | """
23 | Check the compression type of the raw data in files in a folder
24 | :param input_path: Path to a fast5 file or directory of fast5 files
25 | :param recursive: bool - search recursively through sub-folders for fast5 files
26 | :param follow_symlinks: bool - follow symlinks when searching recursively
27 | :param check_all_reads: bool - check all reads in a file or just the first
28 | :return: yields (Compression, read_id, file_path) tuples
29 | """
30 | for input_file in yield_fast5_files(input_path, recursive, follow_symlinks):
31 | with get_fast5_file(input_file, 'r') as f5:
32 | for read in f5.get_reads():
33 | compression = check_read_compression(read)
34 | yield (compression, read.read_id, input_file)
35 | if not check_all_reads:
36 | break
37 |
38 |
39 | def main():
40 | parser = ArgumentParser("Tool for checking the compression type of raw data in fast5 files")
41 | parser.add_argument('-i', '--input_path', required=True,
42 | help="Path to Fast5 file or directory of Fast5 files")
43 | parser.add_argument('--check_all_reads', action='store_true', required=False, default=False,
44 | help="Check all reads in a file individually (default: check only the first read)")
45 | parser.add_argument('-r', '--recursive', action='store_true', required=False, default=False,
46 | help="Search recursively through folders for MultiRead fast5 files")
47 | parser.add_argument('--ignore_symlinks', action='store_true',
48 | help="Ignore symlinks when searching recursively for fast5 files")
49 | parser.add_argument('--file_list', required=False,
50 | help="File containing names of files to search in")
51 | args = parser.parse_args()
52 | compression_results = check_compression(args.input_path, args.recursive, not args.ignore_symlinks,
53 | args.check_all_reads)
54 | for result in compression_results:
55 | print(result)
56 |
57 |
58 | if __name__ == '__main__':
59 | main()
60 |
--------------------------------------------------------------------------------
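Besides the check_compression console script, the generator above can be called directly. A minimal sketch, assuming a hypothetical directory 'fast5_dir/' of fast5 files:

    from ont_fast5_api.conversion_tools.check_file_compression import check_compression

    # 'fast5_dir/' is an assumed example path
    for compression, read_id, file_path in check_compression("fast5_dir/", recursive=True,
                                                             follow_symlinks=True, check_all_reads=False):
        print(compression, read_id, file_path)
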
/ont_fast5_api/conversion_tools/compress_fast5.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import shutil
4 | from argparse import ArgumentParser, ArgumentError
5 | from multiprocessing.pool import Pool
6 |
7 | from ont_fast5_api import __version__
8 | from ont_fast5_api.compression_settings import COMPRESSION_MAP
9 | from ont_fast5_api.conversion_tools.conversion_utils import get_fast5_file_list, get_progress_bar
10 | from ont_fast5_api.fast5_file import Fast5File, EmptyFast5
11 | from ont_fast5_api.fast5_read import copy_attributes
12 | from ont_fast5_api.fast5_interface import is_multi_read
13 | from ont_fast5_api.multi_fast5 import MultiFast5File
14 | from ont_fast5_api.static_data import OPTIONAL_READ_GROUPS
15 |
16 |
17 | def compress_batch(input_folder, output_folder, target_compression, recursive=True, threads=1, follow_symlinks=True,
18 | in_place=False, sanitize=False):
19 | # We require an absolute input path so we can replicate the data structure relative to it later on
20 | input_folder = os.path.abspath(input_folder)
21 |
22 | file_list = get_fast5_file_list(input_folder, recursive, follow_symlinks=follow_symlinks)
23 | if len(file_list) == 0:
24 | raise ValueError("No input fast5 files found in '{}'. Recursive={}".format(input_folder, recursive))
25 |
26 | # Set up the process pool and the progressbar
27 | pool = Pool(min(threads, len(file_list)))
28 | pbar = get_progress_bar(len(file_list))
29 |
30 | def update(result):
31 | if in_place and result is not None:
32 | input_path, output_path = result
33 | shutil.move(output_path, input_path)
34 | pbar.update(pbar.currval + 1)
35 |
36 | for input_file in file_list:
37 | input_path = os.path.join(input_folder, input_file)
38 | if in_place:
39 | output_path = input_path + ".tmp.compressed"
40 | else:
41 | output_path = os.path.join(output_folder, os.path.relpath(input_path, input_folder))
42 |
43 | pool.apply_async(func=compress_file,
44 | args=(input_path, output_path, target_compression, sanitize),
45 | callback=update)
46 |
47 | # Tear down the process pool and pbar. We can't use contextmanagers since we need to close() then join()
48 | pool.close()
49 | pool.join()
50 | pbar.finish()
51 |
52 |
53 | def compress_file(input_file, output_file, target_compression, sanitize=False):
54 | try:
55 | os.makedirs(os.path.dirname(output_file), exist_ok=True)
56 | if is_multi_read(input_file):
57 | with MultiFast5File(input_file, 'r') as input_f5, MultiFast5File(output_file, 'a') as output_f5:
58 | for read in input_f5.get_reads():
59 | output_f5.add_existing_read(read, target_compression, sanitize=sanitize)
60 | else:
61 | with Fast5File(input_file, 'r') as input_f5, \
62 | EmptyFast5(output_file, 'a') as output_f5:
63 | compress_single_read(output_f5, input_f5, target_compression, sanitize=sanitize)
64 | except Exception as e:
65 | # Errors raised in Pool.apply_async are lost unless handled, so we explicitly log them here.
66 | logging.exception(e)
67 | raise
68 | return (input_file, output_file)
69 |
70 |
71 | def compress_single_read(output_f5, read_to_copy, target_compression, sanitize=False):
72 | read_id = read_to_copy.get_read_id()
73 | raw_dataset_name = read_to_copy.raw_dataset_name
74 | raw_group_name = read_to_copy.raw_dataset_group_name
75 | read_name = "read_" + read_id
76 | # Recreating the status object is painful; it doesn't actually interact with the file, so we can just reference the original.
77 | output_f5.status = read_to_copy.status
78 |
79 | if str(target_compression) in read_to_copy.raw_compression_filters:
80 | # If we have the right compression then no need for doing anything fancy
81 | output_f5.handle.copy(read_to_copy.handle, read_name)
82 | else:
83 | copy_attributes(read_to_copy.handle.attrs, output_f5.handle)
84 | for subgroup in read_to_copy.handle:
85 | if subgroup not in raw_dataset_name:
86 | if sanitize and subgroup in OPTIONAL_READ_GROUPS:
87 | # skip optional groups when sanitizing
88 | continue
89 | output_f5.handle.copy(read_to_copy.handle[subgroup], subgroup)
90 | else:
91 | raw_attrs = read_to_copy.handle[raw_group_name].attrs
92 | raw_data = read_to_copy.handle[raw_dataset_name]
93 | output_f5.add_raw_data(raw_data, raw_attrs, compression=target_compression)
94 |
95 |
96 | def main():
97 | parser = ArgumentParser("Tool for changing the compression of Fast5 files")
98 | parser.add_argument('-i', '--input_path', required=True,
99 | help='Folder containing fast5 files')
100 |
101 | output_group = parser.add_mutually_exclusive_group(required=True)
102 | save_arg = output_group.add_argument('-s', '--save_path', default=None,
103 | help="Folder to output fast5 read files to")
104 | output_group.add_argument('--in_place', action='store_true',
105 | help='Replace the old files with new files in place')
106 |
107 | parser.add_argument('-c', '--compression', required=True, choices=list(COMPRESSION_MAP.keys()),
108 | help="Target output compression type")
109 | parser.add_argument('--sanitize', action='store_true',
110 | help="Clean output files of optional groups and datasets (e.g. 'Analyses')")
111 | parser.add_argument('-t', '--threads', type=int, default=1, required=False,
112 | help="Maximum number of threads to use")
113 | parser.add_argument('--recursive', action='store_true',
114 | help="Search recursively through folders for single_read fast5 files")
115 | parser.add_argument('--ignore_symlinks', action='store_true',
116 | help="Ignore symlinks when searching recursively for fast5 files")
117 | parser.add_argument('-v', '--version', action='version', version=__version__)
118 | args = parser.parse_args()
119 |
120 | if args.input_path == args.save_path:
121 | raise ArgumentError(save_arg, "--input_path and --save_path must be different locations, or use --in_place")
122 | if args.sanitize and args.save_path is None:
123 | raise ArgumentError(save_arg, "--save_path must be given if using --sanitize")
124 |
125 | compress_batch(input_folder=args.input_path,
126 | output_folder=args.save_path,
127 | target_compression=COMPRESSION_MAP[args.compression],
128 | threads=args.threads,
129 | recursive=args.recursive,
130 | follow_symlinks=not args.ignore_symlinks,
131 | in_place=args.in_place,
132 | sanitize=args.sanitize)
133 |
134 |
135 | if __name__ == '__main__':
136 | main()
137 |
--------------------------------------------------------------------------------
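The batch entry point above can also be used programmatically. A minimal sketch, assuming hypothetical input and output folders and recompression to VBZ:

    from ont_fast5_api.compression_settings import VBZ
    from ont_fast5_api.conversion_tools.compress_fast5 import compress_batch

    # 'fast5_in/' and 'fast5_vbz/' are assumed example paths
    compress_batch(input_folder="fast5_in/", output_folder="fast5_vbz/",
                   target_compression=VBZ, threads=4, recursive=True)
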
/ont_fast5_api/conversion_tools/demux_fast5.py:
--------------------------------------------------------------------------------
1 | """
2 | Script for binning fast5 reads into separate directories based on a column value in a summary file.
3 | Intended for demultiplexing reads using a barcoding summary file.
4 | """
5 | from pathlib import Path
6 | from typing import Union, Dict, Set, List
7 | from multiprocessing import Pool
8 | import logging
9 | from csv import reader
10 | from collections import defaultdict
11 | from time import sleep
12 | from math import ceil
13 | from argparse import ArgumentParser
14 |
15 | from ont_fast5_api.compression_settings import COMPRESSION_MAP
16 | from ont_fast5_api.conversion_tools.conversion_utils import (
17 | get_fast5_file_list,
18 | get_progress_bar,
19 | Fast5FilterWorker,
20 | READS_PER_FILE,
21 | FILENAME_BASE,
22 | ProgressBar,
23 | )
24 |
25 | DEMULTIPLEX_COLUMN = "barcode_arrangement"
26 | READ_ID_COLUMN = "read_id"
27 |
28 |
29 | class Fast5Demux:
30 | """
31 | Bin reads from directory of fast5 files according to demultiplex_column in sequencing_summary path
32 | :param input_dir: Path to input Fast5 file or directory of Fast5 files
33 | :param output_dir: Path to output directory
34 | :param summary_file: Path to TSV summary file
35 | :param demultiplex_column: str name of column with demultiplex values
36 | :param read_id_column: str name of column with read ids
37 | :param filename_base: str prefix for output Fast5 files
38 | :param batch_size: int maximum number of reads per output file
39 | :param threads: int maximum number of worker processes
40 | :param recursive: bool flag to search recursively through input_dir for Fast5 files
41 | :param follow_symlinks: bool flag to follow symlinks in input_dir
42 | :param target_compression: str compression type in output Fast5 files
43 | """
44 |
45 | def __init__(
46 | self,
47 | input_dir: Path,
48 | output_dir: Path,
49 | summary_file: Path,
50 | demultiplex_column: str,
51 | read_id_column: str = READ_ID_COLUMN,
52 | filename_base: str = FILENAME_BASE,
53 | batch_size: int = READS_PER_FILE,
54 | threads: int = 1,
55 | recursive: bool = False,
56 | follow_symlinks: bool = True,
57 | target_compression: Union[str, None] = None,
58 | ):
59 | self.input_dir = input_dir
60 | self.output_dir = output_dir
61 | self.summary = summary_file
62 | self.demultiplex_column = demultiplex_column
63 | self.read_id_column = read_id_column
64 | self.filename_base = filename_base
65 | self.batch_size = batch_size
66 | self.threads = threads
67 | self.recursive = recursive
68 | self.follow_symlinks = follow_symlinks
69 | self.target_compression = target_compression
70 |
71 | self.read_sets: Dict[str, Set[str]] = {}
72 | self.input_fast5s: List[Path] = []
73 | self.max_threads: int = 0
74 | self.workers: List = []
75 | self.progressbar: Union[ProgressBar, None] = None
76 | self.logger: logging.Logger = logging.getLogger(self.__class__.__name__)
77 |
78 | def create_output_dirs(self) -> None:
79 | """
80 | Create a subdirectory in the output directory for each demux category
81 | :return:
82 | """
83 | self.output_dir.mkdir(parents=True, exist_ok=True)
84 | for demux in self.read_sets:
85 | out_dir = self.output_dir / demux
86 | out_dir.mkdir(exist_ok=True)
87 |
88 | def run_batch(self) -> None:
89 | """
90 | Run workers in pool or sequentially
91 | Starts multiprocessing pool if max_threads allows it
92 | :return:
93 | """
94 | self.workers_setup()
95 |
96 | if self.max_threads > 1:
97 | with Pool(self.max_threads) as pool:
98 | for worker in self.workers:
99 | worker.run_batch(pool=pool)
100 | while any(worker.tasks for worker in self.workers):
101 | sleep(1)
102 |
103 | pool.close()
104 | pool.join()
105 | else:
106 | for worker in self.workers:
107 | worker.run_batch(pool=None)
108 |
109 | self.progressbar.finish()
110 |
111 | def workers_setup(self) -> None:
112 | """
113 | Parse input summary and input file list to determine amount of work
114 | Create output directories and initialise workers
115 | :return:
116 | """
117 | self.read_sets = self.parse_summary_demultiplex()
118 | self.input_fast5s = get_fast5_file_list(
119 | input_path=self.input_dir,
120 | recursive=self.recursive,
121 | follow_symlinks=self.follow_symlinks,
122 | )
123 | self.max_threads = self.calculate_max_threads()
124 | # progressbar length is the total number of reads to be extracted plus the total number of files to be read
125 | total_progress = sum(len(item) for item in self.read_sets.values()) + (
126 | len(self.input_fast5s) * len(self.read_sets)
127 | )
128 | self.progressbar = get_progress_bar(num_reads=total_progress)
129 | self.create_output_dirs()
130 | for demux in sorted(self.read_sets):
131 | self.workers.append(
132 | Fast5FilterWorker(
133 | input_file_list=self.input_fast5s,
134 | output_dir=self.output_dir / demux,
135 | read_set=self.read_sets[demux],
136 | progressbar=self.progressbar,
137 | logger=self.logger,
138 | filename_base=self.filename_base,
139 | batch_size=self.batch_size,
140 | target_compression=self.target_compression,
141 | )
142 | )
143 |
144 | def report(self) -> None:
145 | """
146 | Log summary of work done
147 | :return:
148 | """
149 | total_reads = 0
150 | for idx, _ in enumerate(sorted(self.read_sets)):
151 | worker = self.workers[idx]
152 | for file, reads in worker.out_files.items():
153 | total_reads += len(reads)
154 |
155 | self.logger.info("{} reads extracted".format(total_reads))
156 |
157 | # report reads not found
158 | reads_to_extract = sum(len(item) for item in self.read_sets.values())
159 | if reads_to_extract > total_reads:
160 | self.logger.warning(
161 | "{} reads not found!".format(reads_to_extract - total_reads)
162 | )
163 |
164 | def calculate_max_threads(self) -> int:
165 | """
166 | Calculate max number of workers based on number of output files, input files and threads argument
167 | :return: int
168 | """
169 | max_inputs_per_worker = len(self.input_fast5s)
170 | total_outputs = 0
171 | for read_set in self.read_sets.values():
172 | outputs = int(ceil(len(read_set) / float(self.batch_size)))
173 | total_outputs += min(outputs, max_inputs_per_worker)
174 |
175 | return min(self.threads, total_outputs)
176 |
177 | def parse_summary_demultiplex(self) -> Dict[str, Set[str]]:
178 | """
179 | Open a TSV file and parse read_id and demultiplex columns into dict {demultiplex: read_id_set}
180 | :return:
181 | """
182 | read_sets = defaultdict(set)
183 | with open(str(self.summary), "r") as fh:
184 | read_list_tsv = reader(fh, delimiter="\t")
185 | header = next(read_list_tsv)
186 |
187 | if self.read_id_column in header:
188 | read_id_col_idx = header.index(self.read_id_column)
189 | else:
190 | raise ValueError(
191 | "No '{}' read_id column in header: {}".format(
192 | self.read_id_column, header
193 | )
194 | )
195 |
196 | if self.demultiplex_column in header:
197 | demultiplex_col_idx = header.index(self.demultiplex_column)
198 | else:
199 | raise ValueError(
200 | "No '{}' demultiplex column in header: {}".format(
201 | self.demultiplex_column, header
202 | )
203 | )
204 |
205 | for line in read_list_tsv:
206 | read_id = line[read_id_col_idx]
207 | demux = line[demultiplex_col_idx]
208 | read_sets[demux].add(read_id)
209 |
210 | return read_sets
211 |
212 |
213 | def create_arg_parser():
214 | parser = ArgumentParser(
215 | "Tool for binning reads from a multi_read_fast5_file by column value in summary file"
216 | )
217 | parser.add_argument(
218 | "-i",
219 | "--input",
220 | required=True,
221 | type=Path,
222 | help="Path to Fast5 file or directory of Fast5 files",
223 | )
224 | parser.add_argument(
225 | "-s",
226 | "--save_path",
227 | required=True,
228 | type=Path,
229 | help="Directory to output MultiRead subset to",
230 | )
231 | parser.add_argument(
232 | "-l",
233 | "--summary_file",
234 | required=True,
235 | type=Path,
236 | help="TSV file containing read_id column (sequencing_summary.txt file)",
237 | )
238 | parser.add_argument(
239 | "-f",
240 | "--filename_base",
241 | default="batch",
242 | required=False,
243 | help="Root of output filename, default='{}' -> '{}0.fast5'".format(
244 | FILENAME_BASE, FILENAME_BASE
245 | ),
246 | )
247 | parser.add_argument(
248 | "-n",
249 | "--batch_size",
250 | type=int,
251 | default=READS_PER_FILE,
252 | required=False,
253 | help="Number of reads per multi-read file (default {})".format(READS_PER_FILE),
254 | )
255 | parser.add_argument(
256 | "-t",
257 | "--threads",
258 | type=int,
259 | default=1,
260 | required=False,
261 | help="Maximum number of parallel processes to use (default 1)",
262 | )
263 | parser.add_argument(
264 | "-r",
265 | "--recursive",
266 | action="store_true",
267 | required=False,
268 | default=False,
269 | help="Flag to search recursively through input directory for MultiRead fast5 files",
270 | )
271 | parser.add_argument(
272 | "--ignore_symlinks",
273 | action="store_true",
274 | help="Ignore symlinks when searching recursively for fast5 files",
275 | )
276 | parser.add_argument(
277 | "-c",
278 | "--compression",
279 | required=False,
280 | default=None,
281 | choices=list(COMPRESSION_MAP.keys()) + [None],
282 | help="Target output compression type. If omitted - don't change compression type",
283 | )
284 | parser.add_argument(
285 | "--demultiplex_column",
286 | type=str,
287 | default=DEMULTIPLEX_COLUMN,
288 | required=False,
289 | help="Name of column for demultiplexing in summary file (default '{}'".format(
290 | DEMULTIPLEX_COLUMN
291 | ),
292 | )
293 | parser.add_argument(
294 | "--read_id_column",
295 | type=str,
296 | default=READ_ID_COLUMN,
297 | required=False,
298 | help="Name of read_id column in summary file (default '{}'".format(
299 | READ_ID_COLUMN
300 | ),
301 | )
302 | return parser
303 |
304 |
305 | def main():
306 | parser = create_arg_parser()
307 | args = parser.parse_args()
308 | if args.compression is not None:
309 | args.compression = COMPRESSION_MAP[args.compression]
310 |
311 | demux = Fast5Demux(
312 | input_dir=args.input,
313 | output_dir=args.save_path,
314 | summary_file=args.summary_file,
315 | demultiplex_column=args.demultiplex_column,
316 | read_id_column=args.read_id_column,
317 | filename_base=args.filename_base,
318 | batch_size=args.batch_size,
319 | threads=args.threads,
320 | recursive=args.recursive,
321 | follow_symlinks=not args.ignore_symlinks,
322 | target_compression=args.compression,
323 | )
324 | demux.run_batch()
325 | demux.report()
326 |
327 |
328 | if __name__ == "__main__":
329 | main()
330 |
--------------------------------------------------------------------------------
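A minimal sketch of using Fast5Demux from Python rather than via the demux_fast5 console script; all paths are assumed example values and the summary file is expected to contain 'read_id' and 'barcode_arrangement' columns:

    from pathlib import Path
    from ont_fast5_api.conversion_tools.demux_fast5 import Fast5Demux

    # Hypothetical input directory, output directory and barcoding summary file
    demux = Fast5Demux(input_dir=Path("fast5_in/"), output_dir=Path("demuxed/"),
                       summary_file=Path("barcoding_summary.txt"),
                       demultiplex_column="barcode_arrangement",
                       threads=2, recursive=True)
    demux.run_batch()
    demux.report()
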
/ont_fast5_api/conversion_tools/fast5_subset.py:
--------------------------------------------------------------------------------
1 | """Filter Fast5 files based on read_id list
2 | """
3 | import csv
4 | import logging
5 | from argparse import ArgumentParser
6 | from math import ceil
7 | from multiprocessing import Pool
8 | from os import makedirs, path
9 | from pathlib import Path
10 | from time import sleep
11 |
12 | from ont_fast5_api.compression_settings import COMPRESSION_MAP
13 | from ont_fast5_api.conversion_tools.conversion_utils import get_fast5_file_list, get_progress_bar, Fast5FilterWorker
14 | from ont_fast5_api.conversion_tools.conversion_utils import READS_PER_FILE, FILENAME_BASE
15 |
16 | logging.basicConfig(level=logging.DEBUG)
17 |
18 |
19 | class Fast5Filter:
20 | """
21 | Extract reads listed in read_list_file from fast5 files in input_folder and write them to
22 | multi-read fast5 files in output_folder
23 | """
24 |
25 | def __init__(self, input_folder, output_folder, read_list_file, filename_base=FILENAME_BASE,
26 | batch_size=READS_PER_FILE, threads=1, recursive=False, file_list_file=None, follow_symlinks=True,
27 | target_compression=None):
28 | assert path.isdir(input_folder)
29 | assert path.isfile(read_list_file)
30 | assert isinstance(filename_base, str)
31 | assert isinstance(batch_size, int)
32 | assert isinstance(threads, int)
33 | assert isinstance(recursive, bool)
34 | self.logger = logging.getLogger(self.__class__.__name__)
35 |
36 | self.read_set = parse_summary_file(read_list_file)
37 | self.input_f5s = get_fast5_file_list(str(input_folder), recursive, follow_symlinks=follow_symlinks)
38 | makedirs(output_folder, exist_ok=True)
39 |
40 | if len(self.read_set) < 1:
41 | raise ValueError("No reads in read list file {}".format(read_list_file))
42 |
43 | if len(self.input_f5s) < 1:
44 | raise ValueError(
45 | "No input fast5 files found in {}. Recursion is set to {}".format(str(input_folder), recursive))
46 |
47 | if batch_size < 1:
48 | raise ValueError("Batch size (--batch_size) must be a positive integer, not {}".format(batch_size))
49 |
50 | if threads < 1:
51 | raise ValueError("Max number of threads (--threads) must be a positive integer, not {}".format(threads))
52 |
53 | if file_list_file:
54 | file_set = parse_summary_file(file_list_file)
55 | for file in file_set:
56 | assert path.exists(file), "{} from file list doesn't exist".format(file)
57 | self.input_f5s = list(file_set.intersection(self.input_f5s))
58 |
59 | # determine max number of workers
60 | num_outputs = int(ceil(len(self.read_set) / float(batch_size)))
61 | self.num_workers = min(threads, num_outputs, len(self.input_f5s))
62 |
63 | # progressbar total is number of reads in read_set plus number of input files
64 | # (to see progress while scanning files that don't have any relevant reads)
65 | self.pbar = get_progress_bar(len(self.read_set) + len(self.input_f5s))
66 |
67 | self.worker = Fast5FilterWorker(
68 | input_file_list=self.input_f5s,
69 | output_dir=Path(output_folder),
70 | logger=self.logger,
71 | progressbar=self.pbar,
72 | read_set=self.read_set,
73 | filename_base=filename_base,
74 | batch_size=batch_size,
75 | target_compression=target_compression
76 | )
77 |
78 | def run_batch(self):
79 |
80 | if self.num_workers == 1:
81 | self.worker.run_batch(pool=None)
82 | else:
83 | with Pool(self.num_workers) as pool:
84 | self.worker.run_batch(pool=pool)
85 |
86 | while self.worker.tasks:
87 | sleep(1)
88 |
89 | pool.close()
90 | pool.join()
91 |
92 | self.pbar.finish()
93 | self.logger.info("{} reads extracted".format(sum(len(v) for v in self.worker.out_files.values())))
94 |
95 | # report reads not found
96 | if len(self.worker.read_set) > 0:
97 | self.logger.warning("{} reads not found!".format(len(self.worker.read_set)))
98 |
99 |
100 | def parse_summary_file(read_list_file):
101 | """
102 | Opens a text file and returns a set of read_ids.
103 | Expects either a single-column file where every line is a read_id, or a
104 | multi-column tab-separated file that contains a 'read_id' column.
105 | :param read_list_file: path to file
106 | :return: set
107 | """
108 | reads = set()
109 | with open(str(read_list_file), 'r') as fh:
110 | read_list_tsv = csv.reader(fh, delimiter='\t')
111 | header = next(read_list_tsv)
112 |
113 | if "read_id" in header:
114 | col_idx = header.index("read_id")
115 | else:
116 | if len(header) == 1:
117 | reads.add(header[0].strip())
118 | col_idx = 0
119 | else:
120 | raise TypeError("multi-column file without 'read_id' column")
121 |
122 | for line in read_list_tsv:
123 | reads.add(line[col_idx].strip())
124 | if len(reads) < 1:
125 | raise ValueError("No reads in read list file {}".format(read_list_file))
126 | return reads
127 |
128 |
129 | def main():
130 | parser = ArgumentParser("Tool for extracting reads from a multi_read_fast5_file by read_id")
131 | parser.add_argument('-i', '--input', required=True,
132 | help="Path to Fast5 file or directory of Fast5 files")
133 | parser.add_argument('-s', '--save_path', required=True,
134 | help="Folder to output MultiRead subset to")
135 | parser.add_argument('-l', '--read_id_list', required=True,
136 | help="File containing list of read ids to extract (or sequencing_summary.txt file)")
137 | parser.add_argument('-f', '--filename_base', default=FILENAME_BASE, required=False,
138 | help="Root of output filename, default='{}' -> '{}0.fast5'".format(FILENAME_BASE, FILENAME_BASE))
139 | parser.add_argument('-n', '--batch_size', type=int, default=READS_PER_FILE, required=False,
140 | help="Number of reads per multi-read file (default {}".format(READS_PER_FILE))
141 | parser.add_argument('-t', '--threads', type=int, default=1, required=False,
142 | help="Maximum number of threads to use")
143 | parser.add_argument('-r', '--recursive', action='store_true', required=False, default=False,
144 | help="Search recursively through folders for MultiRead fast5 files")
145 | parser.add_argument('--ignore_symlinks', action='store_true',
146 | help="Ignore symlinks when searching recursively for fast5 files")
147 | parser.add_argument('-c', '--compression', required=False, default=None,
148 | choices=list(COMPRESSION_MAP.keys()) + [None], help="Target output compression type")
149 | parser.add_argument('--file_list', required=False,
150 | help="File containing names of files to search in")
151 | args = parser.parse_args()
152 |
153 | if args.compression is not None:
154 | args.compression = COMPRESSION_MAP[args.compression]
155 |
156 | multifilter = Fast5Filter(input_folder=args.input,
157 | output_folder=args.save_path,
158 | filename_base=args.filename_base,
159 | read_list_file=args.read_id_list,
160 | batch_size=args.batch_size,
161 | threads=args.threads,
162 | recursive=args.recursive,
163 | file_list_file=args.file_list,
164 | follow_symlinks=not args.ignore_symlinks,
165 | target_compression=args.compression)
166 |
167 | multifilter.run_batch()
168 |
169 |
170 | if __name__ == '__main__':
171 | main()
172 |
--------------------------------------------------------------------------------
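A minimal sketch of using Fast5Filter directly, assuming a hypothetical input folder and a read-id list file ('read_ids.txt', either a single-column list or a sequencing summary with a 'read_id' column):

    from ont_fast5_api.conversion_tools.fast5_subset import Fast5Filter

    # All paths are assumed example values
    subsetter = Fast5Filter(input_folder="fast5_in/", output_folder="subset/",
                            read_list_file="read_ids.txt",
                            batch_size=4000, threads=2, recursive=True)
    subsetter.run_batch()
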
/ont_fast5_api/conversion_tools/multi_to_single_fast5.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 | from multiprocessing import Pool
3 | import logging
4 | import os
5 |
6 | from ont_fast5_api import __version__
7 | from ont_fast5_api.conversion_tools.conversion_utils import get_fast5_file_list, get_progress_bar
8 | from ont_fast5_api.fast5_file import EmptyFast5, Fast5FileTypeError
9 | from ont_fast5_api.fast5_interface import check_file_type, MULTI_READ
10 | from ont_fast5_api.multi_fast5 import MultiFast5File
11 |
12 | logging.basicConfig(level=logging.INFO)
13 | logger = logging.getLogger(__name__)
14 | exc_info = False
15 |
16 |
17 | def batch_convert_multi_files_to_single(input_path, output_folder, threads, recursive, follow_symlinks):
18 | pool = Pool(threads)
19 | file_list = get_fast5_file_list(input_path, recursive, follow_symlinks=follow_symlinks)
20 | pbar = get_progress_bar(len(file_list))
21 |
22 | def update(result):
23 | input_file = result[0]
24 | with open(os.path.join(output_folder, "filename_mapping.txt"), 'a') as output_table:
25 | for filename in result[1]:
26 | output_table.write("{}\t{}\n".format(input_file, filename))
27 | pbar.update(pbar.currval + 1)
28 |
29 | if not os.path.exists(output_folder):
30 | os.makedirs(output_folder)
31 |
32 | results_array = []
33 | for batch_num, filename in enumerate(file_list):
34 | results_array.append(pool.apply_async(convert_multi_to_single,
35 | args=(filename, output_folder,
36 | str(batch_num)),
37 | callback=update))
38 |
39 | pool.close()
40 | pool.join()
41 | pbar.finish()
42 |
43 |
44 | def convert_multi_to_single(input_file, output_folder, subfolder):
45 | output_files = ()
46 | try:
47 | output_files = try_multi_to_single_conversion(input_file, output_folder, subfolder)
48 | except Exception as e:
49 | logger.error("{}\n\tFailed to copy files from: {}"
50 | "".format(e, input_file), exc_info=exc_info)
51 | return input_file, output_files
52 |
53 |
54 | def try_multi_to_single_conversion(input_file, output_folder, subfolder):
55 | output_files = []
56 | with MultiFast5File(input_file, 'r') as multi_f5:
57 | file_type = check_file_type(multi_f5)
58 | if file_type != MULTI_READ:
59 | raise Fast5FileTypeError("Could not convert Multi->Single for file type '{}' with path '{}'"
60 | "".format(file_type, input_file))
61 | for read in multi_f5.get_reads():
62 | try:
63 | output_file = os.path.join(output_folder, subfolder, "{}.fast5".format(read.read_id))
64 | create_single_f5(output_file, read)
65 | output_files.append(os.path.basename(output_file))
66 | except Exception as e:
67 | logger.error("{}\n\tFailed to copy read '{}' from {}"
68 | "".format(str(e), read.read_id, input_file), exc_info=exc_info)
69 | return output_files
70 |
71 |
72 | def create_single_f5(output_file, read):
73 | if not os.path.exists(os.path.dirname(output_file)):
74 | os.makedirs(os.path.dirname(output_file))
75 | with EmptyFast5(output_file, 'w') as single_f5:
76 | for group in read.handle:
77 | if group == "Raw":
78 | read_number = read.handle["Raw"].attrs["read_number"]
79 | single_f5.handle.copy(read.handle[group], "Raw/Reads/Read_{}".format(read_number))
80 | elif group in ("channel_id", "context_tags", "tracking_id"):
81 | if "UniqueGlobalKey" not in single_f5.handle:
82 | single_f5.handle.create_group("UniqueGlobalKey")
83 | single_f5.handle.copy(read.handle[group], "UniqueGlobalKey/{}".format(group))
84 | else:
85 | single_f5.handle.copy(read.handle[group], group)
86 |
87 |
88 | def main():
89 | parser = ArgumentParser("")
90 | parser.add_argument('-i', '--input_path', required=True,
91 | help="MultiRead fast5 file or path to directory of MultiRead files")
92 | parser.add_argument('-s', '--save_path', required=True,
93 | help="Folder to output SingleRead fast5 files to")
94 | parser.add_argument('--recursive', action='store_true',
95 | help="Search recursively through folders for MultiRead fast5 files")
96 | parser.add_argument('--ignore_symlinks', action='store_true',
97 | help="Ignore symlinks when searching recursively for fast5 files")
98 | parser.add_argument('-t', '--threads', type=int, default=1, required=False,
99 | help="Number of threads to use")
100 | parser.add_argument('-v', '--version', action='version', version=__version__)
101 | args = parser.parse_args()
102 |
103 | batch_convert_multi_files_to_single(args.input_path, args.save_path, args.threads,
104 | args.recursive, follow_symlinks=not args.ignore_symlinks)
105 |
106 |
107 | if __name__ == '__main__':
108 | main()
109 |
--------------------------------------------------------------------------------
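The batch conversion above can be driven from Python as well. A minimal sketch, assuming hypothetical input and output folders:

    from ont_fast5_api.conversion_tools.multi_to_single_fast5 import batch_convert_multi_files_to_single

    # 'multi_reads/' and 'single_reads/' are assumed example paths
    batch_convert_multi_files_to_single(input_path="multi_reads/", output_folder="single_reads/",
                                        threads=2, recursive=True, follow_symlinks=True)
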
/ont_fast5_api/conversion_tools/single_to_multi_fast5.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | from argparse import ArgumentParser
4 | from multiprocessing import Pool
5 |
6 | from ont_fast5_api import __version__
7 | from ont_fast5_api.compression_settings import COMPRESSION_MAP
8 | from ont_fast5_api.conversion_tools.conversion_utils import get_fast5_file_list, batcher, get_progress_bar
9 | from ont_fast5_api.fast5_file import Fast5File, Fast5FileTypeError
10 | from ont_fast5_api.multi_fast5 import MultiFast5File
11 |
12 | logging.basicConfig(level=logging.INFO)
13 | logger = logging.getLogger(__name__)
14 | exc_info = False
15 |
16 |
17 | def batch_convert_single_to_multi(input_path, output_folder, filename_base, batch_size,
18 | threads, recursive, follow_symlinks, target_compression):
19 | pool = Pool(threads)
20 | file_list = get_fast5_file_list(input_path, recursive, follow_symlinks)
21 | pbar = get_progress_bar(int((len(file_list) + batch_size - 1) / batch_size))
22 |
23 | def update(result):
24 | output_file = result[1]
25 | with open(os.path.join(output_folder, "filename_mapping.txt"), 'a') as output_table:
26 | for filename in result[0]:
27 | output_table.write("{}\t{}\n".format(filename, output_file))
28 | pbar.update(pbar.currval + 1)
29 |
30 | results_array = []
31 | os.makedirs(output_folder, exist_ok=True)
32 | for batch_num, batch in enumerate(batcher(file_list, batch_size)):
33 | output_file = os.path.join(output_folder, "{}_{}.fast5".format(filename_base, batch_num))
34 | results_array.append(pool.apply_async(create_multi_read_file,
35 | args=(batch, output_file, target_compression),
36 | callback=update))
37 |
38 | pool.close()
39 | pool.join()
40 | pbar.finish()
41 |
42 |
43 | def create_multi_read_file(input_files, output_file, target_compression):
44 | results = []
45 | os.makedirs(os.path.dirname(output_file), exist_ok=True)
46 | if os.path.exists(output_file):
47 | logger.info("FileExists - appending new reads to existing file: {}".format(output_file))
48 | try:
49 | with MultiFast5File(output_file, 'a') as multi_f5:
50 | for filename in input_files:
51 | try:
52 | with Fast5File(filename, 'r') as f5_input:
53 | read = f5_input.get_read(f5_input.read_id)
54 | multi_f5.add_existing_read(read, target_compression=target_compression)
55 | results.append(os.path.basename(filename))
56 | except Fast5FileTypeError as e:
57 | logger.error("{}: Cannot input MultiRead files to single_to_multi: '{}'"
58 | "".format(e, filename), exc_info=exc_info)
59 | raise
60 | except Exception as e:
61 | logger.error("{}\n\tFailed to add single read file: '{}' to '{}'"
62 | "".format(e, filename, output_file), exc_info=exc_info)
63 |
64 | except Fast5FileTypeError:
65 | raise
66 | except Exception as e:
67 | logger.error("{}\n\tFailed to write to MultiRead file: {}"
68 | "".format(e, output_file), exc_info=exc_info)
69 | return results, output_file
70 |
71 |
72 | def main():
73 | parser = ArgumentParser("")
74 | parser.add_argument('-i', '--input_path', required=True,
75 | help='Folder containing single read fast5 files')
76 | parser.add_argument('-s', '--save_path', required=True,
77 | help="Folder to output multi read files to")
78 | parser.add_argument('-f', '--filename_base', default='batch', required=False,
79 | help="Root of output filename, default='batch' -> 'batch_0.fast5'")
80 | parser.add_argument('-n', '--batch_size', type=int, default=4000, required=False,
81 | help="Number of reads per multi-read file")
82 | parser.add_argument('-t', '--threads', type=int, default=1, required=False,
83 | help="Number of threads to use")
84 | parser.add_argument('--recursive', action='store_true',
85 | help="Search recursively through folders for single_read fast5 files")
86 | parser.add_argument('--ignore_symlinks', action='store_true',
87 | help="Ignore symlinks when searching recursively for fast5 files")
88 | parser.add_argument('-c', '--compression', required=False, default=None,
89 | choices=list(COMPRESSION_MAP.keys()) + [None], help="Target output compression type")
90 | parser.add_argument('-v', '--version', action='version', version=__version__)
91 | args = parser.parse_args()
92 |
93 | if args.compression is not None:
94 | args.compression = COMPRESSION_MAP[args.compression]
95 |
96 | batch_convert_single_to_multi(args.input_path,
97 | args.save_path,
98 | args.filename_base,
99 | args.batch_size,
100 | args.threads,
101 | args.recursive,
102 | follow_symlinks=not args.ignore_symlinks,
103 | target_compression=args.compression)
104 |
105 |
106 | if __name__ == '__main__':
107 | main()
108 |
--------------------------------------------------------------------------------
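A minimal sketch of calling the single-to-multi batch conversion programmatically, assuming hypothetical input and output folders; passing target_compression=None keeps the existing raw-data compression:

    from ont_fast5_api.conversion_tools.single_to_multi_fast5 import batch_convert_single_to_multi

    # 'single_reads/' and 'multi_reads/' are assumed example paths
    batch_convert_single_to_multi(input_path="single_reads/", output_folder="multi_reads/",
                                  filename_base="batch", batch_size=4000, threads=2,
                                  recursive=False, follow_symlinks=True, target_compression=None)
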
/ont_fast5_api/data_sanitisation.py:
--------------------------------------------------------------------------------
1 | import h5py
2 | import numpy as np
3 |
4 |
5 | def _clean(value):
6 | """ Convert numpy numeric types to their python equivalents. """
7 | if isinstance(value, np.ndarray):
8 | if value.dtype.kind == 'S':
9 | return np.char.decode(value).tolist()
10 | else:
11 | return value.tolist()
12 | elif type(value).__module__ == np.__name__:
13 | # h5py==2.8.0 on windows sometimes fails to cast this from an np.float64 to a python.float
14 | # We have to let the user do this themselves, since casting here could be dangerous
15 | # https://github.com/h5py/h5py/issues/1051
16 | conversion = value.item() # np.asscalar(value) was deprecated in v1.16
17 | if isinstance(conversion, bytes):
18 | conversion = conversion.decode()
19 | return conversion
20 | elif isinstance(value, bytes):
21 | return value.decode()
22 | else:
23 | return value
24 |
25 |
26 | def _sanitize_data_for_writing(data):
27 | # To make the interface more user friendly we encode python strings as byte-strings when writing datasets
28 | if isinstance(data, str):
29 | # Plain python-strings can be encoded trivially
30 | return data.encode()
31 | elif isinstance(data, np.ndarray) and data.dtype.kind == 'U':
32 | # If the array is all of one type, unicode-string, we can encode with numpy
33 | return data.astype('S')
34 | elif isinstance(data, np.ndarray) and len(data.dtype) > 1:
35 | # If the array is of mixed types we have to set the encoding column by column
36 | encoded_dtypes = []
37 | for field_name in data.dtype.names:
38 | field_dtype, field_byte_index = data.dtype.fields[field_name]
39 | if field_dtype.kind == 'U':
40 | str_len = field_dtype.itemsize // field_dtype.alignment
41 | field_dtype = np.dtype("|S{}".format(str_len))
42 | encoded_dtypes.append((field_name, field_dtype))
43 | return data.astype(encoded_dtypes)
44 |
45 | return data
46 |
47 |
48 | def _sanitize_data_for_reading(data):
49 | # To make the interface more user friendly we decode byte-strings into unicode strings when reading datasets
50 | if isinstance(data, h5py.Dataset):
51 | data = data[()]
52 |
53 | if isinstance(data, bytes):
54 | # Plain byte-strings can be decoded trivially
55 | return data.decode()
56 | elif isinstance(data, np.ndarray) and data.dtype.kind == 'S':
57 | # If the array is all of one type, byte-string, we can decode with numpy
58 | return np.char.decode(data)
59 | elif isinstance(data, np.ndarray) and len(data.dtype) > 1:
60 | # If the array is of mixed types we have to decode column by column
61 | decoded_dtypes = []
62 | for field_name in data.dtype.names:
63 | field_dtype, field_byte_index = data.dtype.fields[field_name]
64 | if field_dtype.kind == 'S':
65 | field_dtype = np.dtype("<U{}".format(field_dtype.itemsize))
66 | decoded_dtypes.append((field_name, field_dtype))
67 | return data.astype(decoded_dtypes)
68 | 
69 | return data
70 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
26 | 'numpy>=1.16',
27 | 'packaging',
28 | 'progressbar33>=2.3.1',
29 | 'setuptools']
30 |
31 | setup(name=__pkg_name__.replace("_", "-"),
32 | author='Oxford Nanopore Technologies, Limited',
33 | description='Oxford Nanopore Technologies fast5 API software',
34 | license='MPL 2.0',
35 | long_description=documentation,
36 | version=get_version(),
37 | url='https://github.com/nanoporetech/{}'.format(__pkg_name__),
38 | install_requires=installation_requirements,
39 | packages=find_packages(),
40 | package_data={__pkg_name__: ['vbz_plugin/*.so', 'vbz_plugin/*.dylib', 'vbz_plugin/*.dll']},
41 | python_requires='>=3.7',
42 | entry_points={'console_scripts': [
43 | "multi_to_single_fast5={}.conversion_tools.multi_to_single_fast5:main".format(__pkg_name__),
44 | "single_to_multi_fast5={}.conversion_tools.single_to_multi_fast5:main".format(__pkg_name__),
45 | "fast5_subset={}.conversion_tools.fast5_subset:main".format(__pkg_name__),
46 | "compress_fast5={}.conversion_tools.compress_fast5:main".format(__pkg_name__),
47 | "check_compression={}.conversion_tools.check_file_compression:main".format(__pkg_name__),
48 | "demux_fast5={}.conversion_tools.demux_fast5:main".format(__pkg_name__),
49 | ]},
50 | classifiers=[
51 | 'Development Status :: 5 - Production/Stable',
52 | 'Environment :: Console',
53 | 'Intended Audience :: Developers',
54 | 'Intended Audience :: Science/Research',
55 | 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)',
56 | 'Natural Language :: English',
57 | 'Operating System :: Microsoft :: Windows',
58 | 'Operating System :: POSIX :: Linux',
59 | 'Operating System :: MacOS',
60 | 'Programming Language :: Python :: 3 :: Only',
61 | 'Topic :: Scientific/Engineering :: Bio-Informatics',
62 | ],
63 | keywords='fast5 nanopore')
64 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/__init__.py
--------------------------------------------------------------------------------
/test/data/basecall_2d_file_v1.0.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/basecall_2d_file_v1.0.fast5
--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/00031f3e-415c-4ab5-9c16-fb6fe45ff519.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/00031f3e-415c-4ab5-9c16-fb6fe45ff519.fast5
--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/000c0b4e-46c2-4fb5-9b17-d7031eefb975.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/000c0b4e-46c2-4fb5-9b17-d7031eefb975.fast5
--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/000ebd63-3e1a-4499-9ded-26af3225a022.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/000ebd63-3e1a-4499-9ded-26af3225a022.fast5
--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/002ad0e4-c6bb-4eff-a30f-5fec01475ab8.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/002ad0e4-c6bb-4eff-a30f-5fec01475ab8.fast5
--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/002b0891-03bf-4622-ae66-ae6984890ed4.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/002b0891-03bf-4622-ae66-ae6984890ed4.fast5
--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/0048058c-ecb4-4a0f-b283-9a128bd598c5.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/0048058c-ecb4-4a0f-b283-9a128bd598c5.fast5
--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/004a87b0-c9f6-4237-b4d6-466ab979aee2.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/004a87b0-c9f6-4237-b4d6-466ab979aee2.fast5
--------------------------------------------------------------------------------
/test/data/hardlink/single_reads/0059d270-3238-4413-b38b-f588e28326df.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/single_reads/0059d270-3238-4413-b38b-f588e28326df.fast5
--------------------------------------------------------------------------------
/test/data/hardlink/unlinked/batch0.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/hardlink/unlinked/batch0.fast5
--------------------------------------------------------------------------------
/test/data/multi_read/batch_0.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/multi_read/batch_0.fast5
--------------------------------------------------------------------------------
/test/data/multi_read_analyses/batch_0.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/multi_read_analyses/batch_0.fast5
--------------------------------------------------------------------------------
/test/data/read_file_v0.6_raw.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/read_file_v0.6_raw.fast5
--------------------------------------------------------------------------------
/test/data/read_file_v0.6_single.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/read_file_v0.6_single.fast5
--------------------------------------------------------------------------------
/test/data/read_file_v1.0_single.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/read_file_v1.0_single.fast5
--------------------------------------------------------------------------------
/test/data/rle_basecall_table/rle_example.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/rle_basecall_table/rle_example.fast5
--------------------------------------------------------------------------------
/test/data/single_read_analyses/read.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_read_analyses/read.fast5
--------------------------------------------------------------------------------
/test/data/single_reads/fe85b517-62ee-4a33-8767-41cab5d5ab39.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_reads/fe85b517-62ee-4a33-8767-41cab5d5ab39.fast5
--------------------------------------------------------------------------------
/test/data/single_reads/fe8a3026-d1f4-46b3-8daa-e610f27acde1.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_reads/fe8a3026-d1f4-46b3-8daa-e610f27acde1.fast5
--------------------------------------------------------------------------------
/test/data/single_reads/fe9374ee-b86a-4ca4-81dc-ac06e3297728.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_reads/fe9374ee-b86a-4ca4-81dc-ac06e3297728.fast5
--------------------------------------------------------------------------------
/test/data/single_reads/read0.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/single_reads/read0.fast5
--------------------------------------------------------------------------------
/test/data/summaries/two_barcode_summary.txt:
--------------------------------------------------------------------------------
1 | read_id barcode_arrangement
2 | fe85b517-62ee-4a33-8767-41cab5d5ab39 barcode01
3 | fe9374ee-b86a-4ca4-81dc-ac06e3297728 barcode02
4 | fe849dd3-63bc-4044-8910-14e1686273bb barcode02
5 | fe8a3026-d1f4-46b3-8daa-e610f27acde1 barcode01
--------------------------------------------------------------------------------
/test/data/telemetry_test.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/telemetry_test.fast5
--------------------------------------------------------------------------------
/test/data/vbz_reads/vbz_reads.fast5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nanoporetech/ont_fast5_api/bbabb444a80e8dad7996aff47c782e3d45423ff5/test/data/vbz_reads/vbz_reads.fast5
--------------------------------------------------------------------------------
/test/helpers.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | from tempfile import TemporaryDirectory, _get_candidate_names
4 | import unittest
5 |
6 | test_data = os.path.join(os.path.dirname(__file__), 'data')
7 |
8 |
9 | def disable_logging(test_func):
10 | def do_test(self, *args, **kwargs):
11 | logging.disable(logging.CRITICAL)
12 | test_func(self, *args, **kwargs)
13 |
14 | return do_test
15 |
16 |
17 | class TestFast5ApiHelper(unittest.TestCase):
18 |
19 | def setUp(self):
20 | self._tmp_dir = TemporaryDirectory()
21 | self.save_path = self._tmp_dir.name
22 |
23 | def tearDown(self):
24 | self._tmp_dir.cleanup()
25 |
26 | def generate_temp_filename(self):
27 | return os.path.join(self.save_path, next(_get_candidate_names()))
28 |
--------------------------------------------------------------------------------
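Usage sketch (not part of the repository): the helper above gives each test an isolated temporary directory plus a filename generator. A minimal illustration of how a new test module could build on it; the class name and assertions below are assumptions, not repository code.

import os

from test.helpers import TestFast5ApiHelper


class TestScratchFiles(TestFast5ApiHelper):

    def test_temp_filename_is_inside_save_path(self):
        # generate_temp_filename() only constructs a path inside the managed
        # temporary directory; it does not create the file itself.
        fname = self.generate_temp_filename()
        self.assertTrue(fname.startswith(self.save_path))
        self.assertFalse(os.path.exists(fname))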
/test/test_alignment_tools.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | from ont_fast5_api.fast5_file import Fast5File
4 | from ont_fast5_api.analysis_tools.alignment import AlignmentTools
5 | from test.helpers import TestFast5ApiHelper
6 |
7 |
8 | class TestAlignmentTools(TestFast5ApiHelper):
9 |
10 | def test_001_put_and_retrieve(self):
11 | fname = self.generate_temp_filename()
12 | summary_temp = {'genome': 'Lambda',
13 | 'genome_start': 100,
14 | 'genome_end': 200,
15 | 'strand_start': 1,
16 | 'strand_end': 101,
17 | 'num_events': 125,
18 | 'num_aligned': 92,
19 | 'num_correct': 87,
20 | 'num_insertions': 8,
21 | 'num_deletions': 8,
22 | 'identity': 0.9457,
23 | 'accuracy': 0.8056}
24 | summary_comp = {'genome': 'Lambda_rc',
25 | 'genome_start': 100,
26 | 'genome_end': 200,
27 | 'strand_start': 0,
28 | 'strand_end': 96,
29 | 'num_events': 120,
30 | 'num_aligned': 90,
31 | 'num_correct': 88,
32 | 'num_insertions': 6,
33 | 'num_deletions': 10,
34 | 'identity': 0.9778,
35 | 'accuracy': 0.8302}
36 | summary_2d = {'genome': 'Lambda',
37 | 'genome_start': 100,
38 | 'genome_end': 200,
39 | 'strand_start': 0,
40 | 'strand_end': 100,
41 | 'num_events': 125,
42 | 'num_aligned': 98,
43 | 'num_correct': 96,
44 | 'num_insertions': 4,
45 | 'num_deletions': 4,
46 | 'identity': 0.9796,
47 | 'accuracy': 0.9057}
48 | sam1 = 'Dummy string for template SAM.'
49 | sam2 = 'Dummy string for complement SAM.'
50 | sam3 = 'Dummy string for 2D SAM.'
51 | sequence1 = ''.join(np.random.choice(['A', 'C', 'G', 'T'], 100))
52 | bc = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
53 | sequence2 = ''.join([bc[letter] for letter in sequence1[::-1]])
54 | with Fast5File(fname, mode='w') as fh:
55 | fh.add_channel_info({'channel_number': 1,
56 | 'sampling_rate': 4000,
57 | 'digitisation': 8192,
58 | 'range': 819.2,
59 | 'offset': 0})
60 | fh.add_read(12, 'unique_snowflake', 12345, 4000, 0, 120.75)
61 | attrs = {'name': 'test', 'version': 0, 'time_stamp': 'just now', 'component': 'segmentation'}
62 | fh.add_analysis('segmentation', 'Segmentation_000', attrs)
63 | seg_data = {'has_template': 1,
64 | 'has_complement': 1,
65 | 'first_sample_template': 0,
66 | 'duration_template': 2000,
67 | 'first_sample_complement': 2000,
68 | 'duration_complement': 2000}
69 | fh.set_summary_data('Segmentation_000', 'segmentation', seg_data)
70 | attrs['component'] = 'alignment'
71 | attrs['segmentation'] = 'Analyses/Segmentation_000'
72 | fh.add_analysis('alignment', 'Alignment_000', attrs)
73 | fh.set_summary_data('Alignment_000', 'genome_mapping_template', summary_temp)
74 | fh.set_summary_data('Alignment_000', 'genome_mapping_complement', summary_comp)
75 | fh.set_summary_data('Alignment_000', 'genome_mapping_2d', summary_2d)
76 | with AlignmentTools(fh, group_name='Alignment_000') as align:
77 | align.add_alignment_data('template', sam1, sequence1)
78 | align.add_alignment_data('complement', sam2, sequence2)
79 | align.add_alignment_data('2d', sam3, sequence1)
80 | with Fast5File(fname, mode='r') as fh:
81 | with AlignmentTools(fh, group_name='Alignment_000') as align:
82 | sam, seq = align.get_alignment_data('template')
83 | self.assertEqual(sam1, sam)
84 | self.assertEqual(sequence1, seq)
85 | sam, seq = align.get_alignment_data('complement')
86 | self.assertEqual(sam2, sam)
87 | self.assertEqual(sequence2, seq)
88 | sam, seq = align.get_alignment_data('2d')
89 | self.assertEqual(sam3, sam)
90 | self.assertEqual(sequence1, seq)
91 | results = align.get_results()
92 | speed_temp = align.calculate_speed('template')
93 | speed_comp = align.calculate_speed('complement')
94 | # Make sure we can calculate speed using only what's in the
95 | # summary
96 | summary = fh.get_summary_data('Alignment_000')
97 | template_summary = summary['genome_mapping_template']
98 | summary_speed_temp = align.calculate_speed('template',
99 | template_summary)
100 | self.assertEqual(250, speed_temp)
101 | self.assertEqual(250, speed_comp)
102 | self.assertEqual(speed_temp, summary_speed_temp)
103 | self.assertDictEqual({'status': 'match found',
104 | 'direction': 'forward',
105 | 'ref_name': 'Lambda',
106 | 'ref_span': (100, 200),
107 | 'seq_span': (1, 101),
108 | 'seq_len': 125,
109 | 'num_aligned': 92,
110 | 'num_correct': 87,
111 | 'num_insertions': 8,
112 | 'num_deletions': 8,
113 | 'identity': 0.9457,
114 | 'accuracy': 0.8056}, results['template'])
115 | self.assertDictEqual({'status': 'match found',
116 | 'direction': 'reverse',
117 | 'ref_name': 'Lambda',
118 | 'ref_span': (100, 200),
119 | 'seq_span': (0, 96),
120 | 'seq_len': 120,
121 | 'num_aligned': 90,
122 | 'num_correct': 88,
123 | 'num_insertions': 6,
124 | 'num_deletions': 10,
125 | 'identity': 0.9778,
126 | 'accuracy': 0.8302}, results['complement'])
127 | self.assertDictEqual({'status': 'match found',
128 | 'direction': 'forward',
129 | 'ref_name': 'Lambda',
130 | 'ref_span': (100, 200),
131 | 'seq_span': (0, 100),
132 | 'seq_len': 125,
133 | 'num_aligned': 98,
134 | 'num_correct': 96,
135 | 'num_insertions': 4,
136 | 'num_deletions': 4,
137 | 'identity': 0.9796,
138 | 'accuracy': 0.9057}, results['2d'])
139 |
--------------------------------------------------------------------------------
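Usage sketch (not part of the repository): the test above writes alignment summaries and SAM strings and then reads them back. A minimal read-only sketch of the same retrieval calls; the filename and analysis group name are assumptions.

from ont_fast5_api.fast5_file import Fast5File
from ont_fast5_api.analysis_tools.alignment import AlignmentTools

with Fast5File('example.fast5', mode='r') as fh:
    with AlignmentTools(fh, group_name='Alignment_000') as align:
        sam, seq = align.get_alignment_data('template')  # SAM string and called sequence
        results = align.get_results()                    # per-section summary dicts
        speed = align.calculate_speed('template')        # derived from the stored summary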
/test/test_basecall_1d_tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from ont_fast5_api.fast5_file import Fast5File
3 | from ont_fast5_api.analysis_tools.basecall_1d import Basecall1DTools
4 | from test.helpers import TestFast5ApiHelper
5 |
6 |
7 | class TestBasecall1DTools(TestFast5ApiHelper):
8 |
9 | def test_001_put_and_retrieve(self):
10 | fname = self.generate_temp_filename()
11 | dtypes = [('mean', float),
12 | ('start', float),
13 | ('stdv', float),
14 | ('length', float),
15 | ('called_state', '
--------------------------------------------------------------------------------
/test/test_fast5_conversion_utils.py:
--------------------------------------------------------------------------------
12 | def setUp(self) -> None:
13 | super().setUp()
14 |
15 | # Known good read_ids from test_data/multi_read/batch_0.fast5
16 | self.read_id_set = {'fe849dd3-63bc-4044-8910-14e1686273bb',
17 | 'fe85b517-62ee-4a33-8767-41cab5d5ab39'}
18 | self.read_id_list = ['fe849dd3-63bc-4044-8910-14e1686273bb',
19 | 'fe85b517-62ee-4a33-8767-41cab5d5ab39']
20 | self.fast5_path = test_data + "/multi_read/batch_0.fast5"
21 |
22 |
23 | def test_yield_fast5_files_from_fast5_file(self):
24 | f5_gen = yield_fast5_files(self.fast5_path, recursive=False)
25 | f5_path = next(f5_gen)
26 | self.assertTrue(Path(f5_path).is_file(), "Filepath is not a file")
27 | self.assertTrue(f5_path.endswith('.fast5'), "Filepath does not end with fast5 extension")
28 | self.assertTrue(Path(f5_path).absolute() == Path(self.fast5_path).absolute(),
29 | "Direct path did not return itself")
30 |
31 | def test_yield_fast5_files_from_dir(self):
32 | f5_gen = yield_fast5_files(test_data, recursive=False)
33 |
34 | for f5_path in f5_gen:
35 | self.assertTrue(Path(f5_path).is_file(), "Filepath is not a file")
36 | self.assertTrue(f5_path.endswith('.fast5'), "Filepath does not end with fast5 extension")
37 |
38 | def test_yield_fast5_reads_from_fast5_file(self):
39 | f5_read_gen = yield_fast5_reads(self.fast5_path, recursive=False)
40 | read_id, read_data = next(f5_read_gen)
41 | self.assertTrue(read_id is not None, "read_id is None")
42 | self.assertTrue(isinstance(read_data, Fast5Read), "Return is not Fast5Read instance")
43 |
44 | def test_yield_fast5_reads_from_dir(self):
45 | f5_read_gen = yield_fast5_reads(test_data, recursive=False)
46 | read_id, read_data = next(f5_read_gen)
47 | self.assertTrue(read_id is not None, "read_id is None")
48 | self.assertTrue(isinstance(read_data, Fast5Read), "Return is not Fast5Read instance")
49 |
50 | def test_yield_fast5_reads_with_set(self):
51 | f5_read_gen = yield_fast5_reads(self.fast5_path,
52 | recursive=False,
53 | read_ids=self.read_id_set)
54 | f5_reads = list(f5_read_gen)
55 | self.assertTrue(len(f5_reads) == len(self.read_id_set))
56 |
57 | for read_id, read_data in f5_reads:
58 | self.assertTrue(read_id in self.read_id_set, "A read_id is not a member of read_ids")
59 | self.assertTrue(isinstance(read_data, Fast5Read), "Return is not Fast5Read instance")
60 |
61 | def test_yield_fast5_reads_with_list(self):
62 | f5_read_gen = yield_fast5_reads(self.fast5_path,
63 | recursive=False,
64 | read_ids=self.read_id_list)
65 | f5_reads = list(f5_read_gen)
66 | self.assertTrue(len(f5_reads) == len(self.read_id_list))
67 |
68 | for read_id, read_data in f5_reads:
69 | self.assertTrue(read_id in self.read_id_list, "A read_id is not a member of read_id_list")
70 | self.assertTrue(isinstance(read_data, Fast5Read), "Return is not Fast5Read instance")
71 |
72 | def test_yield_fast5_reads_set_versus_list_equality(self):
73 | f5_read_gen_by_id_set = yield_fast5_reads(self.fast5_path,
74 | recursive=False,
75 | read_ids=self.read_id_set)
76 |
77 | f5_read_gen_by_id_list = yield_fast5_reads(self.fast5_path,
78 | recursive=False,
79 | read_ids=self.read_id_list)
80 |
81 | # Consume the generators into sets
82 | ids_by_set = set(rid for rid, _ in f5_read_gen_by_id_set)
83 | ids_by_list = set(rid for rid, _ in f5_read_gen_by_id_list)
84 | self.assertTrue(ids_by_list == ids_by_set, 'Ids differ when using read_id list versus set')
85 |
86 |
87 | def test_yield_fast5_reads_with_empty_set(self):
88 | f5_read_gen = yield_fast5_reads(self.fast5_path,
89 | recursive=False,
90 | read_ids=set([]))
91 |
92 | self.assertTrue(len(list(f5_read_gen)) != 0, "Empty read_ids resulted in zero returned reads")
93 |
94 | def test_yield_fast5_reads_with_garbage_set(self):
95 | f5_read_gen = yield_fast5_reads(self.fast5_path,
96 | recursive=False,
97 | read_ids={'_g4rbag£_'})
98 | f5_reads = list(f5_read_gen)
99 | self.assertTrue(len(f5_reads) == 0, "Garbage read_ids returned non-zero reads")
100 |
101 | def test_yield_fast5_reads_type_error(self):
102 | with self.assertRaisesRegex(TypeError, 'read_ids'):
103 | f5_read_gen = yield_fast5_reads(self.fast5_path,
104 | recursive=False,
105 | read_ids=int(1))
106 | next(f5_read_gen)
--------------------------------------------------------------------------------
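Usage sketch (not part of the repository): based on the tests above, a small example of iterating reads with yield_fast5_reads. The directory path and read_id filter are placeholders.

from ont_fast5_api.conversion_tools.conversion_utils import yield_fast5_reads

wanted = {'fe85b517-62ee-4a33-8767-41cab5d5ab39'}
for read_id, read in yield_fast5_reads('/data/fast5', recursive=True, read_ids=wanted):
    # Each item is a (read_id, Fast5Read) pair; an empty read_ids set disables filtering.
    print(read_id, len(read.get_raw_data()))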
/test/test_fast5_converter.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import patch
2 |
3 | import os
4 | import h5py
5 | import numpy
6 |
7 | from ont_fast5_api.conversion_tools.multi_to_single_fast5 import convert_multi_to_single, try_multi_to_single_conversion
8 | from ont_fast5_api.conversion_tools.single_to_multi_fast5 import batch_convert_single_to_multi, get_fast5_file_list, \
9 | create_multi_read_file
10 | from ont_fast5_api.multi_fast5 import MultiFast5File
11 | from ont_fast5_api.fast5_file import Fast5FileTypeError, Fast5File
12 | from test.helpers import TestFast5ApiHelper, test_data, disable_logging
13 |
14 |
15 | class TestFast5Converter(TestFast5ApiHelper):
16 |
17 | @patch('ont_fast5_api.conversion_tools.single_to_multi_fast5.get_progress_bar')
18 | def test_single_to_multi(self, mock_pbar):
19 | input_folder = os.path.join(test_data, "single_reads")
20 | batch_size = 3
21 | file_count = len(os.listdir(input_folder))
22 | batch_convert_single_to_multi(input_folder, self.save_path, filename_base="batch", batch_size=batch_size,
23 | threads=1, recursive=False, follow_symlinks=False, target_compression=None)
24 |
25 | expected_output_reads = {"filename_mapping.txt": 0,
26 | "batch_0.fast5": batch_size,
27 | "batch_1.fast5": file_count % batch_size}
28 | self.assertEqual(sorted(os.listdir(self.save_path)), sorted(list(expected_output_reads.keys())))
29 | for file, read_count in expected_output_reads.items():
30 | if read_count > 0:
31 | with h5py.File(os.path.join(self.save_path, file), 'r') as f5:
32 | self.assertEqual(len(f5), read_count)
33 |
34 | def test_multi_to_single(self):
35 | input_file = os.path.join(test_data, "multi_read", "batch_0.fast5")
36 | with MultiFast5File(input_file, 'r') as f5:
37 | read_count = len(f5.handle)
38 | expected_files = sorted([os.path.join(self.save_path, "{}", i + '.fast5') for i in f5.get_read_ids()])
39 |
40 | subfolder = '0'
41 | convert_multi_to_single(input_file, self.save_path, subfolder)
42 |
43 | out_files = sorted(get_fast5_file_list(self.save_path, recursive=True, follow_symlinks=True))
44 | self.assertEqual(len(out_files), read_count)
45 | self.assertEqual(out_files, [f.format(subfolder) for f in expected_files])
46 |
47 | @disable_logging
48 | def test_single_to_multi_incorrect_types(self):
49 | input_files = [os.path.join(test_data, "multi_read", "batch_0.fast5")]
50 | with self.assertRaises(Fast5FileTypeError):
51 | create_multi_read_file(input_files, self.generate_temp_filename(), target_compression=None)
52 |
53 | def test_multi_to_single_incorrect_types(self):
54 | input_folder = os.path.join(test_data, "single_reads")
55 | input_file = os.path.join(input_folder, os.listdir(input_folder)[0])
56 | with self.assertRaises(Fast5FileTypeError):
57 | try_multi_to_single_conversion(input_file, self.save_path, subfolder='0')
58 |
59 | def test_add_read_to_multi(self):
60 | with Fast5File(os.path.join(test_data, "single_reads", "read0.fast5"), 'r') as single_fast5, \
61 | MultiFast5File(self.generate_temp_filename(), 'w') as multi_out:
62 | multi_out.add_existing_read(single_fast5)
63 | expected_raw = single_fast5.get_raw_data()
64 | actual_raw = multi_out.get_read(single_fast5.get_read_id()).get_raw_data()
65 | self.assertTrue(numpy.array_equal(actual_raw, expected_raw))
66 |
--------------------------------------------------------------------------------
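Usage sketch (not part of the repository): the single-to-multi conversion call exercised in test_single_to_multi above, with placeholder paths. batch_size controls how many reads go into each output file.

from ont_fast5_api.conversion_tools.single_to_multi_fast5 import batch_convert_single_to_multi

batch_convert_single_to_multi('/data/single_reads', '/data/multi_reads',
                              filename_base="batch", batch_size=4000, threads=1,
                              recursive=False, follow_symlinks=False,
                              target_compression=None)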
/test/test_fast5_interface.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | from ont_fast5_api.fast5_file import Fast5File
5 | from ont_fast5_api.fast5_interface import get_fast5_file, check_file_type, MULTI_READ, SINGLE_READ
6 | from ont_fast5_api.multi_fast5 import MultiFast5File
7 | from test.helpers import test_data
8 |
9 |
10 |
11 | class TestFast5Interface(unittest.TestCase):
12 |
13 | def test_correct_type(self):
14 | single_read_path = os.path.join(test_data, "single_reads", "read0.fast5")
15 | single_read_id = Fast5File(single_read_path).get_read_id()
16 | with get_fast5_file(single_read_path) as f5:
17 | self.assertTrue(isinstance(f5, Fast5File))
18 | self.assertEqual(check_file_type(f5), SINGLE_READ)
19 | self.assertEqual(len(f5.get_read_ids()), 1)
20 | self.assertEqual(single_read_id, f5.get_read_ids()[0])
21 | self.get_raw(f5)
22 |
23 | multi_read_path = os.path.join(test_data, "multi_read", "batch_0.fast5")
24 | with get_fast5_file(multi_read_path) as f5:
25 | self.assertTrue(isinstance(f5, MultiFast5File))
26 | self.assertEqual(check_file_type(f5), MULTI_READ)
27 | self.assertTrue(len(f5.get_read_ids()) >= 1)
28 | self.get_raw(f5)
29 |
30 | def get_raw(self, f5):
31 | # Test we can get raw data using the same method for single and multi
32 | raw_data = f5.get_read(f5.get_read_ids()[0]).get_raw_data()
33 | self.assertTrue(len(raw_data) > 0)
34 |
--------------------------------------------------------------------------------
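Usage sketch (not part of the repository): as the test above shows, get_fast5_file() hides the single-read/multi-read distinction, so the same read-access loop works for either layout. The path below is a placeholder.

from ont_fast5_api.fast5_interface import get_fast5_file

with get_fast5_file('/data/example.fast5', mode='r') as f5:
    for read_id in f5.get_read_ids():
        raw = f5.get_read(read_id).get_raw_data()
        print(read_id, len(raw))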
/test/test_fast5_subset.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy
3 | from unittest.mock import patch
4 | from pathlib import Path
5 |
6 | from ont_fast5_api.compression_settings import VBZ
7 | from ont_fast5_api.conversion_tools.fast5_subset import Fast5Filter
8 | from ont_fast5_api.conversion_tools.conversion_utils import Fast5FilterWorker, extract_selected_reads, read_generator
9 | from ont_fast5_api.multi_fast5 import MultiFast5File
10 | from ont_fast5_api.fast5_file import Fast5File
11 | from test.helpers import TestFast5ApiHelper, test_data
12 |
13 |
14 | class TestFast5Subset(TestFast5ApiHelper):
15 | input_multif5_path = Path(test_data) / "multi_read" / "batch_0.fast5"
16 | read_set = {"fe85b517-62ee-4a33-8767-41cab5d5ab39", "fe9374ee-b86a-4ca4-81dc-ac06e3297728"}
17 |
18 | def test_read_generator(self):
19 | count = 0
20 | for read_id, read in read_generator(input_file=self.input_multif5_path, read_set=self.read_set):
21 | self.assertIn(read_id, self.read_set)
22 | count += 1
23 |
24 | self.assertEqual(len(self.read_set), count)
25 |
26 | def _create_read_list_file(self, read_ids):
27 | output_path = os.path.join(self.save_path, 'read_list.txt')
28 | with open(output_path, 'w') as fh:
29 | for read_id in read_ids:
30 | fh.write(read_id + "\n")
31 | return output_path
32 |
33 | @patch('ont_fast5_api.conversion_tools.fast5_subset.logging')
34 | @patch('ont_fast5_api.conversion_tools.fast5_subset.get_progress_bar')
35 | def test_subset_from_single(self, mock_pbar, mock_log):
36 | input_path = os.path.join(test_data, "single_reads")
37 | read_list = self._create_read_list_file(self.read_set)
38 | f5_filter = Fast5Filter(input_folder=input_path,
39 | output_folder=self.save_path,
40 | read_list_file=read_list)
41 | f5_filter.run_batch()
42 |
43 | count = 0
44 | with MultiFast5File(os.path.join(self.save_path, 'batch0.fast5'), 'r') as output_f5:
45 | for input_file in os.listdir(input_path):
46 | with Fast5File(os.path.join(input_path, input_file), 'r') as input_f5:
47 | read_id = input_f5.get_read_id()
48 | if read_id in self.read_set:
49 | read_in = input_f5.get_read(read_id)
50 | read_out = output_f5.get_read(read_id)
51 | self.assertTrue(numpy.array_equal(read_in.get_raw_data(), read_out.get_raw_data()))
52 | count += 1
53 | self.assertEqual(len(self.read_set), count)
54 |
55 | @patch('ont_fast5_api.conversion_tools.fast5_subset.logging')
56 | @patch('ont_fast5_api.conversion_tools.fast5_subset.get_progress_bar')
57 | def test_subset_from_multi(self, mock_pbar, mock_log):
58 | read_list = self._create_read_list_file(self.read_set)
59 | f5_filter = Fast5Filter(input_folder=os.path.dirname(self.input_multif5_path),
60 | output_folder=self.save_path,
61 | read_list_file=read_list)
62 | f5_filter.run_batch()
63 | with MultiFast5File(self.input_multif5_path, 'r') as input_f5, \
64 | MultiFast5File(os.path.join(self.save_path, 'batch0.fast5'), 'r') as output_f5:
65 | self.assertEqual(len(self.read_set), len(output_f5.get_read_ids()))
66 | for read_id in self.read_set:
67 | read_in = input_f5.get_read(read_id)
68 | read_out = output_f5.get_read(read_id)
69 | self.assertTrue(numpy.array_equal(read_in.get_raw_data(), read_out.get_raw_data()))
70 |
71 | def test_extract_selected_reads(self):
72 | # three tests: count below, equal to, and above the number of reads in the input file
73 | for count in (1, 2, 3):
74 | temp_file_name = self.generate_temp_filename()
75 | found_reads, output_file, input_file = extract_selected_reads(input_file=self.input_multif5_path,
76 | output_file=temp_file_name,
77 | count=count, read_set=self.read_set)
78 | if count < len(self.read_set):
79 | self.assertTrue(found_reads.issubset(self.read_set))
80 | self.assertEqual(input_file, self.input_multif5_path)
81 | elif count == len(self.read_set):
82 | self.assertEqual(found_reads, self.read_set)
83 | self.assertEqual(input_file, self.input_multif5_path)
84 | elif count >= len(self.read_set):
85 | self.assertEqual(found_reads, self.read_set)
86 | self.assertIsNone(input_file)
87 |
88 | self.assertEqual(output_file, temp_file_name)
89 | # verify that the resulting output file is a valid MultiFast5 with the desired reads in it
90 | with MultiFast5File(output_file) as multi_file:
91 | readlist = multi_file.get_read_ids()
92 | self.assertTrue(set(readlist).issubset(self.read_set))
93 |
94 | @patch('ont_fast5_api.conversion_tools.conversion_utils.ProgressBar')
95 | @patch('ont_fast5_api.conversion_tools.fast5_subset.logging')
96 | def test_selector_args_generator(self, mock_logger, mock_pbar):
97 | single_reads = os.path.join(test_data, "single_reads")
98 | self.assertTrue(os.path.isdir(single_reads), msg=single_reads)
99 |
100 | input_f5s = list(Path(single_reads).glob('*.fast5'))
101 | batch_size = 1
102 |
103 | f = Fast5FilterWorker(
104 | input_file_list=input_f5s,
105 | output_dir=Path(self.save_path),
106 | read_set=self.read_set,
107 | batch_size=batch_size,
108 | filename_base="batch",
109 | target_compression=VBZ,
110 | progressbar=mock_pbar,
111 | logger=mock_logger
112 | )
113 |
114 | args_combos = list(f._args_generator())
115 | # there should be two tuples of arguments
116 | self.assertEqual(len(args_combos), len(self.read_set) / batch_size)
117 |
118 | num_files_queued = len(f.input_f5s) # should be 0
119 | self.assertEqual(num_files_queued, (len(input_f5s) - len(args_combos)), msg=f.input_f5s)
120 | self.assertEqual(len(f.available_out_files), 0)
121 |
122 | # "exhaust" an input file and put output file back on queue
123 | input_file, output_file, reads, count, compression = args_combos[0]
124 | f._update_file_lists(reads={}, in_file=None, out_file=output_file)
125 | self.assertEqual(len(f.input_f5s), num_files_queued)
126 | self.assertEqual(len(f.available_out_files), 1)
127 | self.assertEqual(compression, VBZ)
128 |
129 | # this results in another args tuple generated
130 | new_args_combos = list(f._args_generator())
131 | self.assertEqual(len(new_args_combos), 1, msg=len(new_args_combos))
--------------------------------------------------------------------------------
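Usage sketch (not part of the repository): the subsetting entry point used in the tests above. The paths are placeholders; the read list file contains one read_id per line.

from ont_fast5_api.conversion_tools.fast5_subset import Fast5Filter

f5_filter = Fast5Filter(input_folder='/data/fast5',
                        output_folder='/data/fast5_subset',
                        read_list_file='/data/read_list.txt')
f5_filter.run_batch()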
/test/test_hardlink_metadata.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from ont_fast5_api.compression_settings import VBZ
4 | from ont_fast5_api.conversion_tools.compress_fast5 import compress_file
5 | from ont_fast5_api.conversion_tools.conversion_utils import extract_selected_reads
6 | from ont_fast5_api.fast5_interface import get_fast5_file
7 | from ont_fast5_api.multi_fast5 import MultiFast5File
8 | from ont_fast5_api.static_data import HARDLINK_GROUPS
9 | from test.helpers import TestFast5ApiHelper, test_data
10 |
11 |
12 | class TestHardlinkMetaData(TestFast5ApiHelper):
13 | read_subset = {'00031f3e-415c-4ab5-9c16-fb6fe45ff519',
14 | "000c0b4e-46c2-4fb5-9b17-d7031eefb975",
15 | '000ebd63-3e1a-4499-9ded-26af3225a022',
16 | '002ad0e4-c6bb-4eff-a30f-5fec01475ab8',
17 | '0059d270-3238-4413-b38b-f588e28326df'}
18 |
19 | def test_create_read(self):
20 | input_path = os.path.join(test_data, 'hardlink', 'unlinked', 'batch0.fast5')
21 | output_path = self.generate_temp_filename()
22 | compress_file(input_path, output_path, target_compression=VBZ)
23 | new_read_id = "123456789abcdef"
24 | with MultiFast5File(output_path, 'a') as f5:
25 | # Test we can hardlink to existing metadata when creating a new empty read
26 | run_id = list(f5.run_id_map.keys())[0]
27 | master_read_id = f5.run_id_map[run_id]
28 | f5.create_empty_read(new_read_id, run_id)
29 | for group in HARDLINK_GROUPS:
30 | self.assertTrue(self.is_read_hardlinked(f5, new_read_id, master_read_id, group))
31 |
32 | # Test we don't explode if there is no metadata
33 | f5.create_empty_read(new_read_id[::-1], "not an existing run_id")
34 |
35 | def test_hardlink_multi_compression(self):
36 | input_path = os.path.join(test_data, 'hardlink', 'unlinked', 'batch0.fast5')
37 | output_path = self.generate_temp_filename()
38 |
39 | self.assertFalse(self.is_file_hardlinked(input_path))
40 | compress_file(input_path, output_path, target_compression=VBZ)
41 | self.assertTrue(self.is_file_hardlinked(output_path))
42 |
43 | def test_hardlink_subset(self):
44 | input_path = os.path.join(test_data, 'hardlink', 'unlinked', 'batch0.fast5')
45 | output_path = self.generate_temp_filename()
46 |
47 | self.assertFalse(self.is_file_hardlinked(input_path))
48 | extract_selected_reads(input_path, output_path, self.read_subset, count=len(self.read_subset))
49 | self.assertTrue(self.is_file_hardlinked(output_path))
50 |
51 | def test_hardlink_subset_single_reads(self):
52 | input_path = os.path.join(test_data, 'hardlink', 'single_reads')
53 | output_path = self.generate_temp_filename()
54 |
55 | for single_read_file in os.listdir(input_path):
56 | extract_selected_reads(os.path.join(input_path, single_read_file), output_path, self.read_subset, count=1)
57 | self.assertTrue(self.is_file_hardlinked(output_path))
58 |
59 | def test_hardlink_single_to_multi(self):
60 | input_folder = os.path.join(test_data, 'hardlink', 'single_reads')
61 | input_files = [os.path.join(input_folder, file) for file in os.listdir(input_folder)]
62 | output_path = self.generate_temp_filename()
63 |
64 | with MultiFast5File(output_path, 'a') as multi_f5:
65 | for input_file in input_files:
66 | with get_fast5_file(input_file, 'r') as f5_file:
67 | for read in f5_file.get_reads():
68 | multi_f5.add_existing_read(read)
69 |
70 | with MultiFast5File(output_path, 'r') as multi_f5:
71 | self.assertEqual(len(input_files), len(multi_f5.get_read_ids()))
72 | self.assertTrue(self.is_file_hardlinked(output_path))
73 |
74 | def is_file_hardlinked(self, input_path):
75 | file_hardlinked = True
76 | with MultiFast5File(input_path, 'r') as f5_file:
77 | for read in f5_file.get_reads():
78 | master_read_id = f5_file.run_id_map[read.get_run_id()]
79 | for group in HARDLINK_GROUPS:
80 | file_hardlinked &= self.is_read_hardlinked(f5_file, read.read_id, master_read_id, group)
81 | return file_hardlinked
82 |
83 | def is_read_hardlinked(self, f5_handle, read_id1, read_id2, group):
84 | if read_id1 == read_id2:
85 | return True
86 | group1 = f5_handle.get_read(read_id1).handle[group]
87 | group2 = f5_handle.get_read(read_id2).handle[group]
88 | return group1 == group2
89 |
--------------------------------------------------------------------------------
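Usage sketch (not part of the repository): the compression call checked by test_hardlink_multi_compression above; according to that test, the output file also hardlinks shared run metadata rather than copying it. Paths are placeholders.

from ont_fast5_api.compression_settings import VBZ
from ont_fast5_api.conversion_tools.compress_fast5 import compress_file

compress_file('/data/batch0.fast5', '/data/batch0.vbz.fast5', target_compression=VBZ)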
/test/test_multi_fast5.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import os
3 | import random
4 |
5 | from ont_fast5_api.fast5_file import Fast5File
6 | from ont_fast5_api.fast5_read import Fast5Read
7 | from ont_fast5_api.multi_fast5 import MultiFast5File
8 | from test.helpers import TestFast5ApiHelper
9 |
10 | hexdigits = "0123456789abcdef"
11 | run_id = "123abc"
12 |
13 |
14 | class TestMultiFast5(TestFast5ApiHelper):
15 |
16 | def create_multi_file(self, read_ids):
17 | filename = self.generate_temp_filename()
18 | # driver=None is the default, but adding this in here makes sure we
19 | # preserve the constructor argument.
20 | with MultiFast5File(filename, 'w', driver=None) as multi_f5:
21 | for read_id in read_ids:
22 | multi_f5.create_empty_read(read_id, run_id)
23 | return filename
24 |
25 | def test_read_interface(self):
26 | read_ids = generate_read_ids(6)
27 | f5_file = self.create_multi_file(read_ids)
28 |
29 | with MultiFast5File(f5_file, 'a') as multi_f5:
30 | # Check we have the read_ids we expect
31 | self.assertEqual(sorted(read_ids), sorted(multi_f5.get_read_ids()))
32 |
33 | # Try and add another read with the same read_id and expect error
34 | with self.assertRaises(ValueError):
35 | multi_f5.create_empty_read(read_ids[0], run_id)
36 |
37 | # Test we can get a read from the file and it has the interface we expect
38 | read_0 = multi_f5.get_read(read_ids[0])
39 | self.assertTrue(isinstance(read_0, Fast5Read))
40 |
41 | # Test we cannot get a read which doesn't exist
42 | with self.assertRaises(KeyError):
43 | multi_f5.get_read("0123")
44 |
45 | def test_raw_data(self):
46 | f5_file = self.create_multi_file(generate_read_ids(4))
47 | data = list(range(10))
48 | raw_attrs = {
49 | "duration": 1,
50 | "median_before": 2.5,
51 | "read_id": "abcd",
52 | "read_number": 8,
53 | "start_mux": 2,
54 | "start_time": 99
55 | }
56 | with MultiFast5File(f5_file, 'a') as multi_f5:
57 | read0 = multi_f5.get_read(multi_f5.get_read_ids()[0])
58 | read0.add_raw_data(data, attrs=raw_attrs)
59 | output_data = read0.get_raw_data()
60 | numpy.testing.assert_array_equal(output_data, data)
61 |
62 | def test_channel_info(self):
63 | f5_file = self.create_multi_file(generate_read_ids(4))
64 | channel_info = {
65 | "digitisation": 2048,
66 | "offset": -119.5,
67 | "range": 74.2,
68 | "sampling_rate": 4000,
69 | "channel_number": "72"
70 | }
71 | # Fast5File explicitly casts the channel number on reading
72 | expected_out = channel_info.copy()
73 | expected_out['channel_number'] = int(channel_info['channel_number'])
74 | with MultiFast5File(f5_file, 'a') as multi_f5:
75 | read0 = multi_f5.get_read(multi_f5.get_read_ids()[0])
76 | read0.add_channel_info(channel_info)
77 | output_data = read0.get_channel_info()
78 | self.assertEqual(output_data, expected_out)
79 |
80 | def test_tracking_id(self):
81 | f5_file = self.create_multi_file(generate_read_ids(4))
82 | tracking_id = {
83 | "asic_id_eeprom": "some string",
84 | "device_id": "some string",
85 | "exp_script_name": "some string",
86 | "exp_script_purpose": "some string",
87 | "exp_start_time": "some string",
88 | "flow_cell_id": "some string",
89 | "hostname": "some string",
90 | "protocol_run_id": "some string",
91 | "protocols_version": "some string",
92 | "run_id": "some string",
93 | "version": "some string",
94 | }
95 |
96 | with MultiFast5File(f5_file, 'a') as multi_f5:
97 | read0 = multi_f5.get_read(multi_f5.get_read_ids()[0])
98 | read0.add_tracking_id(tracking_id)
99 | output_data = read0.get_tracking_id()
100 | self.assertEqual(output_data, tracking_id)
101 |
102 | def test_add_analysis(self):
103 | f5_file = self.create_multi_file(generate_read_ids(4))
104 | group = "Test"
105 | component = "test_component"
106 | attrs = {"attribute": 1}
107 |
108 | # Fast5File.add_analysis includes the component name in the analysis attributes
109 | expected_attributes = attrs.copy()
110 | expected_attributes['component'] = component
111 | with MultiFast5File(f5_file, 'a') as multi_f5:
112 | read0 = multi_f5.get_read(multi_f5.get_read_ids()[0])
113 | self.assertEqual(read0.list_analyses(), [])
114 | read0.add_analysis(component, group, attrs)
115 | self.assertEqual(read0.list_analyses(), [(component, group)])
116 | self.assertEqual(read0.get_analysis_attributes(group), expected_attributes)
117 |
118 |
119 | def generate_read_ids(num_ids, id_len=8):
120 | return ["".join(random.choice(hexdigits) for _ in range(id_len)) for _ in range(num_ids)]
121 |
--------------------------------------------------------------------------------
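Usage sketch (not part of the repository): building a multi-read file from scratch, mirroring the create_empty_read/add_raw_data calls in the tests above. The read_id, run_id and attribute values are illustrative only.

from ont_fast5_api.multi_fast5 import MultiFast5File

raw_attrs = {"duration": 10, "median_before": 2.5, "read_id": "read_0001",
             "read_number": 1, "start_mux": 1, "start_time": 0}
with MultiFast5File('new_batch.fast5', 'w') as multi_f5:
    multi_f5.create_empty_read("read_0001", "run_0001")
    read = multi_f5.get_read("read_0001")
    read.add_raw_data(list(range(10)), attrs=raw_attrs)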
/test/test_segmentation_tools.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 |
4 | from ont_fast5_api.fast5_file import Fast5File
5 | from ont_fast5_api.analysis_tools.event_detection import EventDetectionTools
6 | from ont_fast5_api.analysis_tools.segmentation import SegmentationTools
7 | from test.helpers import TestFast5ApiHelper
8 |
9 |
10 | class TestSegmentationTools(TestFast5ApiHelper):
11 |
12 | def test_001_raw_only(self):
13 | fname = self.generate_temp_filename()
14 | with Fast5File(fname, mode='w') as fh:
15 | fh.add_channel_info({'channel_number': 1,
16 | 'sampling_rate': 4000,
17 | 'digitisation': 8192,
18 | 'range': 819.2,
19 | 'offset': 0})
20 | fh.add_read(12, 'unique_snowflake', 12345, 1000, 0, 120.75)
21 | raw = np.empty(1000, dtype=np.int16)
22 | raw[:] = range(1000)
23 | fh.add_raw_data(raw)
24 | attrs = {'name': 'test', 'version': 0, 'time_stamp': 'just now'}
25 | fh.add_analysis('segmentation', 'Segmentation_000', attrs)
26 | segment_data = {'has_template': 1,
27 | 'has_complement': 1,
28 | 'first_sample_template': 10,
29 | 'duration_template': 470,
30 | 'first_sample_complement': 520,
31 | 'duration_complement': 460}
32 | fh.set_summary_data('Segmentation_000', 'segmentation', segment_data)
33 | with SegmentationTools(fh, group_name='Segmentation_000') as segment:
34 | results = segment.get_results()
35 | self.assertDictEqual({'has_template': True,
36 | 'has_complement': True,
37 | 'first_sample_template': 10,
38 | 'duration_template': 470,
39 | 'first_sample_complement': 520,
40 | 'duration_complement': 460}, results)
41 | temp_raw = segment.get_raw_data('template', scale=False)
42 | np.testing.assert_array_equal(temp_raw, raw[10:480])
43 | comp_raw = segment.get_raw_data('complement', scale=False)
44 | np.testing.assert_array_equal(comp_raw, raw[520:980])
45 | temp_raw, comp_raw = segment.get_raw_data('both', scale=False)
46 | np.testing.assert_array_equal(temp_raw, raw[10:480])
47 | np.testing.assert_array_equal(comp_raw, raw[520:980])
48 | temp_raw, comp_raw = segment.get_raw_data('both', scale=True)
49 | scaled_temp = raw[10:480] * 0.1
50 | scaled_comp = raw[520:980] * 0.1
51 | np.testing.assert_array_almost_equal(temp_raw, scaled_temp, decimal=5)
52 | np.testing.assert_array_almost_equal(comp_raw, scaled_comp, decimal=5)
53 |
54 | def test_002_events_only(self):
55 | fname = self.generate_temp_filename()
56 | with Fast5File(fname, mode='w') as fh:
57 | fh.add_channel_info({'channel_number': 1,
58 | 'sampling_rate': 4000,
59 | 'digitisation': 8192,
60 | 'range': 819.2,
61 | 'offset': 0})
62 | fh.add_read(12, 'unique_snowflake', 10000, 1000, 0, 120.75)
63 | with EventDetectionTools(fh, group_name='EventDetection_000', meta={'name': 'test'}) as evdet:
64 | data = np.zeros(100, dtype=[('start', int), ('length', int), ('mean', float), ('stdv', float)])
65 | data['start'][2] = 10010
66 | data['start'][46] = 10470
67 | data['length'][46] = 10
68 | data['start'][53] = 10520
69 | data['start'][97] = 10960
70 | data['length'][97] = 20
71 | read_attrs = {'read_number': 12}
72 | evdet.set_event_data(data, read_attrs)
73 | attrs = {'name': 'test', 'version': 0, 'time_stamp': 'just now',
74 | 'event_detection': 'Analyses/EventDetection_000'}
75 | fh.add_analysis('segmentation', 'Segmentation_000', attrs)
76 | segment_data = {'has_template': 1,
77 | 'has_complement': 1,
78 | 'start_event_template': 2,
79 | 'end_event_template': 47,
80 | 'start_event_complement': 53,
81 | 'end_event_complement': 98}
82 | fh.set_summary_data('Segmentation_000', 'segmentation', segment_data)
83 | with SegmentationTools(fh, group_name='Segmentation_000') as segment:
84 | results = segment.get_results()
85 | self.assertDictEqual({'has_template': True,
86 | 'has_complement': True,
87 | 'start_event_template': 2,
88 | 'end_event_template': 47,
89 | 'start_event_complement': 53,
90 | 'end_event_complement': 98,
91 | 'first_sample_template': 10,
92 | 'duration_template': 470,
93 | 'first_sample_complement': 520,
94 | 'duration_complement': 460}, results)
95 |
--------------------------------------------------------------------------------
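Usage sketch (not part of the repository): a read-only version of the SegmentationTools calls used above, assuming a file that already contains a Segmentation_000 group. The filename is a placeholder.

from ont_fast5_api.fast5_file import Fast5File
from ont_fast5_api.analysis_tools.segmentation import SegmentationTools

with Fast5File('segmented_read.fast5', mode='r') as fh:
    with SegmentationTools(fh, group_name='Segmentation_000') as segment:
        summary = segment.get_results()                              # section boundaries
        template_raw = segment.get_raw_data('template', scale=True)  # scaled signal rather than raw ADC values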