├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── bulkvis
│   ├── __init__.py
│   ├── _version.py
│   ├── bulkvis.py
│   ├── bulkvis_server
│   │   ├── main.py
│   │   └── templates
│   │       ├── index.html
│   │       └── styles.css
│   ├── cite.py
│   ├── core.py
│   ├── fuse.py
│   ├── mappings.py
│   ├── merge.py
│   └── serve.py
├── docs
│   ├── Makefile
│   ├── _static
│   │   ├── css
│   │   │   └── custom.css
│   │   ├── icons
│   │   │   ├── save.png
│   │   │   ├── xpan.png
│   │   │   └── zoom.png
│   │   └── images
│   │       ├── bulk_file
│   │       │   ├── 01_pop_up.png
│   │       │   ├── 02_read_config.png
│   │       │   └── 03_bulk_config.png
│   │       ├── quickstart
│   │       │   ├── 01_initial.png
│   │       │   ├── 02_position.png
│   │       │   ├── 03_plot.png
│   │       │   ├── 04_sidebar.png
│   │       │   ├── 05_annotations.png
│   │       │   ├── 06_adjustments.png
│   │       │   ├── 07_plot.png
│   │       │   └── 08_read_file.png
│   │       └── utilities
│   │           └── 01_plot.png
│   ├── collecting_a_bulk_file.rst
│   ├── conf.py
│   ├── index.rst
│   ├── installation.rst
│   ├── make.bat
│   ├── quickstart.rst
│   └── utilities.rst
├── env.yml
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | .DS_STORE 141 | *.fast5 142 | .idea/ 143 | *.ini 144 | docs/_build 145 | docs/_template 146 | data/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 LooseLab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include bulkvis/ *.py 2 | recursive-include bulkvis/ *.html 3 | recursive-include bulkvis/ *.css 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ⇜ bulkvis ⇝ 2 | ============ 3 | 4 | An app written in Python3 using [Bokeh][1] to visualise raw squiggle data from Oxford Nanopore Technologies (ONT) bulkfiles. 5 | 6 | Quickstart 7 | ========== 8 | 9 | Our preferred installation method uses `conda` with this environment setup: 10 | ```yaml 11 | name: bulkvis 12 | channels: 13 | - bioconda 14 | - conda-forge 15 | - defaults 16 | dependencies: 17 | - python=3.11 18 | - pip 19 | - pip: 20 | - numpy==1.26.4 21 | - git+https://github.com/LooseLab/bulkvis.git@2.0 22 | ``` 23 | 24 | Either copy the YAML above into a file or: 25 | 26 | ```console 27 | curl -O https://raw.githubusercontent.com/LooseLab/bulkvis/2.0/env.yml 28 | conda env create -f env.yml 29 | ``` 30 | 31 | Then bulkvis can be started using: 32 | ```console 33 | conda activate bulkvis 34 | bulkvis serve --show 35 | ``` 36 | 37 |
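To check that the install worked without launching the server, the top-level CLI can be queried for its version (the string comes from `bulkvis/_version.py`):

```console
bulkvis --version
bulkvis 2.0.1
```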
38 | Or, using a plain Python virtual environment:
39 |
40 | ```bash
41 | # Make a python3 virtual environment
42 | python3 -m venv bulkvis
43 |
44 | # Activate the virtual environment
45 | source bulkvis/bin/activate
46 |
47 | # Install directly from the GitHub repository
48 | pip install git+https://github.com/LooseLab/bulkvis.git@2.0
49 |
50 | # Start the bokeh server
51 | bulkvis serve --show
52 | ```
53 |
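The server reads data directories and plot defaults from a config file (`*.ini` files are listed in `.gitignore`, so each install keeps its own copy). Below is a minimal sketch assembled from the option names referenced in `bulkvis/bulkvis_server/main.py` (`cfg_dr`, `cfg_po`, and `cfg_lo`); the section names, paths, and values shown here are illustrative assumptions, not shipped defaults:

```ini
; Hypothetical config.ini; section names are assumptions
[data]
dir = /path/to/bulk_fast5/   ; directory scanned for bulk .fast5 files
map = /path/to/mappings/     ; directory scanned for .bmf mapping files
out = /path/to/read_files/   ; where exported read files are written

[plot_opts]
wdg_width = 300              ; sidebar widget width (px)
plot_width = 980             ; plot width (px)
plot_height = 800            ; plot height (px)
label_height = 720           ; y-axis position where annotations are drawn
y_min = 0                    ; fixed y-axis minimum
y_max = 2200                 ; fixed y-axis maximum
lower_cut_off = 0            ; discard signal points below this value
upper_cut_off = 2200         ; discard signal points above this value
output_backend = canvas     ; bokeh rendering backend

[labels]
; one entry per annotation class; True = ticked by default
pore = True
strand = True
```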
54 |
55 | Other installation requirements
56 | ===
57 |
58 | To open some bulk FAST5 files, the [`vbz compression plugins`][2] are required.
59 | These are written and maintained by Oxford Nanopore Technologies.
60 |
61 |
62 | [1]: https://github.com/bokeh/bokeh/
63 | [2]: https://github.com/nanoporetech/vbz_compression
64 |
--------------------------------------------------------------------------------
/bulkvis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/bulkvis/__init__.py
--------------------------------------------------------------------------------
/bulkvis/_version.py:
--------------------------------------------------------------------------------
1 | __version__ = "2.0.1"
2 |
--------------------------------------------------------------------------------
/bulkvis/bulkvis.py:
--------------------------------------------------------------------------------
1 | """bulkvis.py
2 |
3 | This is the main entry point for the bulkvis CLI
4 | """
5 | import argparse
6 | import importlib
7 |
8 | from ._version import __version__
9 |
10 |
11 | def run_command(parser, args):
12 |     try:
13 |         command = importlib.import_module(f"bulkvis.{args.command}")
14 |     except ImportError:
15 |         parser.exit(2, f"Could not use subcommand: {args.command!r}")
16 |
17 |     command.run(parser, args)
18 |
19 |
20 | def main():
21 |     parser = argparse.ArgumentParser(
22 |         prog="bulkvis",
23 |         epilog="See '<command> --help' to read about a specific sub-command.",
24 |     )
25 |     version = f"bulkvis {__version__}"
26 |     parser.add_argument("--version", action="version", version=version)
27 |     subparsers = parser.add_subparsers(dest="command", help="Sub-commands")
28 |
29 |     for module in ["fuse", "merge", "serve", "mappings", "cite"]:
30 |         _module = importlib.import_module(f"bulkvis.{module}")
31 |         _parser = subparsers.add_parser(
32 |             module, description=_module._help, help=_module._help
33 |         )
34 |         for *flags, opts in _module._cli:
35 |             _parser.add_argument(*flags, **opts)
36 |         _parser.set_defaults(func=run_command)
37 |
38 |     args = parser.parse_args()
39 |     if args.command is not None:
40 |         args.func(parser, args)
41 |     else:
42 |         parser.print_help()
43 |
44 |
45 | if __name__ == "__main__":
46 |     main()
47 |
48 | # TODO: Changelog and deprecations
49 | # TODO: github workflows
50 | # TODO: Make sure CLIs match run
51 |
--------------------------------------------------------------------------------
/bulkvis/bulkvis_server/main.py:
--------------------------------------------------------------------------------
1 | import configparser
2 | from dateutil import parser
3 | import math
4 | from pathlib import Path
5 | import re
6 | import io
7 | import argparse
8 | import logging
9 | from collections import OrderedDict
10 |
11 | import h5py
12 | import numpy as np
13 | import pandas as pd
14 | from bokeh.layouts import row, column
15 | from bokeh.models import (
16 |     TextInput,
17 |     Toggle,
18 |     Div,
19 |     Range1d,
20 |     Label,
21 |     Span,
22 |     Title,
23 |     LabelSet,
24 |     RadioButtonGroup,
25 | )
26 | from bokeh.models import (
27 |     CheckboxGroup,
28 |     Dropdown,
29 |     PreText,
30 |     Select,
31 |     Button,
32 |     ColumnDataSource,
33 | )
34 | from bokeh.plotting import curdoc, figure
35 |
36 |
37 | def export_read_file(channel, start_index, end_index, bulkfile, output_dir):
38 |     """
39 |     Export a read file generated from index coordinates and a channel number.
40 |     :param channel: int, channel number
41 |     :param start_index: int, start index
for read 42 | :param end_index: int, end index for read 43 | :param bulkfile: bulkfile object 44 | :param output_dir: str, output directory, including trailing slash 45 | :return: 0 for success 46 | """ 47 | out_filename = Path(bulkfile.filename).stem 48 | # out_filename = ( 49 | # bulkfile["UniqueGlobalKey"]["context_tags"].attrs["filename"].decode("utf8") 50 | # ) 51 | 52 | output_arg = "{dir}/{fn}_bulkvis-read_{start}-{end}_ch_{ch}.fast5".format( 53 | dir=output_dir, 54 | fn=out_filename, 55 | start=start_index, 56 | end=end_index, 57 | ch=channel, 58 | ) 59 | 60 | LOGGER.info(f"Exporting to {output_arg}") 61 | 62 | readfile = h5py.File(output_arg, "w") 63 | read_id_str = "{ch}-{start}-{end}".format( 64 | ch=channel, start=start_index, end=end_index 65 | ) 66 | version_num = 0.6 67 | 68 | ch_num = channel 69 | ch_str = "Channel_{ch}".format(ch=ch_num) 70 | 71 | ugk = readfile.create_group("UniqueGlobalKey") 72 | 73 | bulkfile.copy("UniqueGlobalKey/context_tags", ugk) 74 | bulkfile.copy("UniqueGlobalKey/tracking_id", ugk) 75 | bulkfile.copy("IntermediateData/{ch}/Meta".format(ch=ch_str), ugk) 76 | 77 | readfile["UniqueGlobalKey"]["channel_id"] = readfile["UniqueGlobalKey"]["Meta"] 78 | readfile["UniqueGlobalKey"]["channel_id"].attrs.create( 79 | "sampling_rate", 80 | readfile["UniqueGlobalKey"]["Meta"].attrs["sample_rate"], 81 | None, 82 | dtype="Float64", 83 | ) 84 | del readfile["UniqueGlobalKey"]["Meta"] 85 | 86 | readfile["UniqueGlobalKey"]["channel_id"].attrs.create( 87 | "channel_number", ch_num, None, dtype=" start_index).dropna() 109 | read_number = 0 110 | attrs = { 111 | "duration": {"val": end_index - start_index, "d": "uint32"}, 112 | "median_before": {"val": df.iloc[0].median_before, "d": "Float64"}, 113 | "read_id": {"val": read_id_str, "d": "channel:start-end or a complete FASTQ header.""", 266 | css_classes=["position-drop"], 267 | ) 268 | app_data["wdg_dict"]["position"] = TextInput( 269 | value="", 270 | placeholder="e.g 391:120-150 or complete FASTQ header", 271 | css_classes=["position-label"], 272 | ) 273 | read_bmf(app_data["app_vars"]["Run ID"]) 274 | app_data["wdg_dict"]["position"].on_change("value", parse_position) 275 | 276 | layout.children[0] = column( 277 | list(app_data["wdg_dict"].values()), width=int(cfg_po["wdg_width"]) 278 | ) 279 | 280 | 281 | def read_bmf(run_id): 282 | run_id = run_id + ".bmf" 283 | try: 284 | app_data["bmf"] = pd.read_csv(Path(Path(cfg_dr["map"]) / run_id), sep="\t") 285 | # filter mappings to just this run 286 | app_data["bmf"] = app_data["bmf"][ 287 | app_data["bmf"]["run_id"] == app_data["app_vars"]["Run ID"] 288 | ] 289 | except FileNotFoundError: 290 | pass 291 | except Exception as e: 292 | print(e) 293 | return 294 | 295 | 296 | def open_bulkfile(path): 297 | # !!! 
add in check to see if this is a ONT bulkfile 298 | # Open bulkfile in read-only mode 299 | open_file = h5py.File(path, "r") 300 | # Get sample frequency, how many data points are collected each second 301 | sf = int( 302 | open_file["UniqueGlobalKey"]["context_tags"] 303 | .attrs["sample_frequency"] 304 | .decode("utf8") 305 | ) 306 | attributes = OrderedDict( 307 | [ 308 | ( 309 | "tracking_id", 310 | [ 311 | ("Experiment", "sample_id"), 312 | ("Flowcell ID", "flow_cell_id"), 313 | ("MinKNOW version", "version"), 314 | ("Protocols version", "protocols_version"), 315 | ("MinION ID", "device_id"), 316 | ("Hostname", "hostname"), 317 | ("Run ID", "run_id"), 318 | ("ASIC ID", "asic_id"), 319 | ("Experiment start", "exp_start_time"), 320 | ], 321 | ), 322 | ( 323 | "context_tags", 324 | [ 325 | ("Sequencing kit", "sequencing_kit"), 326 | ("Flowcell type", "flowcell_type"), 327 | ], 328 | ), 329 | ] 330 | ) 331 | 332 | for k, v in attributes.items(): 333 | for attribute in v: 334 | try: 335 | app_data["app_vars"][attribute[0]] = ( 336 | open_file["UniqueGlobalKey"][k].attrs[attribute[1]].decode("utf8") 337 | ) 338 | if attribute[1] == "exp_start_time": 339 | app_data["app_vars"][attribute[0]] = parser.parse( 340 | app_data["app_vars"][attribute[0]] 341 | ).strftime("%d-%b-%Y %H:%M:%S") 342 | except KeyError: 343 | app_data["app_vars"][attribute[0]] = "N/A" 344 | return open_file, sf, attributes 345 | 346 | 347 | # noinspection PyUnboundLocalVariable 348 | def parse_position(attr, old, new): 349 | if re.match(r"^(\@[a-f0-9\-]{36})([a-z0-9=\s]{1,})ch=[0-9]{1,4}", new): 350 | # https://regex101.com/r/9VvgNM/4 351 | # Match UUID / read_id as fastq str 352 | # ^(\@[a-f0-9\-]{36}) 353 | # Match lowercase a-z, 0-9, '=' and whitespace 354 | # ([a-z0-9=\s]{1,}) 355 | # Match 'ch=' and up to 4 numbers 356 | # ch=[0-9]{1,4} 357 | # if new[0] == "@": 358 | input_error(app_data["wdg_dict"]["position"], "remove") 359 | fq = new[1:] 360 | fq_list = fq.split(" ") 361 | # split out read_id and channel 362 | for k, item in enumerate(fq_list): 363 | if k == 0: 364 | read_id = item 365 | if item.split("=")[0] == "ch": 366 | channel_num = item.split("=")[1] 367 | channel_str = "Channel_{num}".format(num=channel_num) 368 | # Get ch_str, start, end 369 | # If read_id and ch not set... 
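    # Example with hypothetical values: pasting a full FASTQ header such as
    #   @2f1e3b7a-0c4d-4f6e-9a1b-5d8c7e6f0a2b runid=abc123 read=802 ch=391 start_time=...
    # strips the leading '@', takes the first whitespace-separated field as
    # read_id and the 'ch=' field as channel_num, then looks the read up in
    # IntermediateData/Channel_391/Reads to convert the header into
    # channel:start-end coordinates.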
370 | # noinspection PyUnboundLocalVariable 371 | if read_id and channel_str: 372 | int_data_path = app_data["bulkfile"]["IntermediateData"][channel_str][ 373 | "Reads" 374 | ] 375 | int_data_labels = { 376 | "read_id": int_data_path["read_id"], 377 | "read_start": int_data_path["read_start"], 378 | } 379 | df = pd.DataFrame(data=int_data_labels) 380 | df.read_start = df.read_start / app_data["app_vars"]["sf"] 381 | df.read_id = df.read_id.str.decode("utf8") 382 | df = df.where(df.read_id == read_id) 383 | df = df.dropna() 384 | if len(df) > 2: 385 | start_time = math.floor(df.iloc[0, :].read_start) 386 | end_time = math.ceil(df.iloc[-1, :].read_start) 387 | else: 388 | input_error(app_data["wdg_dict"]["position"], "add") 389 | return 390 | else: 391 | input_error(app_data["wdg_dict"]["position"], "add") 392 | return 393 | app_data["wdg_dict"]["position"].value = "{ch}:{start}-{end}".format( 394 | ch=channel_num, start=start_time, end=end_time 395 | ) 396 | elif re.match(r"^([0-9]{1,4}:[0-9]{1,9}-[0-9]{1,9})\Z", new): 397 | # https://regex101.com/r/zkN1j2/2 398 | input_error(app_data["wdg_dict"]["position"], "remove") 399 | coords = new.split(":") 400 | times = coords[1].split("-") 401 | channel_num = coords[0] 402 | channel_str = "Channel_{num}".format(num=channel_num) 403 | (start_time, end_time) = int(times[0]), int(times[1]) 404 | if end_time - start_time <= 0: 405 | input_error(app_data["wdg_dict"]["position"], "add") 406 | return 407 | else: 408 | input_error(app_data["wdg_dict"]["position"], "add") 409 | return 410 | 411 | if int(end_time) > app_data["app_vars"]["len_ds"]: 412 | end_time = app_data["app_vars"]["len_ds"] 413 | app_data["app_vars"]["channel_str"] = channel_str 414 | app_data["app_vars"]["channel_num"] = int(channel_num) 415 | app_data["app_vars"]["start_time"] = int(start_time) 416 | app_data["app_vars"]["end_time"] = int(end_time) 417 | 418 | app_data["wdg_dict"]["position"].value = "{ch}:{start}-{end}".format( 419 | ch=app_data["app_vars"]["channel_num"], 420 | start=app_data["app_vars"]["start_time"], 421 | end=app_data["app_vars"]["end_time"], 422 | ) 423 | 424 | update() 425 | 426 | 427 | def update_data(bulkfile, app_vars): 428 | app_vars["duration"] = app_vars["end_time"] - app_vars["start_time"] 429 | # get times and squiggles 430 | app_vars["start_squiggle"] = math.floor(app_vars["start_time"] * app_vars["sf"]) 431 | app_vars["end_squiggle"] = math.floor(app_vars["end_time"] * app_vars["sf"]) 432 | # get data in numpy arrays 433 | step = 1 / app_vars["sf"] 434 | app_data["x_data"] = np.arange(app_vars["start_time"], app_vars["end_time"], step) 435 | app_data["y_data"] = bulkfile["Raw"][app_vars["channel_str"]]["Signal"][()] 436 | app_vars["len_ds"] = len(app_data["y_data"]) / app_vars["sf"] 437 | app_data["y_data"] = app_data["y_data"][ 438 | app_vars["start_squiggle"] : app_vars["end_squiggle"] 439 | ] 440 | # get annotations 441 | path = bulkfile["IntermediateData"][app_vars["channel_str"]]["Reads"] 442 | fields = ["read_id", "read_start", "modal_classification"] 443 | app_data["label_df"], app_data["label_dt"] = get_annotations( 444 | path, fields, "modal_classification" 445 | ) 446 | app_data["label_df"] = app_data["label_df"].drop_duplicates( 447 | subset=["read_id", "modal_classification"], keep="first" 448 | ) 449 | app_data["label_df"].read_start = app_data["label_df"].read_start / app_vars["sf"] 450 | app_data["label_df"].read_id = app_data["label_df"].read_id.str.decode("utf8") 451 | 452 | path = 
bulkfile["StateData"][app_vars["channel_str"]]["States"] 453 | fields = ["acquisition_raw_index", "summary_state"] 454 | state_label_df, state_label_dtypes = get_annotations(path, fields, "summary_state") 455 | state_label_df.acquisition_raw_index = ( 456 | state_label_df.acquisition_raw_index / app_vars["sf"] 457 | ) 458 | state_label_df = state_label_df.rename( 459 | columns={ 460 | "acquisition_raw_index": "read_start", 461 | "summary_state": "modal_classification", 462 | } 463 | ) 464 | app_data["label_df"] = app_data["label_df"].append( 465 | state_label_df, ignore_index=True 466 | ) 467 | app_data["label_df"].sort_values(by="read_start", ascending=True, inplace=True) 468 | app_data["label_dt"].update(state_label_dtypes) 469 | 470 | 471 | def get_annotations(path, fields, enum_field): 472 | data_labels = {} 473 | for field in fields: 474 | data_labels[field] = path[field] 475 | data_dtypes = {} 476 | if h5py.check_dtype(enum=path.dtype[enum_field]): 477 | dataset_dtype = h5py.check_dtype(enum=path.dtype[enum_field]) 478 | # data_dtype may lose some dataset dtypes there are duplicates of 'v' 479 | data_dtypes = {v: k for k, v in dataset_dtype.items()} 480 | labels_df = pd.DataFrame(data=data_labels) 481 | return labels_df, data_dtypes 482 | 483 | 484 | def build_widgets(): 485 | """""" 486 | check_labels = [] 487 | jump_list = [] 488 | check_active = [] 489 | app_data["label_mp"] = {} 490 | for k, v in enumerate(app_data["label_dt"].items()): 491 | app_data["label_mp"][v[0]] = k 492 | check_labels.append(v[1]) 493 | if v[1] in cfg_lo: 494 | if cfg_lo[v[1]] == "True": 495 | check_active.append(k) 496 | jump_list.append((v[1], str(v[0]))) 497 | # else: 498 | # # print("label {v} is in your bulk-file but not defined in config.ini".format(v=v[1])) 499 | # check_active.append(k) 500 | 501 | if len(check_active) == len(check_labels): 502 | filter_toggle_active = 0 503 | elif len(check_active) == 0: 504 | filter_toggle_active = 1 505 | else: 506 | filter_toggle_active = None 507 | 508 | wdg = app_data["wdg_dict"] 509 | wdg["duration"] = PreText( 510 | text="Duration: {d} seconds".format(d=app_data["app_vars"]["duration"]), 511 | css_classes=["duration_pre"], 512 | ) 513 | # wdg['navigation_label'] = Div(text='Navigation:', css_classes=['navigation-dropdown', 'help-text']) 514 | # wdg['navigation_text'] = Div( 515 | # text="""Use the Jump to ... buttons to find the next or previous event type. 516 | # """, 517 | # css_classes=['navigation-drop'] 518 | # ) 519 | wdg["jump_next"] = Dropdown( 520 | label="Jump to next", 521 | button_type="primary", 522 | menu=jump_list, 523 | css_classes=["jump-block"], 524 | ) 525 | wdg["jump_prev"] = Dropdown( 526 | label="Jump to previous", button_type="primary", menu=jump_list 527 | ) 528 | 529 | wdg["export_label"] = Div( 530 | text="Export data:", css_classes=["export-dropdown", "help-text"] 531 | ) 532 | wdg["export_text"] = Div( 533 | text="""Export data, as a read file, from the current position. These are written to the output directory 534 | specified in your config file. 
535 | """, 536 | css_classes=["export-drop"], 537 | ) 538 | wdg["save_read_file"] = Button( 539 | label="Save read file", button_type="success", css_classes=[] 540 | ) 541 | # wdg['bulkfile_info'] = Div(text='Bulkfile info', css_classes=['bulkfile-dropdown', 'caret-down']) 542 | # wdg['bulkfile_help'] = Div(text='Bulkfile info help:', css_classes=['bulkfile-help-dropdown', 'help-text', 'bulkfile-drop']) 543 | # wdg['bulkfile_help_text'] = Div( 544 | # text="""This contains basic information about the experiment that is recorded in the bulk-fast5-file. 545 | # """, 546 | # css_classes=['bulkfile-help-drop'] 547 | # ) 548 | wdg["bulkfile_text"] = Div(text="", css_classes=["bulkfile-drop"]) 549 | for k, v in app_data["app_vars"]["attributes"].items(): 550 | for entry in v: 551 | wdg[ 552 | "bulkfile_text" 553 | ].text += "{f}:
<b> {val}</b><br>
".format( 554 | f=entry[0], val=app_data["app_vars"][entry[0]] 555 | ) 556 | # wdg['label_options'] = Div(text='Select annotations', css_classes=['filter-dropdown', 'caret-down']) 557 | # wdg['filter_help'] = Div(text='filter help:', css_classes=['filter-help-dropdown', 'help-text', 'filter-drop']) 558 | # wdg['filter_help_text'] = Div( 559 | # text="""Select which bulkfile annotations should be rendered on the chart. 'Display annotations' will turn all 560 | # annotations on or off. 561 | # """, 562 | # css_classes=['filter-help-drop'] 563 | # ) 564 | wdg["toggle_annotations"] = Toggle( 565 | label="Display annotations", 566 | button_type="danger", 567 | css_classes=["toggle_button_g_r", "filter-drop"], 568 | active=True, 569 | ) 570 | wdg["toggle_mappings"] = Toggle( 571 | label="Display mappings", 572 | button_type="danger", 573 | css_classes=["toggle_button_g_r", "filter-drop"], 574 | active=True, 575 | ) 576 | wdg["filter_toggle_group"] = RadioButtonGroup( 577 | labels=["Select all", "Select none"], 578 | active=filter_toggle_active, 579 | css_classes=["filter-drop"], 580 | ) 581 | wdg["label_filter"] = CheckboxGroup( 582 | labels=check_labels, active=check_active, css_classes=["filter-drop"] 583 | ) 584 | 585 | # wdg['plot_options'] = Div(text='Plot adjustments', css_classes=['adjust-dropdown', 'caret-down']) 586 | # wdg['adjust_help'] = Div(text='adjust help:', css_classes=['adjust-help-dropdown', 'help-text', 'adjust-drop']) 587 | # wdg['adjust_help_text'] = Div( 588 | # text="""Adjust chart parameters, such as width, height and where annotations are rendered. These are set in the 589 | # config.ini, where the default values can be edited. 590 | # """, 591 | # css_classes=['adjust-help-drop'] 592 | # ) 593 | wdg["po_width"] = TextInput( 594 | title="Plot Width (px)", value=cfg_po["plot_width"], css_classes=["adjust-drop"] 595 | ) 596 | wdg["po_height"] = TextInput( 597 | title="Plot Height (px)", 598 | value=cfg_po["plot_height"], 599 | css_classes=["adjust-drop"], 600 | ) 601 | wdg["label_height"] = TextInput( 602 | title="Annotation height (y-axis)", 603 | value=cfg_po["label_height"], 604 | css_classes=["adjust-drop"], 605 | ) 606 | wdg["po_y_max"] = TextInput( 607 | title="y max", 608 | value=cfg_po["y_max"], 609 | css_classes=["adjust-drop", "toggle_y_target"], 610 | ) 611 | wdg["po_y_min"] = TextInput( 612 | title="y min", 613 | value=cfg_po["y_min"], 614 | css_classes=["adjust-drop", "toggle_y_target"], 615 | ) 616 | wdg["toggle_y_axis"] = Toggle( 617 | label="Fixed Y-axis", 618 | button_type="danger", 619 | css_classes=["toggle_button_g_r", "adjust-drop", "toggle_y_axis"], 620 | active=False, 621 | ) 622 | wdg["toggle_smoothing"] = Toggle( 623 | label="Smoothing", 624 | button_type="danger", 625 | css_classes=["toggle_button_g_r", "adjust-drop"], 626 | active=True, 627 | ) 628 | 629 | wdg["label_filter"].on_change("active", update_checkboxes) 630 | wdg["filter_toggle_group"].on_change("active", update_toggle) 631 | wdg["jump_next"].on_click(next_update) 632 | wdg["jump_prev"].on_click(prev_update) 633 | wdg["save_read_file"].on_click(export_data) 634 | 635 | for name in toggle_inputs: 636 | wdg[name].on_click(toggle_button) 637 | for name in int_inputs: 638 | wdg[name].on_change("value", is_input_int) 639 | return wdg 640 | 641 | 642 | def create_figure(x_data, y_data, wdg, app_vars): 643 | def vline(x_coords, y_upper, y_lower): 644 | # Return a dataset that can plot vertical lines 645 | x_values = np.vstack((x_coords, x_coords)).T 646 | y_upper_list = np.full((1, 
len(x_values)), y_upper) 647 | y_lower_list = np.full((1, len(x_values)), y_lower) 648 | y_values = np.vstack((y_lower_list, y_upper_list)).T 649 | return x_values.tolist(), y_values.tolist() 650 | 651 | def hlines(y_coords, x_lower, x_upper): 652 | """ 653 | 654 | Parameters 655 | ---------- 656 | y_coords: (int, float) height to plot lines at 657 | x_lower: (int, float) lower x coord 658 | x_upper: (int, float) upper x coord 659 | 660 | Returns 661 | ------- 662 | 663 | """ 664 | x_values = np.vstack((x_lower, x_upper)).T 665 | y_values_list = np.full((1, len(x_values)), y_coords) 666 | y_values = np.vstack((y_values_list, y_values_list)).T 667 | return x_values.tolist(), y_values.tolist() 668 | 669 | if wdg["toggle_smoothing"].active: 670 | w_range = app_vars["duration"] 671 | divisor = math.e ** 2.5 672 | thin_factor = math.ceil(w_range / divisor) 673 | else: 674 | thin_factor = 1 675 | if thin_factor == 0: 676 | thin_factor = 1 677 | 678 | greater_delete_index = np.argwhere(y_data > int(cfg_po["upper_cut_off"])) 679 | x_data = np.delete(x_data, greater_delete_index) 680 | y_data = np.delete(y_data, greater_delete_index) 681 | 682 | lesser_delete_index = np.argwhere(y_data < int(cfg_po["lower_cut_off"])) 683 | x_data = np.delete(x_data, lesser_delete_index) 684 | y_data = np.delete(y_data, lesser_delete_index) 685 | 686 | x_data = x_data[::thin_factor] 687 | y_data = y_data[::thin_factor] 688 | 689 | data = { 690 | "x": x_data, 691 | "y": y_data, 692 | } 693 | 694 | source = ColumnDataSource(data=data) 695 | 696 | p = figure( 697 | plot_height=int(wdg["po_height"].value), 698 | plot_width=int(wdg["po_width"].value), 699 | toolbar_location="right", 700 | tools=["xbox_zoom", "xpan", "undo", "reset", "save"], 701 | active_drag="xbox_zoom", 702 | ) 703 | if cfg_po["output_backend"] not in output_backend: 704 | p.output_backend = "canvas" 705 | else: 706 | p.output_backend = cfg_po["output_backend"] 707 | # Add step/% points plotted: Step: {sp} ({pt:.3f}) -> sp=thin_factor, pt=1/thin_factor 708 | p.add_layout( 709 | Title( 710 | text="Channel: {ch} Start: {st} End: {ed} Sample rate: {sf}".format( 711 | ch=app_vars["channel_num"], 712 | st=app_vars["start_time"], 713 | ed=app_vars["end_time"], 714 | sf=app_vars["sf"], 715 | ) 716 | ), 717 | "above", 718 | ) 719 | p.add_layout( 720 | Title( 721 | text="bulk FAST5 file: {s}".format( 722 | s=app_data["wdg_dict"]["file_list"].value 723 | ) 724 | ), 725 | "above", 726 | ) 727 | 728 | p.toolbar.logo = None 729 | p.yaxis.axis_label = "Raw signal" 730 | p.yaxis.major_label_orientation = "horizontal" 731 | p.xaxis.axis_label = "Time (seconds)" 732 | p.line(source=source, x="x", y="y", line_width=1) 733 | p.xaxis.major_label_orientation = math.radians(45) 734 | p.x_range.range_padding = 0.01 735 | 736 | # set padding manually 737 | y_min = np.amin(data["y"]) 738 | y_max = np.amax(data["y"]) 739 | pad = (y_max - y_min) * 0.1 / 2 740 | p.y_range = Range1d(y_min - pad, y_max + pad) 741 | try: 742 | app_data["bmf"] 743 | except NameError: 744 | bmf_set = False 745 | except KeyError: 746 | bmf_set = False 747 | else: 748 | bmf_set = True 749 | if bmf_set and wdg["toggle_mappings"].active: 750 | LOGGER.info("Plotting mappings") 751 | # set padding manually 752 | # lower_pad = (y_max - y_min) * 0.1 / 2 753 | # upper_pad = (y_max - y_min) / 2 754 | # p.y_range = Range1d(y_min - lower_pad, y_max + upper_pad) 755 | # set mapping track midpoints 756 | # upper_mapping = upper_pad / 4 * 3 + y_max 757 | # lower_mapping = upper_pad / 4 + y_max 758 | lower_mapping = 
int(wdg["label_height"].value) + 750 759 | # Select only this channel 760 | slim_bmf = app_data["bmf"][ 761 | app_data["bmf"]["channel"] == app_vars["channel_num"] 762 | ] 763 | # Select the current viewed range 764 | slim_bmf = slim_bmf[ 765 | ( 766 | (slim_bmf["start_time"] > app_vars["start_time"]) 767 | & (slim_bmf["end_time"] < app_vars["end_time"]) 768 | ) 769 | | ( 770 | (slim_bmf["start_time"] < app_vars["start_time"]) 771 | & (slim_bmf["end_time"] < app_vars["end_time"]) 772 | & (slim_bmf["end_time"] > app_vars["start_time"]) 773 | ) 774 | | ( 775 | (slim_bmf["start_time"] > app_vars["start_time"]) 776 | & (slim_bmf["end_time"] > app_vars["end_time"]) 777 | & (slim_bmf["start_time"] < app_vars["end_time"]) 778 | ) 779 | ] 780 | slim_bmf["start_time"] = slim_bmf["start_time"].where( 781 | slim_bmf["start_time"] > app_vars["start_time"], app_vars["start_time"] 782 | ) 783 | slim_bmf["end_time"] = slim_bmf["end_time"].where( 784 | slim_bmf["end_time"] < app_vars["end_time"], app_vars["end_time"] 785 | ) 786 | 787 | slim_bmf["height"] = lower_mapping 788 | slim_bmf["offset"] = ( 789 | np.ones(len(slim_bmf)) * 5 790 | + slim_bmf.groupby(["start_time", "end_time"]).cumcount() * 15 791 | ) 792 | # Convert slim_bmf to ColDataSrc 793 | mapping_source = ColumnDataSource(data=slim_bmf.to_dict(orient="list")) 794 | # Add labels to LabelSet 795 | mapping_labels = LabelSet( 796 | x="start_time", 797 | y="height", 798 | text="label", 799 | level="glyph", 800 | x_offset=5, 801 | y_offset="offset", 802 | source=mapping_source, 803 | render_mode="canvas", 804 | ) 805 | p.add_layout(mapping_labels) 806 | # Add some colour here 807 | # Forward Vertical lines => blue 808 | p_x, p_y = vline( 809 | np.concatenate( 810 | [ 811 | slim_bmf["start_time"] 812 | .where(slim_bmf["strand"] == "+") 813 | .dropna() 814 | .values, 815 | slim_bmf["end_time"] 816 | .where(slim_bmf["strand"] == "+") 817 | .dropna() 818 | .values, 819 | ] 820 | ), 821 | lower_mapping + 20, 822 | lower_mapping - 20, 823 | ) 824 | p.multi_line(p_x, p_y, line_dash="solid", color="blue", line_width=1) 825 | # Reverse Vertical lines => red 826 | p_x, p_y = vline( 827 | np.concatenate( 828 | [ 829 | slim_bmf["start_time"] 830 | .where(slim_bmf["strand"] == "-") 831 | .dropna() 832 | .values, 833 | slim_bmf["end_time"] 834 | .where(slim_bmf["strand"] == "-") 835 | .dropna() 836 | .values, 837 | ] 838 | ), 839 | lower_mapping + 20, 840 | lower_mapping - 20, 841 | ) 842 | p.multi_line(p_x, p_y, line_dash="solid", color="red", line_width=1) 843 | # Horizontal lines 844 | p_x, p_y = hlines( 845 | lower_mapping, 846 | slim_bmf["start_time"].where(slim_bmf["strand"] == "+").dropna(), 847 | slim_bmf["end_time"].where(slim_bmf["strand"] == "+").dropna(), 848 | ) 849 | p.multi_line(p_x, p_y, line_dash="solid", color="blue", line_width=1) 850 | # Horizontal lines 851 | p_x, p_y = hlines( 852 | lower_mapping, 853 | slim_bmf["start_time"].where(slim_bmf["strand"] == "-").dropna(), 854 | slim_bmf["end_time"].where(slim_bmf["strand"] == "-").dropna(), 855 | ) 856 | p.multi_line(p_x, p_y, line_dash="solid", color="red", line_width=1) 857 | 858 | if wdg["toggle_y_axis"].active: 859 | p.y_range = Range1d(int(wdg["po_y_min"].value), int(wdg["po_y_max"].value)) 860 | if wdg["toggle_annotations"].active: 861 | # Map modal_classifications onto df 862 | app_data["label_df"]["mc_active_map"] = app_data["label_df"][ 863 | "modal_classification" 864 | ].map(app_data["label_mp"]) 865 | app_data["label_df"]["mc_label_map"] = app_data["label_df"][ 866 | 
"modal_classification" 867 | ].map(app_data["label_dt"]) 868 | # Here labels are thinned out 869 | slim_label_df = app_data["label_df"][ 870 | (app_data["label_df"]["read_start"] >= app_vars["start_time"]) 871 | & (app_data["label_df"]["read_start"] <= app_vars["end_time"]) 872 | ] 873 | # Use pd.isin to remove unwanted annotations from the slimmed df 874 | slim_label_df = slim_label_df[ 875 | slim_label_df["mc_active_map"].isin(wdg["label_filter"].active) == True 876 | ] 877 | # get coordinates and vstack them to produce [[x, x], [x, x]...] 878 | line_x_values = np.vstack( 879 | (slim_label_df["read_start"].values, slim_label_df["read_start"].values) 880 | ).T 881 | tmp_list = np.full((1, len(line_x_values)), -10000) 882 | line_y_values = np.vstack((tmp_list, tmp_list * -1)).T 883 | # Add all vertical lines as multi_line 884 | p.multi_line( 885 | line_x_values.tolist(), 886 | line_y_values.tolist(), 887 | line_dash="dashed", 888 | color="green", 889 | line_width=1, 890 | ) 891 | # combine series to form label 892 | slim_label_df["label"] = ( 893 | slim_label_df["mc_label_map"] 894 | + " - " 895 | + slim_label_df["read_id"].astype("str") 896 | ) 897 | # Create ColumnDataSource combining labels and coordinates 898 | label_source = ColumnDataSource( 899 | data=dict( 900 | x=slim_label_df["read_start"].values, 901 | y=np.full((len(slim_label_df), 1), int(wdg["label_height"].value)), 902 | t=slim_label_df["label"].values, 903 | ) 904 | ) 905 | # Add all labels as a label set 906 | labels = LabelSet( 907 | x="x", 908 | y="y", 909 | text="t", 910 | level="glyph", 911 | x_offset=0, 912 | y_offset=0, 913 | source=label_source, 914 | render_mode="canvas", 915 | angle=-270, 916 | angle_units="deg", 917 | ) 918 | p.add_layout(labels) 919 | 920 | return column(p, css_classes=["plot_div"]) 921 | 922 | 923 | def is_input_int(attr, old, new): 924 | try: 925 | int(new) 926 | for wdg in int_inputs: 927 | if (app_data["wdg_dict"][wdg].value == new) and ( 928 | "input-error" in app_data["wdg_dict"][wdg].css_classes 929 | ): 930 | input_error(app_data["wdg_dict"][wdg], "remove") 931 | except ValueError: 932 | for wdg in int_inputs: 933 | if app_data["wdg_dict"][wdg].value == new: 934 | input_error(app_data["wdg_dict"][wdg], "add") 935 | return 936 | 937 | new = new.lstrip("0") 938 | update() 939 | 940 | 941 | def toggle_button(state): 942 | layout.children[1] = create_figure( 943 | app_data["x_data"], 944 | app_data["y_data"], 945 | app_data["wdg_dict"], 946 | app_data["app_vars"], 947 | ) 948 | 949 | 950 | def input_error(widget, mode): 951 | """""" 952 | if mode == "add": 953 | widget.css_classes.append("input-error") 954 | elif mode == "remove": 955 | if widget.css_classes: 956 | del widget.css_classes[-1] 957 | else: 958 | print("mode not recognised") 959 | 960 | 961 | def update(): 962 | update_data(app_data["bulkfile"], app_data["app_vars"]) 963 | if app_data["INIT"]: 964 | build_widgets() 965 | layout.children[0] = column( 966 | list(app_data["wdg_dict"].values()), width=int(cfg_po["wdg_width"]) 967 | ) 968 | app_data["INIT"] = False 969 | app_data["wdg_dict"]["duration"].text = "Duration: {d} seconds".format( 970 | d=app_data["app_vars"]["duration"] 971 | ) 972 | app_data["wdg_dict"]["toggle_smoothing"].active = True 973 | layout.children[1] = create_figure( 974 | app_data["x_data"], 975 | app_data["y_data"], 976 | app_data["wdg_dict"], 977 | app_data["app_vars"], 978 | ) 979 | 980 | 981 | def update_other(attr, old, new): 982 | update() 983 | 984 | 985 | def update_toggle(attr, old, new): 986 | if 
new == 0: 987 | app_data["wdg_dict"]["label_filter"].active = list( 988 | np.arange(0, len(app_data["wdg_dict"]["label_filter"].labels), 1) 989 | ) 990 | elif new == 1: 991 | app_data["wdg_dict"]["label_filter"].active = [] 992 | update() 993 | 994 | 995 | def update_checkboxes(attr, old, new): 996 | if len(new) != len(app_data["wdg_dict"]["label_filter"].labels) and len(new) != 0: 997 | app_data["wdg_dict"]["filter_toggle_group"].active = None 998 | update() 999 | 1000 | 1001 | def next_update(value): 1002 | value = int(value.item) 1003 | jump_start = app_data["label_df"][ 1004 | (app_data["label_df"]["read_start"] > app_data["app_vars"]["start_time"] + 1) 1005 | & (app_data["label_df"]["modal_classification"] == value) 1006 | ] 1007 | try: 1008 | app_data["app_vars"]["start_time"] = int( 1009 | math.floor(jump_start["read_start"].iloc[0]) 1010 | ) 1011 | except IndexError: 1012 | app_data["wdg_dict"]["duration"].text += "\n{ev} event not found".format( 1013 | ev=app_data["label_dt"][value] 1014 | ) 1015 | return 1016 | except Exception as e: 1017 | print(type(e)) 1018 | print(e) 1019 | app_data["app_vars"]["end_time"] = ( 1020 | app_data["app_vars"]["start_time"] + app_data["app_vars"]["duration"] 1021 | ) 1022 | app_data["wdg_dict"]["position"].value = "{ch}:{start}-{end}".format( 1023 | ch=app_data["app_vars"]["channel_num"], 1024 | start=app_data["app_vars"]["start_time"], 1025 | end=app_data["app_vars"]["end_time"], 1026 | ) 1027 | layout.children[1] = create_figure( 1028 | app_data["x_data"], 1029 | app_data["y_data"], 1030 | app_data["wdg_dict"], 1031 | app_data["app_vars"], 1032 | ) 1033 | 1034 | 1035 | def prev_update(value): 1036 | value = int(value.item) 1037 | jump_start = app_data["label_df"][ 1038 | (app_data["label_df"]["read_start"] < app_data["app_vars"]["start_time"]) 1039 | & (app_data["label_df"]["modal_classification"] == value) 1040 | ] 1041 | try: 1042 | app_data["app_vars"]["start_time"] = int( 1043 | math.floor(jump_start["read_start"].iloc[-1]) 1044 | ) 1045 | except IndexError: 1046 | app_data["wdg_dict"]["duration"].text += "\n{ev} event not found".format( 1047 | ev=app_data["label_dt"][value] 1048 | ) 1049 | return 1050 | except Exception as e: 1051 | print(type(e)) 1052 | print(e) 1053 | app_data["app_vars"]["end_time"] = ( 1054 | app_data["app_vars"]["start_time"] + app_data["app_vars"]["duration"] 1055 | ) 1056 | app_data["wdg_dict"]["position"].value = "{ch}:{start}-{end}".format( 1057 | ch=app_data["app_vars"]["channel_num"], 1058 | start=app_data["app_vars"]["start_time"], 1059 | end=app_data["app_vars"]["end_time"], 1060 | ) 1061 | layout.children[1] = create_figure( 1062 | app_data["x_data"], 1063 | app_data["y_data"], 1064 | app_data["wdg_dict"], 1065 | app_data["app_vars"], 1066 | ) 1067 | 1068 | 1069 | def export_data(): 1070 | try: 1071 | start_val = math.floor( 1072 | app_data["app_vars"]["start"] * app_data["app_vars"]["sf"] 1073 | ) 1074 | end_val = math.ceil(app_data["app_vars"]["end"] * app_data["app_vars"]["sf"]) 1075 | except KeyError: 1076 | start_val = app_data["app_vars"]["start_squiggle"] 1077 | end_val = app_data["app_vars"]["end_squiggle"] 1078 | if ( 1079 | export_read_file( 1080 | app_data["app_vars"]["channel_num"], 1081 | start_val, 1082 | end_val, 1083 | app_data["bulkfile"], 1084 | cfg_dr["out"], 1085 | ) 1086 | == 0 1087 | ): 1088 | app_data["wdg_dict"]["duration"].text += "\nread file created" 1089 | else: 1090 | app_data["wdg_dict"]["duration"].text += "\nError: read file not created" 1091 | 1092 | 1093 | app_data = { 1094 | 
"file_src": None, # bulkfile path (string) 1095 | "bulkfile": None, # bulkfile object 1096 | "bmf": None, # bmf dataframe 1097 | "x_data": None, # numpy ndarray time points 1098 | "y_data": None, # numpy ndarray signal data 1099 | "label_df": None, # pandas df of signal labels 1100 | "label_dt": None, # dict of signal enumeration 1101 | "label_mp": None, # dict matching labels to widget filter 1102 | "app_vars": { # dict of variables used in plots and widgets 1103 | "len_ds": None, # length of signal dataset 1104 | "start_time": None, # squiggle start time in seconds 1105 | "end_time": None, # squiggle end time in seconds 1106 | "duration": None, # squiggle duration in seconds 1107 | "start_squiggle": None, # squiggle start position (samples) 1108 | "end_squiggle": None, # squiggle end position (samples) 1109 | "channel_str": None, # 'Channel_NNN' (string) 1110 | "channel_num": None, # Channel number (int) 1111 | "sf": None, # sample frequency (int) 1112 | "attributes": None, # OrderedDict of bulkfile attr info 1113 | }, 1114 | "wdg_dict": None, # dictionary of widgets 1115 | "controls": None, # widgets added to widgetbox 1116 | "pore_plt": None, # the squiggle plot 1117 | "INIT": True, # Initial plot with bulkfile (bool) 1118 | } 1119 | 1120 | int_inputs = ["po_width", "po_height", "po_y_min", "po_y_max", "label_height"] 1121 | toggle_inputs = ["toggle_y_axis", "toggle_annotations", "toggle_smoothing"] 1122 | 1123 | app_data["app_vars"]["files"] = [] 1124 | p = Path(cfg_dr["dir"]) 1125 | app_data["app_vars"]["files"] = [ 1126 | (x.name, x.name) for x in p.iterdir() if x.suffix == ".fast5" 1127 | ] 1128 | m = Path(cfg_dr["map"]) 1129 | app_data["app_vars"]["map_files"] = [ 1130 | (x.name, x.name) for x in m.iterdir() if x.suffix == ".bmf" 1131 | ] 1132 | app_data["app_vars"]["map_files"].insert(0, ("", "--")) 1133 | # check files are useable by h5py 1134 | for index, file in enumerate(app_data["app_vars"]["files"]): 1135 | file = file[0] 1136 | try: 1137 | bulk_file = h5py.File(Path(Path(cfg_dr["dir"]) / file), "r") 1138 | except OSError: 1139 | app_data["app_vars"]["files"][index] = None 1140 | continue 1141 | try: 1142 | try_path = bulk_file["Raw"] 1143 | except KeyError: 1144 | app_data["app_vars"]["files"][index] = None 1145 | continue 1146 | for i, channel in enumerate(try_path): 1147 | if i == 0: 1148 | try: 1149 | try_path[channel]["Signal"][0] 1150 | except KeyError: 1151 | app_data["app_vars"]["files"][index] = None 1152 | break 1153 | bulk_file.flush() 1154 | bulk_file.close() 1155 | app_data["app_vars"]["files"] = list( 1156 | filter((None).__ne__, app_data["app_vars"]["files"]) 1157 | ) 1158 | app_data["app_vars"]["files"].insert(0, ("", "--")) 1159 | 1160 | app_data["wdg_dict"] = init_wdg_dict() 1161 | app_data["controls"] = column( 1162 | list(app_data["wdg_dict"].values()), width=int(cfg_po["wdg_width"]) 1163 | ) 1164 | 1165 | f = figure(toolbar_location=None) 1166 | f.line(x=[0], y=[0]) 1167 | f.outline_line_color = None 1168 | f.toolbar.logo = None 1169 | f.xaxis.visible = False 1170 | f.yaxis.visible = False 1171 | f.xgrid.visible = False 1172 | f.ygrid.visible = False 1173 | app_data["pore_plt"] = column(f, css_classes=["plot_div"]) 1174 | 1175 | layout = row(app_data["controls"], app_data["pore_plt"]) 1176 | 1177 | curdoc().add_root(layout) 1178 | curdoc().title = "bulkvis" 1179 | -------------------------------------------------------------------------------- /bulkvis/bulkvis_server/templates/index.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {{ bokeh_css }} 5 | 8 | 9 | {{ bokeh_js }} 10 | 11 | 12 | {{ plot_div|indent(8) }} 13 | {{ plot_script|indent(8) }} 14 | 15 | 16 | -------------------------------------------------------------------------------- /bulkvis/bulkvis_server/templates/styles.css: -------------------------------------------------------------------------------- 1 | @import url(https://fonts.googleapis.com/css?family=Noto+Sans); 2 | 3 | body { 4 | font-family: 'Noto Sans', sans-serif; 5 | -webkit-font-smoothing: antialiased; 6 | text-rendering: optimizeLegibility; 7 | display: block; 8 | margin: 0; 9 | } 10 | .plot_div { 11 | position: fixed !important; 12 | } 13 | .toggle_button_g_r > .bk-btn-group > .bk-active { 14 | background-color: #5cb85c; 15 | font-style: italic; 16 | } 17 | .toggle_button_o_r > .bk-btn-group > .bk-active { 18 | background-color: #ed9c28; 19 | font-style: italic; 20 | } 21 | .input-error > .bk-input-group > input { 22 | background-color: #d9534f; 23 | } 24 | label.bk > span.bk { 25 | font-family: monospace; 26 | } 27 | code { 28 | overflow-wrap: anywhere; 29 | } -------------------------------------------------------------------------------- /bulkvis/cite.py: -------------------------------------------------------------------------------- 1 | import textwrap 2 | 3 | _help = "Output the citation for this tool and exit" 4 | _cli = () 5 | 6 | 7 | def run(parser, args): 8 | cite = textwrap.fill( 9 | ( 10 | "Alexander Payne, Nadine Holmes, Vardhman Rakyan, Matthew Loose, " 11 | "BulkVis: a graphical viewer for Oxford nanopore bulk FAST5 files, " 12 | "Bioinformatics, Volume 35, Issue 13, 1 July 2019, Pages 2193–2198" 13 | ), 14 | width=70, 15 | subsequent_indent=" " * 10, 16 | ) 17 | url = "https://academic.oup.com/bioinformatics/article/35/13/2193/5193712" 18 | doi = "10.1093/bioinformatics/bty841" 19 | print("Thank you for using bulkvis!\n") 20 | print(f"Citation: {cite}") 21 | print(f"URL: {url}") 22 | print(f"DOI: {doi}") 23 | -------------------------------------------------------------------------------- /bulkvis/core.py: -------------------------------------------------------------------------------- 1 | """core.py 2 | """ 3 | from pathlib import Path 4 | import sys 5 | import numpy as np 6 | import pandas as pd 7 | import traceback 8 | 9 | 10 | def concat_files_to_df(file_list, **kwargs): 11 | """Return a pandas.DataFrame from a list of files 12 | Parameters 13 | ---------- 14 | file_list : list 15 | List of files to be concatenated 16 | kwargs 17 | Any parameter used by pandas.read_csv except 'filepath_or_buffer'. These will be applied to all 18 | files in 'file_list' 19 | Returns 20 | ------- 21 | pandas.DataFrame 22 | Raises 23 | ------ 24 | pandas.errors.ParserError 25 | Raises pandas.errors.ParserError if input file(s) do not match expected format or shape. 
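    Examples
    --------
    >>> # Hypothetical file names; every pandas.read_csv keyword except
    >>> # 'filepath_or_buffer' is applied to each file in the list
    >>> df = concat_files_to_df(["run1_summary.txt", "run2_summary.txt"], sep="\t")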
26 | """ 27 | kwargs = remove_kwargs(["filepath_or_buffer"], **kwargs) 28 | df_list = [] 29 | for f in file_list: 30 | try: 31 | df_list.append(pd.read_csv(filepath_or_buffer=f, **kwargs)) 32 | except pd.errors.ParserError as e: 33 | sys.exit( 34 | "ParserError\nUsually caused by an input file not being the expected format" 35 | ) 36 | except Exception as e: 37 | traceback.print_exc() 38 | sys.exit(1) 39 | return pd.concat(df_list, ignore_index=True) 40 | 41 | 42 | def remove_kwargs(remove_list, **kwargs): 43 | """Remove items from kwargs dict that may cause conflict with successive function calls""" 44 | # return {k: v for k, v in kwargs.items() if k not in remove_list} # This iterates the entire dict 45 | for item in remove_list: # This just iterates the remove_list 46 | _ = kwargs.pop(item, None) 47 | return kwargs 48 | 49 | 50 | def length_stats(lengths): 51 | """Return count [COUNT], minimum [MIN], maximum [MAX], mean [MEAN], and N50 [N50] of an array 52 | Parameters 53 | ---------- 54 | lengths : array_like 55 | List of integers 56 | Returns 57 | ------- 58 | dictionary 59 | """ 60 | return { 61 | "COUNT": int(len(lengths)), 62 | "MIN": int(np.min(lengths)), 63 | "MAX": int(np.max(lengths)), 64 | "MEAN": int(np.mean(lengths)), 65 | "N50": _get_n50(np.sort(lengths)), 66 | } 67 | 68 | 69 | def _get_n50(lengths): 70 | """Return N50 statistic for a list of read lengths 71 | Parameters 72 | ---------- 73 | lengths array_like 74 | List of sorted, ascending, integers 75 | Returns 76 | ------- 77 | integer 78 | """ 79 | return int(lengths[np.where(np.cumsum(lengths) >= np.sum(lengths) / 2)][0]) 80 | 81 | 82 | def readable_yield(num, suffix="B"): 83 | """Return a human readable string of yield using si/metric prefixes 84 | Parameters 85 | ---------- 86 | num : int (or float) 87 | Integer of total number of bases 88 | suffix : str 89 | String to append to si/metric prefixes ['', 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'] 90 | Returns 91 | ------- 92 | string 93 | """ 94 | for unit in ["", "k", "M", "G", "T", "P", "E", "Z"]: 95 | if abs(num) < 1000: 96 | return "{n:3.2f} {u}{s}".format(n=num, u=unit, s=suffix) 97 | num /= 1000 98 | return "{n:3.1f} {u}{s}".format(n=num, u="Y", s=suffix) 99 | 100 | 101 | def human_readable_yield(num: int, factor: int = 1000, suffix: str = "B") -> str: 102 | """Return a human readable string of a large number using SI unit prefixes 103 | Parameters 104 | ---------- 105 | num : int 106 | A number to convert to decimal form 107 | factor : int 108 | The SI factor, use 1000 for SI units and 1024 for binary multiples 109 | suffix : str 110 | The suffix to place after the SI prefix, for example use B for SI units and iB for binary multiples 111 | Returns 112 | ------- 113 | str 114 | Returns the input number formatted to two decimal places with the SI unit and suffix 115 | """ 116 | for unit in ["", "k", "M", "G", "T", "P", "E", "Z"]: 117 | if abs(num) < factor: 118 | return "{n:3.2f} {u}{s}".format(n=num, u=unit, s=suffix) 119 | num /= factor 120 | return "{n:3.2f} {u}{s}".format(n=num, u="Y", s=suffix) 121 | 122 | 123 | def top_n(df, field, n): 124 | """Print top N reads, by length, from a dataset 125 | Parameters 126 | ---------- 127 | df : pandas.DataFrame 128 | DataFrame containing a Series with length values 129 | field : str 130 | The key for the Series containing length values 131 | n : int 132 | The number of values to print 133 | Returns 134 | ------- 135 | None 136 | """ 137 | df = df.sort_values(by=field, ascending=False) 138 | rows = df.filter([field], 
axis=1).head(n=n).reset_index()
139 |     max_len = max(
140 |         [len(str(r[field])) + len(str(r[field])) // 3 for i, r in rows.iterrows()]
141 |     )
142 |     for idx, row in rows.iterrows():
143 |         print("{i}:\t{len:>{m},}".format(i=idx + 1, m=max_len, len=row[field]))
144 |     return
145 |
146 |
147 | def find_files_of_type(file_or_directory, file_extensions):
148 |     """Return a list of pathlib.Path of files with chosen extensions
149 |     Parameters
150 |     ----------
151 |     file_or_directory : str
152 |         filepath or a directory
153 |     file_extensions : list
154 |         A list of lowercase file extensions including '.' e.g. ['.txt', '.paf']
155 |     Returns
156 |     -------
157 |     list
158 |         If files with extension are found return list of pathlib.Path objects, otherwise return empty list
159 |     """
160 |     file_or_directory = Path(file_or_directory).expanduser()
161 |     if (
162 |         file_or_directory.is_file()
163 |         and "".join(file_or_directory.suffixes).lower() in file_extensions
164 |     ):
165 |         return [file_or_directory]
166 |     elif file_or_directory.is_dir():
167 |         return [
168 |             x
169 |             for x in file_or_directory.iterdir()
170 |             if "".join(x.suffixes).lower() in file_extensions
171 |         ]
172 |     else:
173 |         return []
174 |
175 |
176 | def fuse_reads(seq_sum_df, paf_df, distance=10000, alt=True):
177 |     """Find fused reads from sequencing_summary.txt and paf files
178 |     Parse sequencing_summary.txt and mapping.paf files to infer reads that may
179 |     have been incorrectly split by MinKNOW. This approach is based on read number
180 |     and mapping, therefore a _good_ mapping is required.
181 |     Parameters
182 |     ----------
183 |     seq_sum_df : pandas.DataFrame
184 |         A pandas.DataFrame from a sequencing_summary.txt file; this must contain
185 |         the columns `['channel', 'start_time', 'duration', 'run_id', 'read_id',
186 |         'sequence_length_template', 'filename']`.
187 |     paf_df : pandas.DataFrame
188 |         A pandas.DataFrame from a .paf file, these are generated by minimap2.
189 |         As this file type doesn't have headers, the following parameters are
190 |         the minimum required for using a file with this function:
191 |         `usecols=[0, 4, 5, 7, 8], names=['Qname', 'Strand', 'Tname', 'Tstart', 'Tend']`
192 |     distance : int
193 |         The distance, in bases, between the end coordinate of a read mapping and
194 |         the start coordinate of a successive read from the same channel. Defaults to 10000
195 |     alt : bool
196 |         Include alternate assemblies, default is True. If set to True (include
197 |         alternate assemblies) the 'new' dataset may have more bases than the 'original'
198 |         input dataset due to reads mapping to alternate contigs.
199 | Returns 200 | ------- 201 | fused_reads_df : pandas.DataFrame 202 | pandas.DataFrame containing fused reads 203 | un_fused_reads_df : pandas.DataFrame 204 | pandas.DataFrame containing un-fused reads 205 | to_be_fused_reads_df : pandas.DataFrame 206 | pandas.DataFrame containing reads that are fused 'parts' in the same 207 | format as un_fused_reads_df 208 | """ 209 | # TODO: raise error if required columns are not present 210 | # TODO: raise error if columns are not of correct types 211 | # Remove zero length reads and sort seq_sum_df 212 | seq_sum_df = seq_sum_df[seq_sum_df["sequence_length_template"] != 0].sort_values( 213 | by=["channel", "run_id", "start_time"] 214 | ) 215 | # Create extra Series for finding fused reads 216 | seq_sum_df["next_read_id"] = seq_sum_df["read_id"].shift(-1) 217 | seq_sum_df["next_start_time"] = seq_sum_df["start_time"].shift(-1) 218 | seq_sum_df["next_end"] = seq_sum_df["next_start_time"] + seq_sum_df[ 219 | "duration" 220 | ].shift(-1) 221 | seq_sum_df["next_sequence_length_template"] = seq_sum_df[ 222 | "sequence_length_template" 223 | ].shift(-1) 224 | seq_sum_df["combined_length"] = ( 225 | seq_sum_df["sequence_length_template"] 226 | + seq_sum_df["next_sequence_length_template"] 227 | ) 228 | seq_sum_df["next_sequence_length_template"] = ( 229 | seq_sum_df["next_sequence_length_template"].fillna(0).astype("int64") 230 | ) 231 | seq_sum_df["combined_length"] = ( 232 | seq_sum_df["combined_length"].fillna(0).astype("int64") 233 | ) 234 | 235 | # Merge seq_sum_df and paf_df on read_id/Qname; this aligns the read and the mapping information 236 | df = pd.merge(seq_sum_df, paf_df, left_on="read_id", right_on="Qname", how="outer") 237 | # Merge df with paf_df on next_read_id/Qname; this aligns each read with it's 238 | # successor giving suffix '_A' and '_B' respectively 239 | df2 = pd.merge( 240 | df, 241 | paf_df, 242 | left_on="next_read_id", 243 | right_on="Qname", 244 | how="outer", 245 | suffixes=("_A", "_B"), 246 | ) 247 | df2 = df2.dropna().reset_index() 248 | 249 | # If df2 had no rows, no merging has taken place 250 | if len(df2) == 0: 251 | return None, None, None 252 | 253 | # Condition where Qname (read_id) does NOT match 254 | not_qname = df2["Qname_A"] != df2["Qname_B"] 255 | # Condition where Strand matches 256 | yes_strand = df2["Strand_A"] == df2["Strand_B"] 257 | # Condition where Target (chromosome) matches 258 | yes_tname = df2["Tname_A"] == df2["Tname_B"] 259 | 260 | df2 = df2[not_qname & yes_strand & yes_tname] 261 | 262 | # End program if no rows 263 | if len(df2) == 0: 264 | return None, None, None 265 | 266 | df2["match_distance"] = np.where( 267 | df2["Strand_A"] == "+", # Where: Strand is '+' 268 | df2["Tstart_B"] - df2["Tend_A"], # True: read_2_start - read_1_end 269 | df2["Tstart_A"] - df2["Tend_B"], # False: read_1_start - read_2_end 270 | ) 271 | # Remove reads outside of the distance parameter 272 | df2 = df2[(df2["match_distance"] > 0) & (df2["match_distance"] < distance)] 273 | 274 | # End program if no rows 275 | if len(df2) == 0: 276 | return None, None, None 277 | 278 | df2 = df2.drop_duplicates( 279 | subset=[ 280 | "channel", 281 | "start_time", 282 | "duration", 283 | "next_start_time", 284 | "read_id", 285 | "next_read_id", 286 | "sequence_length_template", 287 | "next_sequence_length_template", 288 | "combined_length", 289 | ], 290 | keep="first", 291 | ) 292 | # separate df into read groups and set index to cs to allow grouping 293 | cond_1 = df2["next_read_id"] == df2["read_id"].shift(-1) 294 | cond_2 = 
df2["read_id"] == df2["next_read_id"].shift(-1) 295 | df2["COND"] = np.where(cond_1 | cond_2, True, False) 296 | df2["W"] = np.where(df2["COND"].shift(1) == False, 1, 0) 297 | df2["cs"] = df2["W"].cumsum() 298 | 299 | """UNDER HERE NOT REVISED OR COMMENTED WELL""" 300 | # TODO: finish commenting 301 | 302 | if alt: 303 | groupby_list = ["cs", "Tname_B"] 304 | else: 305 | groupby_list = ["cs"] 306 | df2 = df2.set_index(groupby_list) 307 | df2_groupby = df2.groupby(level=groupby_list) 308 | 309 | # group and concatenate read ids 310 | df2["all_but_last"] = df2_groupby["read_id"].apply("|".join) 311 | df2["last_read_id"] = df2_groupby["next_read_id"].last() 312 | 313 | # TODO: this is the failing point, can it be cut off sooner? 314 | 315 | df2["cat_read_id"] = df2["all_but_last"] + "|" + df2["last_read_id"] 316 | 317 | # group and combine length 318 | df2["combined_length"] = df2_groupby["sequence_length_template"].sum() 319 | df2["last_length"] = df2_groupby["next_sequence_length_template"].last() 320 | df2["combined_length"] = df2["combined_length"] + df2["last_length"] 321 | 322 | # take max/min for end/start match from grouped value list 323 | df2["start_match"] = ( 324 | df2_groupby[["Tstart_A", "Tstart_B", "Tend_A", "Tend_B"]] 325 | .transform("min") 326 | .min(axis=1) 327 | ) 328 | df2["end_match"] = ( 329 | df2_groupby[["Tstart_A", "Tstart_B", "Tend_A", "Tend_B"]] 330 | .transform("max") 331 | .max(axis=1) 332 | ) 333 | 334 | # group and add start and end times 335 | df2["start_time"] = df2_groupby["start_time"].first() 336 | df2["next_end"] = df2_groupby["next_end"].last() 337 | 338 | # add the duration (time between start and end) 339 | df2["duration"] = df2["next_end"] - df2["start_time"] 340 | 341 | # format and add coordinates 342 | df2["stime_floor"] = np.floor(df2["start_time"]).astype("int64").astype("str") 343 | df2["etime_ceil"] = np.ceil(df2["next_end"]).astype("int64").astype("str") 344 | df2["channel"] = df2["channel"].astype("int64").astype("str") 345 | df2["combined_length"] = df2["combined_length"].astype("int64") 346 | df2["start_match"] = df2["start_match"].astype("int64").astype("str") 347 | df2["end_match"] = df2["end_match"].astype("int64").astype("str") 348 | df2["duration"] = df2["duration"].map("{:.5f}".format) 349 | df2["coords"] = df2["channel"] + ":" + df2["stime_floor"] + "-" + df2["etime_ceil"] 350 | 351 | # rename cols for export 352 | df2.rename(columns={"Tname_A": "target_name", "Strand_A": "strand"}, inplace=True) 353 | 354 | # fused_read_ids is a pd.Series of all fused reads 355 | fused_read_ids = pd.concat([df2["read_id"], df2["next_read_id"]]) 356 | 357 | df2["count"] = df2_groupby.size() + 1 358 | 359 | # remove duplicate entries from df2 360 | df2 = df2.drop_duplicates( 361 | subset=[ 362 | "coords", 363 | "channel", 364 | "start_time", 365 | "duration", 366 | "combined_length", 367 | "start_match", 368 | "end_match", 369 | "cat_read_id", 370 | ], 371 | keep="first", 372 | ) 373 | fused_read_ids = fused_read_ids.unique() 374 | 375 | # un_fused_df contains reads that are correctly split 376 | un_fused_df = seq_sum_df[~seq_sum_df["read_id"].isin(fused_read_ids)].reset_index() 377 | # split_df is reads that have false starts (i.e 2->N) 378 | split_df = seq_sum_df[seq_sum_df["read_id"].isin(fused_read_ids)].reset_index() 379 | 380 | # TODO: CLEAN UP EXTRA SERIES FROM DFS 381 | return df2, un_fused_df, split_df 382 | 383 | 384 | def die(message, status=1): 385 | """Print an error message and call sys.exit with the given status, terminating the 
process""" 386 | print(message, file=sys.stderr) 387 | sys.exit(status) 388 | 389 | 390 | def print_args(args, label="Arguments"): 391 | """Print and format all arguments from the command line""" 392 | print(label + ":") 393 | dirs = dir(args) 394 | m = max([len(a) for a in dirs if a[0] != "_"]) 395 | for attr in dirs: 396 | if attr[0] != "_": 397 | print("{a:<{m}}\t{b}".format(a=attr, m=m, b=getattr(args, attr))) 398 | print("========================================") 399 | 400 | 401 | if __name__ == "__main__": 402 | sys.exit("ERROR: core is not directly executable") 403 | -------------------------------------------------------------------------------- /bulkvis/fuse.py: -------------------------------------------------------------------------------- 1 | from bulkvis.core import ( 2 | concat_files_to_df, 3 | fuse_reads, 4 | length_stats, 5 | human_readable_yield, 6 | top_n, 7 | ) 8 | from collections import OrderedDict 9 | import pandas as pd 10 | import numpy as np 11 | 12 | _help = "Find incorrectly split reads from ONT sequencing_summary.txt and minimap2 .paf files" 13 | _cli = ( 14 | ( 15 | "-d", 16 | "--distance", 17 | dict( 18 | help="Specify the maximum distance between consecutive mappings. This is the difference " 19 | "between 'Target Start' and 'Target End' in the paf file ", 20 | type=int, 21 | default=10000, 22 | metavar="", 23 | ), 24 | ), 25 | ( 26 | "-t", 27 | "--top", 28 | dict( 29 | help="Show top N reads, by length, for the original dataset, fused reads, and " 30 | "corrected dataset", 31 | # This could be written better 32 | type=int, 33 | default=10, 34 | metavar="", 35 | ), 36 | ), 37 | # The behaviour of 'alt' is confusing... it seems like a double negative 38 | ( 39 | "-a", 40 | "--alt", 41 | dict( 42 | help="""Exclude alternate assemblies""", action="store_false", default=True 43 | ), 44 | ), 45 | ( 46 | "-s", 47 | "--summary", 48 | dict( 49 | metavar="", 50 | required=True, 51 | nargs="+", 52 | help="Sequencing summary file(s) generated by albacore or guppy. Can be compressed " 53 | "using gzip, bzip2, xz, or zip", 54 | ), 55 | ), 56 | ( 57 | "-p", 58 | "--paf", 59 | dict( 60 | metavar="", 61 | required=True, 62 | nargs="+", 63 | help="paf file(s) generated by minimap2. Can be compressed using gzip, bzip2, " 64 | "xz, or zip", 65 | ), 66 | ), 67 | ( 68 | "-o", 69 | "--output", 70 | dict( 71 | help="Specify name for the output file. 
This file only contains chains of reads.", 72 | default="fused_reads.txt", 73 | metavar="output", 74 | ), 75 | ), 76 | ) 77 | 78 | 79 | def run(parser, args): 80 | """Input and output controller for bulkvis fuse""" 81 | # Open sequencing_summary_*.txt files into a single pd.DataFrame 82 | seq_sum_df = concat_files_to_df( 83 | file_list=args.summary, 84 | sep="\t", 85 | usecols=[ 86 | "channel", 87 | "start_time", 88 | "duration", 89 | "run_id", 90 | "read_id", 91 | "sequence_length_template", 92 | ], 93 | ) 94 | # Open minimap2 paf files into a single pd.DataFrame 95 | paf_df = concat_files_to_df( 96 | file_list=args.paf, 97 | sep="\t", 98 | header=None, 99 | usecols=[0, 4, 5, 7, 8], 100 | names=["Qname", "Strand", "Tname", "Tstart", "Tend"], 101 | engine="python", 102 | ) 103 | fused_df, un_fused_df, to_be_fused_df = fuse_reads( 104 | seq_sum_df, paf_df, distance=args.distance, alt=args.alt 105 | )
    # fuse_reads returns (None, None, None) when nothing survives filtering
    if fused_df is None:
        raise SystemExit("No fused reads found")
106 | # Get yield numbers 107 | original_bases = np.sum(seq_sum_df["sequence_length_template"]) 108 | new_lengths = pd.concat( 109 | [un_fused_df["sequence_length_template"], fused_df["combined_length"]] 110 | ) 111 | new_bases = np.sum(new_lengths) 112 | seq_sum_lengths = seq_sum_df[seq_sum_df["sequence_length_template"] != 0][ 113 | "sequence_length_template" 114 | ] 115 | # Initialize dictionary for holding metrics 116 | stats = OrderedDict() 117 | stats["Original reads:"] = length_stats(seq_sum_lengths) 118 | stats["Un-fused reads:"] = length_stats(un_fused_df["sequence_length_template"]) 119 | stats["To be fused reads:"] = length_stats( 120 | to_be_fused_df["sequence_length_template"] 121 | ) 122 | stats["Fused reads:"] = length_stats(fused_df["combined_length"]) 123 | stats["New reads:"] = length_stats(new_lengths) 124 | # Convert stats dict to pandas.DataFrame for easy display 125 | stats_df = pd.DataFrame(stats).T[["COUNT", "MIN", "MAX", "MEAN", "N50"]] 126 | print(stats_df) 127 | # TODO: display yield better 128 | print( 129 | "\nOriginal yield {y} ({b:,} bases)".format( 130 | y=human_readable_yield(original_bases), b=original_bases 131 | ) 132 | ) 133 | print( 134 | "Corrected yield {y} ({b:,} bases)\n".format( 135 | y=human_readable_yield(new_bases), b=new_bases 136 | ) 137 | ) 138 | top = abs(args.top) 139 | if top > 0: 140 | print("Top {n} original reads by length:".format(n=top)) 141 | top_n(seq_sum_df, "sequence_length_template", top) 142 | print("Top {n} fused reads by combined length:".format(n=top)) 143 | top_n(fused_df, "combined_length", top) 144 | print("Top {n} reads after correction:".format(n=top)) 145 | top_n(pd.DataFrame(data={"length": new_lengths}), "length", top) 146 | header = [ 147 | "coords", 148 | "run_id", 149 | "channel", 150 | "start_time", 151 | "duration", 152 | "combined_length", 153 | "target_name", 154 | "strand", 155 | "start_match", 156 | "end_match", 157 | "cat_read_id", 158 | "count", 159 | ] 160 | fused_df.to_csv(args.output, sep="\t", header=True, columns=header, index=False) 161 | print("Fused read summary file saved as {f}".format(f=args.output)) 162 | -------------------------------------------------------------------------------- /bulkvis/mappings.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from readpaf import parse_paf 4 | import gzip 5 | 6 | # from argparse import ArgumentParser 7 | from pathlib import Path 8 | 9 | 10 | def run(parser, args): 11 | # Open PAF mapping file with specified columns 12 | if args.paf.endswith(".gz"): 13 | fopen = gzip.open 14 | 
else: 15 | fopen = open 16 | with fopen(args.paf, "rt") as fh: 17 | pf = parse_paf(fh, dataframe=True) 18 | # Thin PAF file by 'Primary alignment type' and drop duplicates 19 | pf = pf[pf["tp"].eq("P")] 20 | pf = pf.sort_values( 21 | ["query_name", "target_name", "mapping_quality"], ascending=[True, True, False] 22 | ) 23 | # col_names = [ 24 | # "Qname", 25 | # "Strand", 26 | # "Tname", 27 | # "Tstart", 28 | # "Tend", 29 | # "mapping_quality", 30 | # "alignment_type", 31 | # ] 32 | # pf = pd.read_csv( 33 | # paf_path, 34 | # sep="\t", 35 | # header=None, 36 | # names=col_names, 37 | # usecols=[0, 4, 5, 7, 8, 11, 12], 38 | # ) 39 | pf = pf.drop_duplicates(["query_name"], keep="first") 40 | # Open sequencing_summary.txt file 41 | cols = ["read_id", "run_id", "channel", "start_time", "duration"] 42 | ss = pd.read_csv(args.summary, sep="\t", usecols=cols) 43 | # Merge seq_sum and paf files 44 | df = pd.merge(ss, pf, left_on="read_id", right_on="query_name", how="outer") 45 | df = df.dropna() 46 | 47 | df["end_time"] = df["start_time"] + df["duration"] 48 | df["start_mapping"] = ( 49 | df[["target_start", "target_end"]] 50 | .min(axis=1) 51 | .astype("int64") 52 | .map("{0:,d}".format) 53 | ) 54 | df["end_mapping"] = ( 55 | df[["target_start", "target_end"]] 56 | .max(axis=1) 57 | .astype("int64") 58 | .map("{0:,d}".format) 59 | ) 60 | df["label"] = ( 61 | df["target_name"].astype("str") 62 | + ": " 63 | + df["start_mapping"].astype("str") 64 | + " - " 65 | + df["end_mapping"].astype("str") 66 | ) 67 | 68 | # df = df.rename(columns={"Tname": "target_name", "Strand": "strand"}) 69 | # Export as .bmf 70 | header = [ 71 | "run_id", 72 | "read_id", 73 | "channel", 74 | "start_time", 75 | "end_time", 76 | "target_name", 77 | "strand", 78 | "start_mapping", 79 | "end_mapping", 80 | "label", 81 | ] 82 | i = 0 83 | for k, v in df.groupby("run_id"):  # group on the bare column so k is the run_id string 84 | # Join 'bmf' path, run_id, and file extension 85 | p = Path(args.bmf).joinpath(str(k) + ".bmf") 86 | v.to_csv(p, sep="\t", header=True, columns=header, index=False) 87 | i += 1 88 | 89 | print("{n} files written to {p}".format(n=i, p=args.bmf)) 90 | 91 | 
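# Note on the .bmf ("bulkvis mapping file") output written above (added
# commentary): run() emits one tab-separated file per run_id containing the
# columns in `header`, including a preformatted label such as
# 'chr7: 46,731,340 - 46,791,591' built from the mapping target and its
# comma-formatted start/end coordinates.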
92 | def full_path(file): 93 | return str(Path(file).expanduser().resolve()) 94 | 95 | 96 | _help = """Parse sequencing_summary.txt files and .paf files to format mapping info for bulkvis""" 97 | _cli = ( 98 | ( 99 | "-s", 100 | "--summary", 101 | dict( 102 | help="A sequencing summary file generated by albacore or guppy", 103 | type=full_path, 104 | default="", 105 | required=True, 106 | metavar="", 107 | ), 108 | ), 109 | ( 110 | "-p", 111 | "--paf", 112 | dict( 113 | help="A paf file generated by minimap2", 114 | type=full_path, 115 | default="", 116 | required=True, 117 | metavar="", 118 | ), 119 | ), 120 | ( 121 | "--bmf", 122 | dict( 123 | help="Specify the output folder, where files will be written as " 124 | ".bmf. This should be the same folder as the bulk FAST5 " 125 | "file for this experiment.", 126 | type=full_path, 127 | metavar="", 128 | required=True, 129 | ), 130 | ), 131 | ) 132 | -------------------------------------------------------------------------------- /bulkvis/merge.py: -------------------------------------------------------------------------------- 1 | """merge.py 2 | """ 3 | from bulkvis.core import die, fuse_reads, concat_files_to_df, find_files_of_type 4 | import pandas as pd 5 | from pathlib import Path 6 | from tqdm import tqdm 7 | 8 | 9 | _help = """Merge FASTQ files based on a fused_reads.txt or ONT 10 | sequencing_summary.txt and minimap2 .paf files""" 11 | _cli = ( 12 | ( 13 | "-d", 14 | "--distance", 15 | dict( 16 | help="Specify the maximum distance between consecutive mappings, only used with " 17 | "--summary and --paf options", 18 | type=int, 19 | default=10000, 20 | ), 21 | ), 22 | ( 23 | # The behaviour of 'alt' is confusing... it seems like a double negative 24 | "-a", 25 | "--alt", 26 | dict( 27 | help="""Exclude alternate assemblies""", action="store_false", default=True 28 | ), 29 | ), 30 | ( 31 | "-s", 32 | "--summary", 33 | dict( 34 | help="Sequencing summary file(s) generated by albacore or guppy", nargs="+", 35 | ), 36 | ), 37 | ( 38 | "-p", 39 | "--paf", 40 | dict(help="paf file(s) generated by minimap2", metavar="", nargs="+"), 41 | ), 42 | ( 43 | "--fused-reads", 44 | dict(help="fused_reads.txt file generated by `bulkvis fuse`", metavar=""), 45 | ), 46 | ( 47 | "-i", 48 | "--input", 49 | dict( 50 | help="FASTQ files or directories of input. If a directory is given, files with extension " 51 | "'.fastq' or '.fq' will be used", 52 | nargs="+", 53 | ), 54 | ), 55 | ( 56 | "-o", 57 | "--output-dir", 58 | dict( 59 | help="Reads will be grouped as fused or un-fused. Fused reads will be saved " 60 | "in this directory. 
If not set, uses the current working directory", 61 | ), 62 | ), 63 | ( 64 | "--format", 65 | dict( 66 | help="Output format for the reads", 67 | default="fastq", 68 | choices=["fastq", "fasta"], 69 | ), 70 | ), 71 | ( 72 | "--all-reads", 73 | dict( 74 | help="Write un-fused reads to 'un_fused_reads.fastq' in the output directory", 75 | action="store_true", 76 | ), 77 | ), 78 | ) 79 | 80 | 81 | # TODO: Simplify, remove summary/paf -> require fused_reads.txt 82 | def run(parser, args): 83 | """Find fused reads and merge fasta/q""" 84 | 85 | """ 86 | Minimum required files for this script to operate: 87 | - sequencing_summary.txt AND mapping.paf 88 | OR 89 | - fused_reads.txt 90 | 91 | If both sets are provided, or neither, a parser error is raised 92 | 93 | This code block provides fused_read_ids and fused_read_tuples 94 | """ 95 | if args.fused_reads and not (args.summary or args.paf): 96 | # Open fused_reads.txt file 97 | fused_df = pd.read_csv(args.fused_reads, sep='\t', usecols=['run_id', 'cat_read_id', 'count']) 98 | # Set fused_read_tuples and flatten into fused_read_ids; a set gives fast membership tests 99 | fused_read_tuples = fused_df['cat_read_id'].str.split('|').tolist() 100 | fused_read_ids = {item for sublist in fused_read_tuples for item in sublist} 101 | elif args.summary and args.paf and not args.fused_reads: 102 | # Open sequencing_summary file and paf file, and run bulkvis.fuse_reads 103 | seq_sum_df = concat_files_to_df(file_list=args.summary, 104 | sep='\t', 105 | usecols=['channel', 'start_time', 'duration', 106 | 'run_id', 'read_id', 'sequence_length_template', 107 | 'filename'] 108 | ) 109 | # Open minimap2 paf files into a single pd.DataFrame 110 | paf_df = concat_files_to_df(file_list=args.paf, 111 | sep='\t', 112 | header=None, 113 | usecols=[0, 4, 5, 7, 8], 114 | names=['Qname', 'Strand', 'Tname', 'Tstart', 'Tend'] 115 | ) 116 | fused_df, un_fused_df, to_be_fused_df = fuse_reads(seq_sum_df, paf_df, distance=args.distance, alt=False)
        # fuse_reads returns (None, None, None) when no fused reads are found
        if fused_df is None:
            die('No fused reads found', status=0)
117 | fused_read_tuples = fused_df['cat_read_id'].str.split('|').tolist() 118 | fused_read_ids = set(to_be_fused_df['read_id']) 119 | else: 120 | # Raise a parser error 121 | parser.error('Either a fused_reads.txt from bulkvis fuse, OR sequencing_summary.txt ' 122 | 'and .paf files, must be provided.') 123 | 124 | # Empty list for fastq file paths 125 | fastq_files = [] 126 | # TODO: Maybe consider gzip support 127 | # These should be lowercase and include the '.' 128 | endings = ['.fastq', '.fq'] 129 | 130 | for file_or_directory in args.input: 131 | fastq_files.extend(find_files_of_type(file_or_directory, endings)) 132 | # Remove None entries from fastq_files 133 | fastq_files = [f for f in fastq_files if f is not None] 134 | 135 | # End if no fastq files are found 136 | if not fastq_files: 137 | die('No FASTQ files found', status=0) 138 | 139 | print('{} fastq files found'.format(len(fastq_files))) 140 | # Create a read dictionary to hold all the fused reads that are found 141 | reads = {} 142 | # Run a loop over fastq_files, opening each file and adding only fused reads to the dictionary. 
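# Record layout note (added commentary): a FASTQ record is four lines — header,
# sequence, '+', qualities. The outer `for` consumes the header line and the
# `next(fastq)` calls consume the remaining three, so the loop always lands on
# the next record's header; only reads whose id is in fused_read_ids are stored.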
143 | for fastq_file in tqdm(fastq_files, desc='FASTQ processed'): 144 | # with open(fastq_file, 'r') as fastq: 145 | with fastq_file.open('r') as fastq: 146 | for line in fastq: 147 | header = line.strip()[1:] 148 | read_id = header.split()[0] 149 | if read_id in fused_read_ids: 150 | sequence = next(fastq).strip() 151 | next(fastq) 152 | qualities = next(fastq).strip() 153 | reads[read_id] = { 154 | 'header': header.split(), 155 | 'sequence': sequence, 156 | 'qualities': qualities 157 | } 158 | else: 159 | next(fastq) 160 | next(fastq) 161 | next(fastq) 162 | 163 | if args.output_dir is not None: 164 | Path(args.output_dir).mkdir(parents=True, exist_ok=True) 165 | p = Path(args.output_dir) 166 | else: 167 | print('No output directory specified, using current working directory') 168 | p = Path('.') 169 | # Set format and create folders for output 170 | if args.format == 'fastq': 171 | fused_read_file = p / 'fused_reads.fastq' 172 | else: 173 | fused_read_file = p / 'fused_reads.fasta' 174 | 175 | # Split out fused reads into new file 176 | write_counter = 0 177 | miss_counter = 0 178 | # with open(fused_read_file, 'w') as output_fused: 179 | with fused_read_file.open('w') as output_fused: 180 | for pair in tqdm(fused_read_tuples, desc='Fused reads written'): 181 | if _read_id_not_in_dict(pair, reads): 182 | miss_counter += 1 183 | continue 184 | if args.format == 'fastq': 185 | read_str = '@{read_id} {run_id} {number}\n{seq}\n+\n{qual}\n'.format( 186 | read_id='|'.join(pair), 187 | run_id=reads[pair[0]]['header'][1], 188 | number=reads[pair[0]]['header'][2], 189 | seq=''.join([reads[s]['sequence'] for s in pair]), 190 | qual=''.join([reads[s]['qualities'] for s in pair]) 191 | ) 192 | else: # fasta 193 | read_str = '>{read_id} {run_id} {number}\n{seq}\n'.format( 194 | read_id='|'.join(pair), 195 | run_id=reads[pair[0]]['header'][1], 196 | number=reads[pair[0]]['header'][2], 197 | seq=''.join([reads[s]['sequence'] for s in pair]) 198 | ) 199 | output_fused.write(read_str) 200 | write_counter += 1 201 | 202 | print('{} fused reads written'.format(write_counter)) 203 | if miss_counter > 0: 204 | print('{} fused reads missed. 
These reads are most likely in the "fail" folder.'.format(miss_counter)) 205 | 206 | # Exit here if --all-reads was not set 207 | if not args.all_reads: 208 | die('', status=0) 209 | 210 | print('Writing un-fused reads') 211 | 212 | # Set the un-fused output file path based on the requested format 213 | if args.format == 'fastq': 214 | un_fused_read_file = p / 'un_fused_reads.fastq' 215 | else: 216 | un_fused_read_file = p / 'un_fused_reads.fasta' 217 | 218 | # Set new write counter 219 | write_counter = 0 220 | # with open(un_fused_read_file, 'w') as output_un_fused: 221 | with un_fused_read_file.open('w') as output_un_fused: 222 | for file in tqdm(fastq_files, desc='FASTQ processed'): 223 | # with open(file, 'r') as fastq: 224 | with file.open('r') as fastq: 225 | for line in fastq: 226 | header = line.strip()[1:] 227 | read_id = header.split()[0] 228 | if read_id not in fused_read_ids: 229 | sequence = next(fastq).strip() 230 | next(fastq) 231 | qualities = next(fastq).strip() 232 | # Write the read out 233 | if args.format == 'fastq': 234 | read_str = '@{header}\n{seq}\n+\n{qual}\n'.format( 235 | header=header, 236 | seq=sequence, 237 | qual=qualities 238 | ) 239 | else: # fasta 240 | read_str = '>{header}\n{seq}\n'.format( 241 | header=header, 242 | seq=sequence 243 | ) 244 | output_un_fused.write(read_str) 245 | write_counter += 1 246 | else: 247 | next(fastq) 248 | next(fastq) 249 | next(fastq) 250 | print('{} un-fused reads written'.format(write_counter)) 251 | # TODO: Read and write FASTA, FASTQ, and gzip 252 | # TODO: Improve arguments, add required inputs 253 | 254 | 255 | def _read_id_not_in_dict(read_ids, read_dict): 256 | """Return True if any read_id in the list is missing from read_dict, otherwise False""" 257 | for read_id in read_ids: 258 | if read_id not in read_dict: 259 | return True 260 | return False 261 | -------------------------------------------------------------------------------- /bulkvis/serve.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import shutil 3 | import subprocess 4 | import sys 5 | 6 | from bokeh.command.subcommands.serve import Serve 7 | 8 | 9 | _help = "Serve the bulk FAST5 file viewer web app" 10 | # Patch the incoming bokeh serve arguments 11 | # Remove `files` and `--args` as these are 12 | # used in the internal call to bokeh serve 13 | # Prepend `dir`, which is the bulk file dir 14 | _cli = [ 15 | ( 16 | "dir", 17 | dict( 18 | help="bulk FAST5 directory (default: working directory)", 19 | default=None, 20 | metavar="BULK_DIRECTORY", 21 | ), 22 | ), 23 | ] + [arg for arg in Serve.args if arg[0] not in {"files", "--args"}] 24 | 25 | 26 | def run(parser, args): 27 | bokeh = shutil.which("bokeh") 28 | if not bokeh: 29 | sys.exit("Unable to find bokeh. Is it installed?") 30 | 31 | server = str(Path(__file__).parent / "bulkvis_server") 32 | 33 | flags = sys.argv[3:] 34 | 35 | command = [bokeh, "serve", server] + flags + ["--args", args.dir] 36 | 37 | try: 38 | subprocess.run(command) 39 | except KeyboardInterrupt: 40 | pass 41 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = bulkvis 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | .figure img { 2 | border: 1px solid black; 3 | } 4 | .custom-warn { 5 | color: #D63301; 6 | background-color: #FFCCBA; 7 | border: 1px solid; 8 | margin: 10px 0px; 9 | padding: 15px 40px; 10 | font-size: 30px; 11 | } -------------------------------------------------------------------------------- /docs/_static/icons/save.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/icons/save.png -------------------------------------------------------------------------------- /docs/_static/icons/xpan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/icons/xpan.png -------------------------------------------------------------------------------- /docs/_static/icons/zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/icons/zoom.png -------------------------------------------------------------------------------- /docs/_static/images/bulk_file/01_pop_up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/bulk_file/01_pop_up.png -------------------------------------------------------------------------------- /docs/_static/images/bulk_file/02_read_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/bulk_file/02_read_config.png -------------------------------------------------------------------------------- /docs/_static/images/bulk_file/03_bulk_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/bulk_file/03_bulk_config.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/01_initial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/01_initial.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/02_position.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/02_position.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/03_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/03_plot.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/04_sidebar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/04_sidebar.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/05_annotations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/05_annotations.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/06_adjustments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/06_adjustments.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/07_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/07_plot.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/08_read_file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/08_read_file.png -------------------------------------------------------------------------------- /docs/_static/images/utilities/01_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/utilities/01_plot.png -------------------------------------------------------------------------------- /docs/collecting_a_bulk_file.rst: -------------------------------------------------------------------------------- 1 | ######################## 2 | Editing Protocol Scripts 3 | ######################## 4 | 5 | .. container:: custom-warn 6 | 7 | This page is a copy of the `Nanopore Community Knowledge Page `_. 8 | Please use the latest instructions available on the Community pages. 9 | 10 | Introduction 11 | ============ 12 | 13 | From MinKNOW version 0.51.3 onwards, the protocol scripts are structured in a way that makes it 14 | easier for users to configure certain parameters without needing extensive programming knowledge. 
15 | There are three settings that can be configured for the 48 h runs from the scripts: 16 | 17 | run_time_changes allows the user to change: 18 | 19 | - the total run time in hours 20 | - the time in seconds between MUX changes 21 | - the starting voltage of the run 22 | - the time in seconds between global potential reversals 23 | 24 | If this configuration is not enabled, the run starts with the current default settings. 25 | 26 | read_file_configuration allows the user to: 27 | 28 | - turn raw data for reads on or off 29 | - turn event data for reads on or off 30 | 31 | bulk_file_configuration allows the user to: 32 | 33 | - turn raw data on in the bulk file 34 | - turn event data on in the bulk file 35 | 36 | Note on editing MinKNOW scripts 37 | ------------------------------- 38 | 39 | Prerequisites 40 | 41 | Changing the settings of the scripts affects the data collected during the run, so it is advised 42 | that such configuration is attempted by advanced users only. 43 | 44 | A note of caution 45 | 46 | The scripts that control the device, particularly the sequencing run scripts, are an area of constant 47 | development. Scripts are subject to rapid change and can be added, removed and overwritten by the 48 | automatic software updates from Oxford Nanopore. Consequently, it is highly recommended that copies 49 | of altered scripts are saved in a second location, with change notes, so that if necessary they can 50 | be restored quickly. 51 | 52 | Open the script of interest 53 | =========================== 54 | 55 | Open a text editor (e.g. Notepad++) with Administrator privileges. 56 | In the text editor, click Open and navigate to: 57 | 58 | ``C:\Program Files\OxfordNanopore\MinKNOW\ont-python\Lib\site-packages\bream\core\nc\cli\NC_Sequencing.py`` in Windows 59 | 60 | ``Applications/MinKNOW.app/Contents/Resources/ont-python/lib/python2.7/site-packages/bream/core/nc/cli/NC_Sequencing.py`` in Mac OS X 61 | 62 | ``/opt/ONT/MinKNOW/ont-python/lib/python2.7/site-packages/bream/core/nc/cli`` in Linux 63 | 64 | Navigate to 65 | =========== 66 | 67 | Navigate to line 349: 68 | ``popup_boxes=args.popup_boxes`` 69 | 70 | Replace 71 | ======= 72 | 73 | Replace this line with any combination of the three options below:: 74 | 75 | 'run_time_changes' 76 | 'read_file_configuration' 77 | 'bulk_file_configuration' 78 | 79 | For example, to enable all three, type:: 80 | 81 | popup_boxes=['run_time_changes', 'read_file_configuration', 'bulk_file_configuration'], 82 | 83 | **Note: the comma at the end of the line is essential for the script to function properly.** 84 | 85 | Save the script 86 | =============== 87 | 88 | Restart the MinKNOW service 89 | =========================== 90 | 91 | Open a command prompt window as administrator and navigate to the MinKNOW folder:: 92 | 93 | cd "Program Files\OxfordNanopore\MinKNOW" 94 | bin\mk_manager_client.exe --exit 95 | bin\mk_manager_svc.exe 96 | 97 | For Mac OS X users, open a terminal window:: 98 | 99 | cd /Applications/MinKNOW.app/Contents/Resources 100 | sudo bin/mk_manager_svc 101 | 102 | For Ubuntu users, open a terminal window:: 103 | 104 | cd /opt/ONT/MinKNOW 105 | sudo bin/mk_manager_svc 106 | 107 | Pop-up box 108 | ========== 109 | 110 | After the relevant lines in the script have been activated, a pop-up box will appear when a 48 h 111 | protocol is selected in the MinKNOW web GUI. Enter or check the appropriate information and click Update. 112 | 113 | .. 
figure:: _static/images/bulk_file/01_pop_up.png 114 | :class: figure 115 | :alt: run_time_changes 116 | 117 | run_time_changes 118 | 119 | .. figure:: _static/images/bulk_file/02_read_config.png 120 | :class: figure 121 | :alt: read_file_configuration 122 | 123 | read_file_configuration 124 | 125 | .. figure:: _static/images/bulk_file/03_bulk_config.png 126 | :class: figure 127 | :alt: bulk_file_configuration 128 | 129 | bulk_file_configuration 130 | 131 | 132 | **Bulk data acquisition is turned off by default; to enable it, check the 133 | debug_data box in addition to either the event or raw data.** -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/stable/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'bulkvis' 23 | copyright = '2018, Alex Payne' 24 | author = 'Alex Payne' 25 | 26 | # The short X.Y version 27 | version = '' 28 | # The full version, including alpha/beta/rc tags 29 | release = '' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The suffix(es) of source filenames. 48 | # You can specify multiple suffix as a list of string: 49 | # 50 | # source_suffix = ['.rst', '.md'] 51 | source_suffix = '.rst' 52 | 53 | # The master toctree document. 54 | master_doc = 'index' 55 | 56 | # The language for content autogenerated by Sphinx. Refer to documentation 57 | # for a list of supported languages. 58 | # 59 | # This is also used if you do content translation via gettext catalogs. 60 | # Usually you set "language" from the command line for these cases. 61 | language = None 62 | 63 | # List of patterns, relative to source directory, that match files and 64 | # directories to ignore when looking for source files. 65 | # This pattern also affects html_static_path and html_extra_path . 66 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 67 | 68 | # The name of the Pygments (syntax highlighting) style to use. 69 | pygments_style = 'sphinx' 70 | 71 | 72 | # -- Options for HTML output ------------------------------------------------- 73 | 74 | # The theme to use for HTML and HTML Help pages. See the documentation for 75 | # a list of builtin themes. 
76 | # 77 | html_theme = 'alabaster' 78 | 79 | # Theme options are theme-specific and customize the look and feel of a theme 80 | # further. For a list of options available for each theme, see the 81 | # documentation. 82 | # 83 | # html_theme_options = {} 84 | 85 | # Add any paths that contain custom static files (such as style sheets) here, 86 | # relative to this directory. They are copied after the builtin static files, 87 | # so a file named "default.css" will overwrite the builtin "default.css". 88 | html_static_path = ['_static'] 89 | 90 | # Custom sidebar templates, must be a dictionary that maps document names 91 | # to template names. 92 | # 93 | # The default sidebars (for documents that don't match any pattern) are 94 | # defined by theme itself. Builtin themes are using these templates by 95 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 96 | # 'searchbox.html']``. 97 | # 98 | # html_sidebars = { '***': ['globaltoc.html', 'relations.html', 'sourcelink.html', 'searchbox.html'], } 99 | 100 | 101 | # -- Options for HTMLHelp output --------------------------------------------- 102 | 103 | # Output file base name for HTML help builder. 104 | htmlhelp_basename = 'bulkvisdoc' 105 | 106 | 107 | # -- Options for LaTeX output ------------------------------------------------ 108 | 109 | latex_elements = { 110 | # The paper size ('letterpaper' or 'a4paper'). 111 | # 112 | # 'papersize': 'letterpaper', 113 | 114 | # The font size ('10pt', '11pt' or '12pt'). 115 | # 116 | # 'pointsize': '10pt', 117 | 118 | # Additional stuff for the LaTeX preamble. 119 | # 120 | # 'preamble': '', 121 | 122 | # Latex figure (float) alignment 123 | # 124 | # 'figure_align': 'htbp', 125 | } 126 | 127 | # Grouping the document tree into LaTeX files. List of tuples 128 | # (source start file, target name, title, 129 | # author, documentclass [howto, manual, or own class]). 130 | latex_documents = [ 131 | (master_doc, 'bulkvis.tex', 'bulkvis Documentation', 132 | 'Alex Payne', 'manual'), 133 | ] 134 | 135 | 136 | # -- Options for manual page output ------------------------------------------ 137 | 138 | # One entry per manual page. List of tuples 139 | # (source start file, name, description, authors, manual section). 140 | man_pages = [ 141 | (master_doc, 'bulkvis', 'bulkvis Documentation', 142 | [author], 1) 143 | ] 144 | 145 | 146 | # -- Options for Texinfo output ---------------------------------------------- 147 | 148 | # Grouping the document tree into Texinfo files. List of tuples 149 | # (source start file, target name, title, author, 150 | # dir menu entry, description, category) 151 | texinfo_documents = [ 152 | (master_doc, 'bulkvis', 'bulkvis Documentation', 153 | author, 'bulkvis', 'One line description of project.', 154 | 'Miscellaneous'), 155 | ] 156 | 157 | def setup(app): 158 | app.add_stylesheet('css/custom.css') -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. bulkvis documentation master file, created by 2 | sphinx-quickstart on Wed Apr 4 09:36:43 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ####### 7 | bulkvis 8 | ####### 9 | 10 | Introduction 11 | ============ 12 | 13 | bulkvis is an interactive bulk-fast5-file explorer built using python 3 and bokeh. 
It enables the visualisation of raw 'squiggle' data from Oxford Nanopore Technologies sequencers. 15 | 16 | .. toctree:: 17 | :maxdepth: 2 18 | :titlesonly: 19 | :caption: Contents: 20 | 21 | installation 22 | quickstart 23 | utilities 24 | collecting_a_bulk_file -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ############ 2 | Installation 3 | ############ 4 | 5 | We recommend running bulkvis from within a Python virtual environment so that there are no conflicts in dependencies. 6 | 7 | Installing pip 8 | ============== 9 | 10 | pip is most likely already installed; to find out, run:: 11 | 12 | pip --version 13 | 14 | If pip is not installed, use the official 15 | `get-pip.py `_ script. 16 | 17 | Create and activate a virtual environment 18 | ========================================= 19 | 20 | For Linux and macOS:: 21 | 22 | python3 -m venv bulkvis-env 23 | source bulkvis-env/bin/activate 24 | 25 | For Windows:: 26 | 27 | python3 -m venv bulkvis-env 28 | bulkvis-env\Scripts\activate 29 | 30 | If the virtual environment is successfully activated, the prefix ``(bulkvis-env)`` will be present. 31 | 32 | Running ``deactivate`` will exit the virtual environment. 33 | 34 | Clone bulkvis 35 | ============= 36 | 37 | bulkvis can be retrieved by cloning the git repository:: 38 | 39 | git clone https://github.com/LooseLab/bulkvis.git 40 | 41 | or by navigating to `bulkvis `_ and downloading a zip of the repository, 42 | which will then need to be unzipped. 43 | 44 | Installing dependencies 45 | ======================= 46 | 47 | Once the repository is cloned or downloaded, bulkvis' dependencies will need to be installed. This **must** be run from 48 | within the virtual environment to prevent conflicts. Run:: 49 | 50 | pip install -r bulkvis/requirements.txt 51 | 52 | This will fetch and install all the required packages. 53 | 54 | Creating config.ini 55 | =================== 56 | 57 | bulkvis uses a configuration file, config.ini, to provide global variables that are required for operation. A config 58 | file can either be generated by running ``utils/set_config.py`` (requires a bulkfile) or by copying and editing the 59 | example configuration from ``config.md``. 60 | 61 | Using ``set_config.py``:: 62 | 63 | cd bulkvis 64 | python utils/set_config.py -b <> -i /path/to/bulkfile/directory -e /path/to/readfile/directory -m /path/to/mapfile/directory -c config.ini 65 | 66 | Using ``config.md``:: 67 | 68 | cd bulkvis 69 | touch config.ini 70 | nano config.ini 71 | 72 | Then navigate to `config.md `_, copy and paste the example 73 | configuration settings into nano in the terminal, and finally change the directories (``dir``, ``out`` and ``map``) to point 74 | towards your bulk-fast5-files and a read directory. 75 | 76 | Starting bulkvis 77 | ================ 78 | 79 | To start bulkvis:: 80 | 81 | bokeh serve --show bulkvis 82 | 83 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=bulkvis 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 
19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | ########## 2 | Quickstart 3 | ########## 4 | 5 | This page provides a quick overview of the bulkvis features and how to use them. 6 | 7 | Start bulkvis 8 | ============= 9 | From the directory containing bulkvis, run:: 10 | 11 | bokeh serve --show bulkvis 12 | 13 | This will start the bulkvis app and open the page in your default web browser. The page should look like this: 14 | 15 | .. figure:: _static/images/quickstart/01_initial.png 16 | :class: figure 17 | :alt: Screenshot of bulkvis showing a blank screen with a drop-down box in the top left corner 18 | 19 | Screenshot of bulkvis on initial load 20 | 21 | Selecting a bulk-fast5-file 22 | =========================== 23 | Use the drop-down to select a bulk-fast5-file from your supplied directory. Once the file is loaded, the position can be 24 | entered. 25 | 26 | .. figure:: _static/images/quickstart/02_position.png 27 | :class: figure 28 | :alt: Screenshot of bulkvis showing, in the top left corner, a drop-down box with a file selected and a text box labeled position 29 | 30 | Screenshot of bulkvis waiting for position information 31 | 32 | Selecting a position 33 | ==================== 34 | The position can be selected by either supplying coordinates or a fastq read header. The contents of this input are submitted 35 | by clicking away from the text box or by pressing return/enter. If bulkvis cannot parse the input, the text box will turn 36 | red until valid input is detected. 37 | 38 | After a position is entered, bulkvis will fully load and the chart will be visible. 39 | 40 | Using coordinates 41 | ----------------- 42 | Coordinates refer to the channel, start time and end time. This is given in the format ``channel:start-end``. For 43 | example, to navigate to channel 42 and see the squiggle from 30 seconds to 90 seconds:: 44 | 45 | 42:30-90 46 | 47 | Using a fastq read header 48 | ------------------------- 49 | Alternatively, the position can be given as a fastq read header from the run associated with this bulk-fast5-file. 50 | This can be copied and pasted into the text box, e.g.:: 51 | 52 | @b45a4b09-6f22-40f6-afd9-aa7fca8e89f3 runid=f9291b45b0c66faa77755e51738d193fcfafffc7 read=234 ch=391 start_time=2018-01-18T21:59:40Z 53 | 54 | 55 | After entering valid input the chart and other elements will load: 56 | 57 | .. 
figure:: _static/images/quickstart/03_plot.png 58 | :class: figure 59 | :alt: Screenshot of bulkvis showing a 'squiggle' plot of raw nanopore signal and a left-hand sidebar containing information about the plot 60 | 61 | Screenshot of bulkvis fully loaded with both plot and sidebar 62 | 63 | Navigating 64 | ========== 65 | The bulk-fast5-file can be navigated by jumping to the next or previous event and by using the xpan (|xpan_icon|) to drag 66 | the plot along the x-axis or zoom (|zoom_icon|) to take a closer look at a section of the plot. 67 | 68 | The jump-to action is available for any event type that is listed as ``True`` in config.ini and is available even when the 69 | event is not currently being displayed. 70 | 71 | 72 | Bulkfile information 73 | ==================== 74 | The bulkfile information panel shows information that is present in the bulk-fast5-file but not necessarily 75 | displayed in MinKNOW. 76 | 77 | .. figure:: _static/images/quickstart/04_sidebar.png 78 | :class: figure 79 | :alt: Screenshot of the sidebar from bulkvis, showing the file selections drop-down, position input, jump-to buttons, export button, information panel, and two hidden sections ('Select annotations' and 'Plot adjustments') 80 | 81 | Screenshot of bulkvis sidebar 82 | 83 | Annotations 84 | =========== 85 | Annotations are added to the plot based on state data and intermediate data from the bulkfile. These represent the labels 86 | computed by MinKNOW at specific time points in the experiment. 87 | 88 | Selecting a checkbox will allow that specific label to be rendered on the plot. The 'Display annotations' or 'Display mappings' 89 | button will toggle these annotations on/off. The 'Select all' button will turn all annotations on, while the 'Select none' 90 | button will de-select all the annotations. 91 | 92 | .. figure:: _static/images/quickstart/05_annotations.png 93 | :class: figure 94 | :alt: Screenshot of 'Select annotations' section of sidebar showing buttons labeled 'Display annotations', 'Display mappings', 'Select all', 'Select none' and a list of checkboxes 95 | 96 | Screenshot of the annotation selection panel 97 | 98 | Plot adjustments 99 | ================ 100 | The plot adjustments are infrequently used options for tweaking the appearance of the plot without having to modify the configuration file. 101 | Here the width and height of the plot can be set to match the current screen, the height that annotations are rendered at can be adjusted, and the Y-axis can 102 | be fixed to a given range. 103 | 104 | Plot smoothing is on by default; as raw signal data can quickly become massive, this reduces the number of points plotted while maintaining the shape of the data. 105 | Smoothing will automatically turn on whenever the position is changed. 106 | 107 | .. figure:: _static/images/quickstart/06_adjustments.png 108 | :class: figure 109 | :alt: Screenshot of 'Plot adjustments' section of the sidebar showing inputs for width, height, annotation height, y max, and y min as well as buttons for 'Fixed Y-axis' and 'Smoothing' 110 | 111 | Screenshot of the plot adjustments panel 112 | 113 | Exporting images 114 | ================ 115 | bulkvis can export images of plots, as below. This is done using the save function (|save_icon|), which will either download 116 | the current plot view as ``bokeh_plot.png`` or, in Safari, open the image in a new tab where it can be saved by right-clicking and selecting save. 117 | 118 | .. 
figure:: _static/images/quickstart/07_plot.png 119 | :class: figure 120 | :alt: Example plot of raw signal data from an Oxford Nanopore bulk-fast5-file 121 | 122 | Example plot from bulkvis 123 | 124 | Exporting read files 125 | ==================== 126 | bulkvis can export arbitrary read files from bulk-fast5-files. The data range is determined by the current position as set in the text input. 127 | When a read file is generated, it is written to the folder set in the configuration file. 128 | 129 | .. figure:: _static/images/quickstart/08_read_file.png 130 | :class: figure 131 | :alt: Screenshot of bulkvis sidebar showing position, duration, 'read file created', jump buttons and export button 132 | 133 | Screenshot of export button and success message (below 'duration') 134 | 135 | .. |zoom_icon| image:: /_static/icons/zoom.png 136 | :height: 11pt 137 | .. |xpan_icon| image:: /_static/icons/xpan.png 138 | :height: 11pt 139 | .. |save_icon| image:: /_static/icons/save.png 140 | :height: 11pt 141 | -------------------------------------------------------------------------------- /docs/utilities.rst: -------------------------------------------------------------------------------- 1 | Utilities 2 | ========= 3 | 4 | An overview of the utility scripts provided to conduct analysis on fused reads. 5 | 6 | whale_watch.py 7 | -------------- 8 | .. code-block:: bash 9 | 10 | Parse sequencing_summary.txt files and .paf files to find split reads in an 11 | Oxford Nanopore Dataset 12 | 13 | General options: 14 | -h, --help Show this help and exit 15 | -d , --distance Specify the maximum distance between consecutive 16 | mappings. This is the difference between 'Target Start' 17 | and 'Target End' in the paf file. Defaults to 10000 18 | -t , --top Specify how many top processed reads to display. Default 19 | is 10 20 | -D, --debug Write debug file 21 | 22 | Input sources: 23 | -s , --summary A sequencing summary file generated by albacore 24 | -p , --paf A paf file generated by minimap2 25 | 26 | Output files: 27 | -F , --out-fused Specify name of the fused_read file. This file only 28 | contains chains of reads. Defaults to 'fused_reads.txt' 29 | 30 | 31 | Output format 32 | ^^^^^^^^^^^^^ 33 | .. csv-table:: 34 | :header: "Field", "Description", "Example" 35 | 36 | "coords", "bulkvis position coordinates", "231:30782-32296" 37 | "run_id", "The run that these reads came from", "8093748fc82dc4c5cc441125d76432dd658c27c8" 38 | "channel", "Channel that sequenced these reads", "231" 39 | "start_time", "Time, in seconds, that the (first) incorrectly split read started sequencing", "30782.8425" 40 | "duration", "Time, in seconds, it took for the incorrectly split read to pass through the channel", "1512.46425" 41 | "combined_length", "Number of bases in the combined reads", "611531" 42 | "target_name", "The mapping target, determined by minimap", "chr7" 43 | "strand", "'+' if query and target on the same strand; '-' if opposite", "\+" 44 | "start_match", "Start coordinate on the original strand", "46731340" 45 | "end_match", "End coordinate on the original strand", "46791591" 46 | "cat_read_id", "Read ids of all the reads in this group", "82eed45a-7774-4778-8f8a-eb17d7010116|6e9c7720-b7a3-47cc-8f42-30e2219add4b" 47 | "count", "Number of reads in this group", "2" 48 | 49 | 
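The fused read summary is plain tab-separated text, so it can be inspected with standard tools. A minimal sketch (added here for illustration; the filename and columns follow the defaults and the table above):

.. code-block:: python

    import pandas as pd

    # Load the whale_watch.py / bulkvis fuse output
    fused_df = pd.read_csv("fused_reads.txt", sep="\t")
    # cat_read_id holds the '|'-separated ids of every read in a chain
    fused_df["read_ids"] = fused_df["cat_read_id"].str.split("|")
    print(fused_df[["coords", "combined_length", "count"]].head())
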
50 | whale_merge.py 51 | -------------- 52 | .. code-block:: bash 53 | 54 | Parse sequencing_summary.txt files and .paf files to find chained reads in an 55 | Oxford Nanopore Dataset and output fused fastq files 56 | 57 | General options: 58 | -h, --help Show this help and exit 59 | -d , --distance Specify the maximum distance between consecutive 60 | mappings. This is the difference between 'Target Start' 61 | and 'Target End' in the paf file. Defaults to 10000 62 | 63 | Input sources: 64 | -s , --summary A sequencing summary file generated by albacore 65 | -p , --paf A paf file generated by minimap2 66 | -f , --readfiles Full path to the folder containing fastq files you wish 67 | to join 68 | 69 | Output files: 70 | -o , --out-fused Specify name of the fused_read fastq file. This file will 71 | contain fused reads and the remaining singleton reads. 72 | Defaults to 'fused_reads.fastq' 73 | -W Outputs just the fused reads 74 | 75 | 76 | 77 | set_config.py 78 | ------------- 79 | .. code-block:: bash 80 | 81 | Generate a configuration file required for bulkvis to run 82 | 83 | General options: 84 | -h, --help Show this help and exit 85 | 86 | Input sources: 87 | -b , --bulkfile A bulk-fast5 file to get labels from 88 | -i , --input-dir The path to the folder containing bulk-files for 89 | visualisation 90 | -e , --export-dir The path to the folder where read-files will be written 91 | by bulkvis 92 | 93 | Output: 94 | -c , --config Path to the config.ini file in your bulkvis installation 95 | 96 | 97 | Figure scripts 98 | -------------- 99 | whale_plot.py 100 | ^^^^^^^^^^^^^ 101 | .. code-block:: bash 102 | 103 | Parse sequencing_summary.txt, .paf, and bulk fast5 files to generate CSV files 104 | containing the distributions of MinKNOW events around read starts and ends. 105 | These are divided into unique reads, split reads and internal reads. The R 106 | script, whale.R, is called to generate the plot; this requires the packages: 107 | ggplot2, tidyr, and dplyr. Note: of the MinKNOW classifications only above, 108 | adapter, pore, transition, unblocking, and unclassified are included. 109 | 110 | General options: 111 | -h, --help Show this help and exit 112 | -d DISTANCE, --distance DISTANCE 113 | Specify the maximum distance, in bases, between 114 | consecutive mappings. 
This is the difference between 115 | 'Target Start' and 'Target End' in a paf file 116 | (default: 10000) 117 | -V, --verbose Print verbose output to terminal (default: False) 118 | 119 | Input sources: 120 | -b BULK_FILE, --bulk-file BULK_FILE 121 | An ONT bulk fast5 file containing raw signal (default: 122 | None) 123 | -s SUMMARY, --summary SUMMARY 124 | A sequencing summary file generated by albacore 125 | (default: None) 126 | -p PAF, --paf PAF A paf file generated by minimap2 (default: None) 127 | -t TIME, --time TIME +/- time around a strand event in seconds (default: 128 | 10) 129 | 130 | Output files: 131 | --no-generate-plot If set, do not generate density plot (default: False) 132 | -A A CSV of MinKNOW events occurring before and after 133 | correctly called read starts (default: 134 | unique_read_start.csv) 135 | -B B CSV of MinKNOW events occurring before and after 136 | correctly called read ends (default: 137 | unique_read_end.csv) 138 | -C C CSV of MinKNOW events occurring before and after the 139 | start of the first incorrectly split read in a group 140 | (default: split_read_start.csv) 141 | -D D CSV of MinKNOW events occurring before and after 142 | incorrectly called read starts, within a group of 143 | incorrectly split reads (default: 144 | internal_read_start.csv) 145 | -E E CSV of MinKNOW events occurring before and after 146 | incorrectly called read ends, within a group of 147 | incorrectly split reads (default: 148 | internal_read_end.csv) 149 | -F F CSV of MinKNOW events occurring before and after the 150 | end of the first incorrectly split read in a group 151 | (default: split_read_end.csv) 152 | --out OUT Specify the output filename for the plot. File 153 | extension must be one of [.eps, .ps, .tex, .pdf, 154 | .jpeg, .tiff, .png, .bmp, .svg, .wmf] (default: 155 | classification_count.pdf) 156 | 157 | 158 | Example plot: 159 | """"""""""""" 160 | .. figure:: _static/images/utilities/01_plot.png 161 | :class: figure 162 | :alt: Example whale_plot.py output, showing six columns: unique read start, unique read end, split read start, internal read start, internal read end, split read end. Each column shows the count of different classifications (above, adapter, pore, transition, unblocking, unclassified) around read starts and ends. 163 | 164 | Example plot from whale_plot.py 165 | 166 | whale.R 167 | ^^^^^^^ 168 | 169 | This R script is called by whale_plot.py to produce the above plot; it requires `Rscript` and can also be run independently. To run: 170 | 171 | .. code-block:: bash 172 | 173 | $ Rscript whale.R col_A.csv col_B.csv col_C.csv col_D.csv col_E.csv col_F.csv <> <> 174 | 175 | The order in which arguments are given is essential in this script; otherwise labels will not match. 176 | The output filename must include a file extension from `[.eps, .ps, .tex, .pdf, .jpeg, .tiff, .png, .bmp, .svg, .wmf]`. 177 | Run id is not required to execute this script. 178 | 179 | pod_plot.py 180 | ^^^^^^^^^^^ 181 | .. code-block:: bash 182 | 183 | Generate plots for all reads in a fused_reads.txt file. This uses bokeh to 184 | render a plot and requires selenium, phantomjs, and Pillow to be installed. 185 | These are available via conda/pip. 186 | 187 | General options: 188 | -h, --help Show this help and exit 189 | 190 | Input sources: 191 | -f , --fused A fused read file generated by whale_watch.py 192 | -b , --bulk-file An ONT bulk-fast5-file 193 | 194 | Output files: 195 | -D , --out-dir Specify the output directory where plots will be saved. 
Defaults to current working directory 197 | 198 | gen_bmf.py 199 | ---------- 200 | .. code-block:: bash 201 | 202 | Parse sequencing_summary.txt files and .paf files to format mapping info for 203 | bulkvis 204 | 205 | General options: 206 | -h, --help Show this help and exit 207 | 208 | Input sources: 209 | -s , --summary A sequencing summary file generated by albacore 210 | -p , --paf A paf file generated by minimap2 211 | 212 | Output: 213 | --bmf Specify the output folder, where files will be written as 214 | .bmf. This should be the 'map' path specified in 215 | the config.ini 216 | 217 | bulk_info.py 218 | ------------- 219 | .. code-block:: bash 220 | 221 | Given a directory containing bulk fast5 files, output a csv containing the run 222 | information for them 223 | 224 | General options: 225 | -h, --help Show this help and exit 226 | 227 | Input sources: 228 | -d , --dir A directory containing bulk-fast5-files 229 | 230 | Output sources: 231 | -o , --out Output csv filename 232 | 233 | Other scripts 234 | ------------- 235 | 236 | channelmaps.py 237 | ^^^^^^^^^^^^^^ 238 | `channelmaps.py` is a utility script that is designed to be called by other scripts. It contains the physical layout of 239 | ONT MinION flowcells and allows lookup by channel number, reverse lookup by coordinates, and can return a list of 240 | surrounding channels. 241 | 242 | stitch.py 243 | ^^^^^^^^^ 244 | `stitch.py` is a utility script that is called from bulkvis; it produces the read fast5 file from the squiggle data. 245 | -------------------------------------------------------------------------------- /env.yml: -------------------------------------------------------------------------------- 1 | name: bulkvis 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - python=3.11 8 | - pip 9 | - pip: 10 | - numpy==1.26.4 11 | - git+https://github.com/LooseLab/bulkvis.git 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | version = {} 4 | with open("bulkvis/_version.py") as fh: 5 | exec(fh.read(), version) 6 | 7 | install_requires = [ 8 | "bokeh>=2.1.0,<2.4.0", 9 | "h5py", 10 | "pandas>1.0,<2.0", 11 | "tornado", 12 | "tqdm", 13 | "readpaf", 14 | ] 15 | 16 | setup( 17 | name="bulkvis", 18 | version=version["__version__"], 19 | author="Alexander Payne", 20 | install_requires=install_requires, 21 | entry_points={ 22 | "console_scripts": [ 23 | "bulkvis=bulkvis.bulkvis:main", 24 | ], 25 | }, 26 | packages=["bulkvis", "bulkvis.bulkvis_server"], 27 | python_requires=">=3.6", 28 | include_package_data=True, 29 | ) 30 | --------------------------------------------------------------------------------