├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── bulkvis
│   ├── __init__.py
│   ├── _version.py
│   ├── bulkvis.py
│   ├── bulkvis_server
│   │   ├── main.py
│   │   └── templates
│   │       ├── index.html
│   │       └── styles.css
│   ├── cite.py
│   ├── core.py
│   ├── fuse.py
│   ├── mappings.py
│   ├── merge.py
│   └── serve.py
├── docs
│   ├── Makefile
│   ├── _static
│   │   ├── css
│   │   │   └── custom.css
│   │   ├── icons
│   │   │   ├── save.png
│   │   │   ├── xpan.png
│   │   │   └── zoom.png
│   │   └── images
│   │       ├── bulk_file
│   │       │   ├── 01_pop_up.png
│   │       │   ├── 02_read_config.png
│   │       │   └── 03_bulk_config.png
│   │       ├── quickstart
│   │       │   ├── 01_initial.png
│   │       │   ├── 02_position.png
│   │       │   ├── 03_plot.png
│   │       │   ├── 04_sidebar.png
│   │       │   ├── 05_annotations.png
│   │       │   ├── 06_adjustments.png
│   │       │   ├── 07_plot.png
│   │       │   └── 08_read_file.png
│   │       └── utilities
│   │           └── 01_plot.png
│   ├── collecting_a_bulk_file.rst
│   ├── conf.py
│   ├── index.rst
│   ├── installation.rst
│   ├── make.bat
│   ├── quickstart.rst
│   └── utilities.rst
├── env.yml
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g.
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | .DS_STORE 141 | *.fast5 142 | .idea/ 143 | *.ini 144 | docs/_build 145 | docs/_template 146 | data/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 LooseLab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include bulkvis/ *.py 2 | recursive-include bulkvis/ *.html 3 | recursive-include bulkvis/ *.css 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ⇜ bulkvis ⇝ 2 | ============ 3 | 4 | An app written in Python3 using [Bokeh][1] to visualise raw squiggle data from Oxford Nanopore Technologies (ONT) bulkfiles. 5 | 6 | Quickstart 7 | ========== 8 | 9 | Our preferred installation method uses `conda` with this environment setup: 10 | ```yaml 11 | name: bulkvis 12 | channels: 13 | - bioconda 14 | - conda-forge 15 | - defaults 16 | dependencies: 17 | - python=3.11 18 | - pip 19 | - pip: 20 | - numpy==1.26.4 21 | - git+https://github.com/LooseLab/bulkvis.git@2.0 22 | ``` 23 | 24 | Either copy the YAML above into a file or: 25 | 26 | ```console 27 | curl -O https://raw.githubusercontent.com/LooseLab/bulkvis/2.0/env.yml 28 | conda env create -f env.yml 29 | ``` 30 | 31 | Then bulkvis can be started using: 32 | ```console 33 | conda activate bulkvis 34 | bulkvis serve --show 35 | ``` 36 | 37 |
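To check that the install worked without launching the server, the top-level CLI can be queried for its version (the string comes from `bulkvis/_version.py`):

```console
bulkvis --version
bulkvis 2.0.1
```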
38 | Or, using a plain Python virtual environment:
39 |
40 | ```bash
41 | # Make a python3 virtual environment
42 | python3 -m venv bulkvis
43 |
44 | # Activate the virtual environment
45 | source bulkvis/bin/activate
46 |
47 | # Install directly from the GitHub repository
48 | pip install git+https://github.com/LooseLab/bulkvis.git@2.0
49 |
50 | # Start the bokeh server
51 | bulkvis serve --show
52 | ```
53 |
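The server reads data directories and plot defaults from a config file (`*.ini` files are listed in `.gitignore`, so each install keeps its own copy). Below is a minimal sketch assembled from the option names referenced in `bulkvis/bulkvis_server/main.py` (`cfg_dr`, `cfg_po`, and `cfg_lo`); the section names, paths, and values shown here are illustrative assumptions, not shipped defaults:

```ini
; Hypothetical config.ini; section names are assumptions
[data]
dir = /path/to/bulk_fast5/   ; directory scanned for bulk .fast5 files
map = /path/to/mappings/     ; directory scanned for .bmf mapping files
out = /path/to/read_files/   ; where exported read files are written

[plot_opts]
wdg_width = 300              ; sidebar widget width (px)
plot_width = 980             ; plot width (px)
plot_height = 800            ; plot height (px)
label_height = 720           ; y-axis position where annotations are drawn
y_min = 0                    ; fixed y-axis minimum
y_max = 2200                 ; fixed y-axis maximum
lower_cut_off = 0            ; discard signal points below this value
upper_cut_off = 2200         ; discard signal points above this value
output_backend = canvas     ; bokeh rendering backend

[labels]
; one entry per annotation class; True = ticked by default
pore = True
strand = True
```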
54 |
55 | Other installation requirements
56 | ===
57 |
58 | To open some bulk FAST5 files, the [`vbz compression plugins`][2] are required.
59 | These are written and maintained by Oxford Nanopore Technologies.
60 |
61 |
62 | [1]: https://github.com/bokeh/bokeh/
63 | [2]: https://github.com/nanoporetech/vbz_compression
64 |
--------------------------------------------------------------------------------
/bulkvis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/bulkvis/__init__.py
--------------------------------------------------------------------------------
/bulkvis/_version.py:
--------------------------------------------------------------------------------
1 | __version__ = "2.0.1"
2 |
--------------------------------------------------------------------------------
/bulkvis/bulkvis.py:
--------------------------------------------------------------------------------
1 | """bulkvis.py
2 |
3 | This is the main entry point for the bulkvis CLI
4 | """
5 | import argparse
6 | import importlib
7 |
8 | from ._version import __version__
9 |
10 |
11 | def run_command(parser, args):
12 |     try:
13 |         command = importlib.import_module(f"bulkvis.{args.command}")
14 |     except ImportError:
15 |         parser.exit(2, f"Could not use subcommand: {args.command!r}")
16 |
17 |     command.run(parser, args)
18 |
19 |
20 | def main():
21 |     parser = argparse.ArgumentParser(
22 |         prog="bulkvis",
23 |         epilog="See '<command> --help' to read about a specific sub-command.",
24 |     )
25 |     version = f"bulkvis {__version__}"
26 |     parser.add_argument("--version", action="version", version=version)
27 |     subparsers = parser.add_subparsers(dest="command", help="Sub-commands")
28 |
29 |     for module in ["fuse", "merge", "serve", "mappings", "cite"]:
30 |         _module = importlib.import_module(f"bulkvis.{module}")
31 |         _parser = subparsers.add_parser(
32 |             module, description=_module._help, help=_module._help
33 |         )
34 |         for *flags, opts in _module._cli:
35 |             _parser.add_argument(*flags, **opts)
36 |         _parser.set_defaults(func=run_command)
37 |
38 |     args = parser.parse_args()
39 |     if args.command is not None:
40 |         args.func(parser, args)
41 |     else:
42 |         parser.print_help()
43 |
44 |
45 | if __name__ == "__main__":
46 |     main()
47 |
48 | # TODO: Changelog and deprecations
49 | # TODO: github workflows
50 | # TODO: Make sure CLIs match run
51 |
--------------------------------------------------------------------------------
/bulkvis/bulkvis_server/main.py:
--------------------------------------------------------------------------------
1 | import configparser
2 | from dateutil import parser
3 | import math
4 | from pathlib import Path
5 | import re
6 | import io
7 | import argparse
8 | import logging
9 | from collections import OrderedDict
10 |
11 | import h5py
12 | import numpy as np
13 | import pandas as pd
14 | from bokeh.layouts import row, column
15 | from bokeh.models import (
16 |     TextInput,
17 |     Toggle,
18 |     Div,
19 |     Range1d,
20 |     Label,
21 |     Span,
22 |     Title,
23 |     LabelSet,
24 |     RadioButtonGroup,
25 | )
26 | from bokeh.models import (
27 |     CheckboxGroup,
28 |     Dropdown,
29 |     PreText,
30 |     Select,
31 |     Button,
32 |     ColumnDataSource,
33 | )
34 | from bokeh.plotting import curdoc, figure
35 |
36 |
37 | def export_read_file(channel, start_index, end_index, bulkfile, output_dir):
38 |     """
39 |     Export a read file generated from index coordinates and a channel number.
40 |     :param channel: int, channel number
41 |     :param start_index: int, start index
for read 42 | :param end_index: int, end index for read 43 | :param bulkfile: bulkfile object 44 | :param output_dir: str, output directory, including trailing slash 45 | :return: 0 for success 46 | """ 47 | out_filename = Path(bulkfile.filename).stem 48 | # out_filename = ( 49 | # bulkfile["UniqueGlobalKey"]["context_tags"].attrs["filename"].decode("utf8") 50 | # ) 51 | 52 | output_arg = "{dir}/{fn}_bulkvis-read_{start}-{end}_ch_{ch}.fast5".format( 53 | dir=output_dir, 54 | fn=out_filename, 55 | start=start_index, 56 | end=end_index, 57 | ch=channel, 58 | ) 59 | 60 | LOGGER.info(f"Exporting to {output_arg}") 61 | 62 | readfile = h5py.File(output_arg, "w") 63 | read_id_str = "{ch}-{start}-{end}".format( 64 | ch=channel, start=start_index, end=end_index 65 | ) 66 | version_num = 0.6 67 | 68 | ch_num = channel 69 | ch_str = "Channel_{ch}".format(ch=ch_num) 70 | 71 | ugk = readfile.create_group("UniqueGlobalKey") 72 | 73 | bulkfile.copy("UniqueGlobalKey/context_tags", ugk) 74 | bulkfile.copy("UniqueGlobalKey/tracking_id", ugk) 75 | bulkfile.copy("IntermediateData/{ch}/Meta".format(ch=ch_str), ugk) 76 | 77 | readfile["UniqueGlobalKey"]["channel_id"] = readfile["UniqueGlobalKey"]["Meta"] 78 | readfile["UniqueGlobalKey"]["channel_id"].attrs.create( 79 | "sampling_rate", 80 | readfile["UniqueGlobalKey"]["Meta"].attrs["sample_rate"], 81 | None, 82 | dtype="Float64", 83 | ) 84 | del readfile["UniqueGlobalKey"]["Meta"] 85 | 86 | readfile["UniqueGlobalKey"]["channel_id"].attrs.create( 87 | "channel_number", ch_num, None, dtype=" start_index).dropna() 109 | read_number = 0 110 | attrs = { 111 | "duration": {"val": end_index - start_index, "d": "uint32"}, 112 | "median_before": {"val": df.iloc[0].median_before, "d": "Float64"}, 113 | "read_id": {"val": read_id_str, "d": "channel:start-end or a complete FASTQ header.""", 266 | css_classes=["position-drop"], 267 | ) 268 | app_data["wdg_dict"]["position"] = TextInput( 269 | value="", 270 | placeholder="e.g 391:120-150 or complete FASTQ header", 271 | css_classes=["position-label"], 272 | ) 273 | read_bmf(app_data["app_vars"]["Run ID"]) 274 | app_data["wdg_dict"]["position"].on_change("value", parse_position) 275 | 276 | layout.children[0] = column( 277 | list(app_data["wdg_dict"].values()), width=int(cfg_po["wdg_width"]) 278 | ) 279 | 280 | 281 | def read_bmf(run_id): 282 | run_id = run_id + ".bmf" 283 | try: 284 | app_data["bmf"] = pd.read_csv(Path(Path(cfg_dr["map"]) / run_id), sep="\t") 285 | # filter mappings to just this run 286 | app_data["bmf"] = app_data["bmf"][ 287 | app_data["bmf"]["run_id"] == app_data["app_vars"]["Run ID"] 288 | ] 289 | except FileNotFoundError: 290 | pass 291 | except Exception as e: 292 | print(e) 293 | return 294 | 295 | 296 | def open_bulkfile(path): 297 | # !!! 
add in check to see if this is a ONT bulkfile 298 | # Open bulkfile in read-only mode 299 | open_file = h5py.File(path, "r") 300 | # Get sample frequency, how many data points are collected each second 301 | sf = int( 302 | open_file["UniqueGlobalKey"]["context_tags"] 303 | .attrs["sample_frequency"] 304 | .decode("utf8") 305 | ) 306 | attributes = OrderedDict( 307 | [ 308 | ( 309 | "tracking_id", 310 | [ 311 | ("Experiment", "sample_id"), 312 | ("Flowcell ID", "flow_cell_id"), 313 | ("MinKNOW version", "version"), 314 | ("Protocols version", "protocols_version"), 315 | ("MinION ID", "device_id"), 316 | ("Hostname", "hostname"), 317 | ("Run ID", "run_id"), 318 | ("ASIC ID", "asic_id"), 319 | ("Experiment start", "exp_start_time"), 320 | ], 321 | ), 322 | ( 323 | "context_tags", 324 | [ 325 | ("Sequencing kit", "sequencing_kit"), 326 | ("Flowcell type", "flowcell_type"), 327 | ], 328 | ), 329 | ] 330 | ) 331 | 332 | for k, v in attributes.items(): 333 | for attribute in v: 334 | try: 335 | app_data["app_vars"][attribute[0]] = ( 336 | open_file["UniqueGlobalKey"][k].attrs[attribute[1]].decode("utf8") 337 | ) 338 | if attribute[1] == "exp_start_time": 339 | app_data["app_vars"][attribute[0]] = parser.parse( 340 | app_data["app_vars"][attribute[0]] 341 | ).strftime("%d-%b-%Y %H:%M:%S") 342 | except KeyError: 343 | app_data["app_vars"][attribute[0]] = "N/A" 344 | return open_file, sf, attributes 345 | 346 | 347 | # noinspection PyUnboundLocalVariable 348 | def parse_position(attr, old, new): 349 | if re.match(r"^(\@[a-f0-9\-]{36})([a-z0-9=\s]{1,})ch=[0-9]{1,4}", new): 350 | # https://regex101.com/r/9VvgNM/4 351 | # Match UUID / read_id as fastq str 352 | # ^(\@[a-f0-9\-]{36}) 353 | # Match lowercase a-z, 0-9, '=' and whitespace 354 | # ([a-z0-9=\s]{1,}) 355 | # Match 'ch=' and up to 4 numbers 356 | # ch=[0-9]{1,4} 357 | # if new[0] == "@": 358 | input_error(app_data["wdg_dict"]["position"], "remove") 359 | fq = new[1:] 360 | fq_list = fq.split(" ") 361 | # split out read_id and channel 362 | for k, item in enumerate(fq_list): 363 | if k == 0: 364 | read_id = item 365 | if item.split("=")[0] == "ch": 366 | channel_num = item.split("=")[1] 367 | channel_str = "Channel_{num}".format(num=channel_num) 368 | # Get ch_str, start, end 369 | # If read_id and ch not set... 
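    # Example with hypothetical values: pasting a full FASTQ header such as
    #   @2f1e3b7a-0c4d-4f6e-9a1b-5d8c7e6f0a2b runid=abc123 read=802 ch=391 start_time=...
    # strips the leading '@', takes the first whitespace-separated field as
    # read_id and the 'ch=' field as channel_num, then looks the read up in
    # IntermediateData/Channel_391/Reads to convert the header into
    # channel:start-end coordinates.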
370 | # noinspection PyUnboundLocalVariable 371 | if read_id and channel_str: 372 | int_data_path = app_data["bulkfile"]["IntermediateData"][channel_str][ 373 | "Reads" 374 | ] 375 | int_data_labels = { 376 | "read_id": int_data_path["read_id"], 377 | "read_start": int_data_path["read_start"], 378 | } 379 | df = pd.DataFrame(data=int_data_labels) 380 | df.read_start = df.read_start / app_data["app_vars"]["sf"] 381 | df.read_id = df.read_id.str.decode("utf8") 382 | df = df.where(df.read_id == read_id) 383 | df = df.dropna() 384 | if len(df) > 2: 385 | start_time = math.floor(df.iloc[0, :].read_start) 386 | end_time = math.ceil(df.iloc[-1, :].read_start) 387 | else: 388 | input_error(app_data["wdg_dict"]["position"], "add") 389 | return 390 | else: 391 | input_error(app_data["wdg_dict"]["position"], "add") 392 | return 393 | app_data["wdg_dict"]["position"].value = "{ch}:{start}-{end}".format( 394 | ch=channel_num, start=start_time, end=end_time 395 | ) 396 | elif re.match(r"^([0-9]{1,4}:[0-9]{1,9}-[0-9]{1,9})\Z", new): 397 | # https://regex101.com/r/zkN1j2/2 398 | input_error(app_data["wdg_dict"]["position"], "remove") 399 | coords = new.split(":") 400 | times = coords[1].split("-") 401 | channel_num = coords[0] 402 | channel_str = "Channel_{num}".format(num=channel_num) 403 | (start_time, end_time) = int(times[0]), int(times[1]) 404 | if end_time - start_time <= 0: 405 | input_error(app_data["wdg_dict"]["position"], "add") 406 | return 407 | else: 408 | input_error(app_data["wdg_dict"]["position"], "add") 409 | return 410 | 411 | if int(end_time) > app_data["app_vars"]["len_ds"]: 412 | end_time = app_data["app_vars"]["len_ds"] 413 | app_data["app_vars"]["channel_str"] = channel_str 414 | app_data["app_vars"]["channel_num"] = int(channel_num) 415 | app_data["app_vars"]["start_time"] = int(start_time) 416 | app_data["app_vars"]["end_time"] = int(end_time) 417 | 418 | app_data["wdg_dict"]["position"].value = "{ch}:{start}-{end}".format( 419 | ch=app_data["app_vars"]["channel_num"], 420 | start=app_data["app_vars"]["start_time"], 421 | end=app_data["app_vars"]["end_time"], 422 | ) 423 | 424 | update() 425 | 426 | 427 | def update_data(bulkfile, app_vars): 428 | app_vars["duration"] = app_vars["end_time"] - app_vars["start_time"] 429 | # get times and squiggles 430 | app_vars["start_squiggle"] = math.floor(app_vars["start_time"] * app_vars["sf"]) 431 | app_vars["end_squiggle"] = math.floor(app_vars["end_time"] * app_vars["sf"]) 432 | # get data in numpy arrays 433 | step = 1 / app_vars["sf"] 434 | app_data["x_data"] = np.arange(app_vars["start_time"], app_vars["end_time"], step) 435 | app_data["y_data"] = bulkfile["Raw"][app_vars["channel_str"]]["Signal"][()] 436 | app_vars["len_ds"] = len(app_data["y_data"]) / app_vars["sf"] 437 | app_data["y_data"] = app_data["y_data"][ 438 | app_vars["start_squiggle"] : app_vars["end_squiggle"] 439 | ] 440 | # get annotations 441 | path = bulkfile["IntermediateData"][app_vars["channel_str"]]["Reads"] 442 | fields = ["read_id", "read_start", "modal_classification"] 443 | app_data["label_df"], app_data["label_dt"] = get_annotations( 444 | path, fields, "modal_classification" 445 | ) 446 | app_data["label_df"] = app_data["label_df"].drop_duplicates( 447 | subset=["read_id", "modal_classification"], keep="first" 448 | ) 449 | app_data["label_df"].read_start = app_data["label_df"].read_start / app_vars["sf"] 450 | app_data["label_df"].read_id = app_data["label_df"].read_id.str.decode("utf8") 451 | 452 | path = 
bulkfile["StateData"][app_vars["channel_str"]]["States"] 453 | fields = ["acquisition_raw_index", "summary_state"] 454 | state_label_df, state_label_dtypes = get_annotations(path, fields, "summary_state") 455 | state_label_df.acquisition_raw_index = ( 456 | state_label_df.acquisition_raw_index / app_vars["sf"] 457 | ) 458 | state_label_df = state_label_df.rename( 459 | columns={ 460 | "acquisition_raw_index": "read_start", 461 | "summary_state": "modal_classification", 462 | } 463 | ) 464 | app_data["label_df"] = app_data["label_df"].append( 465 | state_label_df, ignore_index=True 466 | ) 467 | app_data["label_df"].sort_values(by="read_start", ascending=True, inplace=True) 468 | app_data["label_dt"].update(state_label_dtypes) 469 | 470 | 471 | def get_annotations(path, fields, enum_field): 472 | data_labels = {} 473 | for field in fields: 474 | data_labels[field] = path[field] 475 | data_dtypes = {} 476 | if h5py.check_dtype(enum=path.dtype[enum_field]): 477 | dataset_dtype = h5py.check_dtype(enum=path.dtype[enum_field]) 478 | # data_dtype may lose some dataset dtypes there are duplicates of 'v' 479 | data_dtypes = {v: k for k, v in dataset_dtype.items()} 480 | labels_df = pd.DataFrame(data=data_labels) 481 | return labels_df, data_dtypes 482 | 483 | 484 | def build_widgets(): 485 | """""" 486 | check_labels = [] 487 | jump_list = [] 488 | check_active = [] 489 | app_data["label_mp"] = {} 490 | for k, v in enumerate(app_data["label_dt"].items()): 491 | app_data["label_mp"][v[0]] = k 492 | check_labels.append(v[1]) 493 | if v[1] in cfg_lo: 494 | if cfg_lo[v[1]] == "True": 495 | check_active.append(k) 496 | jump_list.append((v[1], str(v[0]))) 497 | # else: 498 | # # print("label {v} is in your bulk-file but not defined in config.ini".format(v=v[1])) 499 | # check_active.append(k) 500 | 501 | if len(check_active) == len(check_labels): 502 | filter_toggle_active = 0 503 | elif len(check_active) == 0: 504 | filter_toggle_active = 1 505 | else: 506 | filter_toggle_active = None 507 | 508 | wdg = app_data["wdg_dict"] 509 | wdg["duration"] = PreText( 510 | text="Duration: {d} seconds".format(d=app_data["app_vars"]["duration"]), 511 | css_classes=["duration_pre"], 512 | ) 513 | # wdg['navigation_label'] = Div(text='Navigation:', css_classes=['navigation-dropdown', 'help-text']) 514 | # wdg['navigation_text'] = Div( 515 | # text="""Use the Jump to ... buttons to find the next or previous event type. 516 | # """, 517 | # css_classes=['navigation-drop'] 518 | # ) 519 | wdg["jump_next"] = Dropdown( 520 | label="Jump to next", 521 | button_type="primary", 522 | menu=jump_list, 523 | css_classes=["jump-block"], 524 | ) 525 | wdg["jump_prev"] = Dropdown( 526 | label="Jump to previous", button_type="primary", menu=jump_list 527 | ) 528 | 529 | wdg["export_label"] = Div( 530 | text="Export data:", css_classes=["export-dropdown", "help-text"] 531 | ) 532 | wdg["export_text"] = Div( 533 | text="""Export data, as a read file, from the current position. These are written to the output directory 534 | specified in your config file. 
535 | """, 536 | css_classes=["export-drop"], 537 | ) 538 | wdg["save_read_file"] = Button( 539 | label="Save read file", button_type="success", css_classes=[] 540 | ) 541 | # wdg['bulkfile_info'] = Div(text='Bulkfile info', css_classes=['bulkfile-dropdown', 'caret-down']) 542 | # wdg['bulkfile_help'] = Div(text='Bulkfile info help:', css_classes=['bulkfile-help-dropdown', 'help-text', 'bulkfile-drop']) 543 | # wdg['bulkfile_help_text'] = Div( 544 | # text="""This contains basic information about the experiment that is recorded in the bulk-fast5-file. 545 | # """, 546 | # css_classes=['bulkfile-help-drop'] 547 | # ) 548 | wdg["bulkfile_text"] = Div(text="", css_classes=["bulkfile-drop"]) 549 | for k, v in app_data["app_vars"]["attributes"].items(): 550 | for entry in v: 551 | wdg[ 552 | "bulkfile_text" 553 | ].text += "{f}:
<b> {val}</b><br>
".format( 554 | f=entry[0], val=app_data["app_vars"][entry[0]] 555 | ) 556 | # wdg['label_options'] = Div(text='Select annotations', css_classes=['filter-dropdown', 'caret-down']) 557 | # wdg['filter_help'] = Div(text='filter help:', css_classes=['filter-help-dropdown', 'help-text', 'filter-drop']) 558 | # wdg['filter_help_text'] = Div( 559 | # text="""Select which bulkfile annotations should be rendered on the chart. 'Display annotations' will turn all 560 | # annotations on or off. 561 | # """, 562 | # css_classes=['filter-help-drop'] 563 | # ) 564 | wdg["toggle_annotations"] = Toggle( 565 | label="Display annotations", 566 | button_type="danger", 567 | css_classes=["toggle_button_g_r", "filter-drop"], 568 | active=True, 569 | ) 570 | wdg["toggle_mappings"] = Toggle( 571 | label="Display mappings", 572 | button_type="danger", 573 | css_classes=["toggle_button_g_r", "filter-drop"], 574 | active=True, 575 | ) 576 | wdg["filter_toggle_group"] = RadioButtonGroup( 577 | labels=["Select all", "Select none"], 578 | active=filter_toggle_active, 579 | css_classes=["filter-drop"], 580 | ) 581 | wdg["label_filter"] = CheckboxGroup( 582 | labels=check_labels, active=check_active, css_classes=["filter-drop"] 583 | ) 584 | 585 | # wdg['plot_options'] = Div(text='Plot adjustments', css_classes=['adjust-dropdown', 'caret-down']) 586 | # wdg['adjust_help'] = Div(text='adjust help:', css_classes=['adjust-help-dropdown', 'help-text', 'adjust-drop']) 587 | # wdg['adjust_help_text'] = Div( 588 | # text="""Adjust chart parameters, such as width, height and where annotations are rendered. These are set in the 589 | # config.ini, where the default values can be edited. 590 | # """, 591 | # css_classes=['adjust-help-drop'] 592 | # ) 593 | wdg["po_width"] = TextInput( 594 | title="Plot Width (px)", value=cfg_po["plot_width"], css_classes=["adjust-drop"] 595 | ) 596 | wdg["po_height"] = TextInput( 597 | title="Plot Height (px)", 598 | value=cfg_po["plot_height"], 599 | css_classes=["adjust-drop"], 600 | ) 601 | wdg["label_height"] = TextInput( 602 | title="Annotation height (y-axis)", 603 | value=cfg_po["label_height"], 604 | css_classes=["adjust-drop"], 605 | ) 606 | wdg["po_y_max"] = TextInput( 607 | title="y max", 608 | value=cfg_po["y_max"], 609 | css_classes=["adjust-drop", "toggle_y_target"], 610 | ) 611 | wdg["po_y_min"] = TextInput( 612 | title="y min", 613 | value=cfg_po["y_min"], 614 | css_classes=["adjust-drop", "toggle_y_target"], 615 | ) 616 | wdg["toggle_y_axis"] = Toggle( 617 | label="Fixed Y-axis", 618 | button_type="danger", 619 | css_classes=["toggle_button_g_r", "adjust-drop", "toggle_y_axis"], 620 | active=False, 621 | ) 622 | wdg["toggle_smoothing"] = Toggle( 623 | label="Smoothing", 624 | button_type="danger", 625 | css_classes=["toggle_button_g_r", "adjust-drop"], 626 | active=True, 627 | ) 628 | 629 | wdg["label_filter"].on_change("active", update_checkboxes) 630 | wdg["filter_toggle_group"].on_change("active", update_toggle) 631 | wdg["jump_next"].on_click(next_update) 632 | wdg["jump_prev"].on_click(prev_update) 633 | wdg["save_read_file"].on_click(export_data) 634 | 635 | for name in toggle_inputs: 636 | wdg[name].on_click(toggle_button) 637 | for name in int_inputs: 638 | wdg[name].on_change("value", is_input_int) 639 | return wdg 640 | 641 | 642 | def create_figure(x_data, y_data, wdg, app_vars): 643 | def vline(x_coords, y_upper, y_lower): 644 | # Return a dataset that can plot vertical lines 645 | x_values = np.vstack((x_coords, x_coords)).T 646 | y_upper_list = np.full((1, 
len(x_values)), y_upper) 647 | y_lower_list = np.full((1, len(x_values)), y_lower) 648 | y_values = np.vstack((y_lower_list, y_upper_list)).T 649 | return x_values.tolist(), y_values.tolist() 650 | 651 | def hlines(y_coords, x_lower, x_upper): 652 | """ 653 | 654 | Parameters 655 | ---------- 656 | y_coords: (int, float) height to plot lines at 657 | x_lower: (int, float) lower x coord 658 | x_upper: (int, float) upper x coord 659 | 660 | Returns 661 | ------- 662 | 663 | """ 664 | x_values = np.vstack((x_lower, x_upper)).T 665 | y_values_list = np.full((1, len(x_values)), y_coords) 666 | y_values = np.vstack((y_values_list, y_values_list)).T 667 | return x_values.tolist(), y_values.tolist() 668 | 669 | if wdg["toggle_smoothing"].active: 670 | w_range = app_vars["duration"] 671 | divisor = math.e ** 2.5 672 | thin_factor = math.ceil(w_range / divisor) 673 | else: 674 | thin_factor = 1 675 | if thin_factor == 0: 676 | thin_factor = 1 677 | 678 | greater_delete_index = np.argwhere(y_data > int(cfg_po["upper_cut_off"])) 679 | x_data = np.delete(x_data, greater_delete_index) 680 | y_data = np.delete(y_data, greater_delete_index) 681 | 682 | lesser_delete_index = np.argwhere(y_data < int(cfg_po["lower_cut_off"])) 683 | x_data = np.delete(x_data, lesser_delete_index) 684 | y_data = np.delete(y_data, lesser_delete_index) 685 | 686 | x_data = x_data[::thin_factor] 687 | y_data = y_data[::thin_factor] 688 | 689 | data = { 690 | "x": x_data, 691 | "y": y_data, 692 | } 693 | 694 | source = ColumnDataSource(data=data) 695 | 696 | p = figure( 697 | plot_height=int(wdg["po_height"].value), 698 | plot_width=int(wdg["po_width"].value), 699 | toolbar_location="right", 700 | tools=["xbox_zoom", "xpan", "undo", "reset", "save"], 701 | active_drag="xbox_zoom", 702 | ) 703 | if cfg_po["output_backend"] not in output_backend: 704 | p.output_backend = "canvas" 705 | else: 706 | p.output_backend = cfg_po["output_backend"] 707 | # Add step/% points plotted: Step: {sp} ({pt:.3f}) -> sp=thin_factor, pt=1/thin_factor 708 | p.add_layout( 709 | Title( 710 | text="Channel: {ch} Start: {st} End: {ed} Sample rate: {sf}".format( 711 | ch=app_vars["channel_num"], 712 | st=app_vars["start_time"], 713 | ed=app_vars["end_time"], 714 | sf=app_vars["sf"], 715 | ) 716 | ), 717 | "above", 718 | ) 719 | p.add_layout( 720 | Title( 721 | text="bulk FAST5 file: {s}".format( 722 | s=app_data["wdg_dict"]["file_list"].value 723 | ) 724 | ), 725 | "above", 726 | ) 727 | 728 | p.toolbar.logo = None 729 | p.yaxis.axis_label = "Raw signal" 730 | p.yaxis.major_label_orientation = "horizontal" 731 | p.xaxis.axis_label = "Time (seconds)" 732 | p.line(source=source, x="x", y="y", line_width=1) 733 | p.xaxis.major_label_orientation = math.radians(45) 734 | p.x_range.range_padding = 0.01 735 | 736 | # set padding manually 737 | y_min = np.amin(data["y"]) 738 | y_max = np.amax(data["y"]) 739 | pad = (y_max - y_min) * 0.1 / 2 740 | p.y_range = Range1d(y_min - pad, y_max + pad) 741 | try: 742 | app_data["bmf"] 743 | except NameError: 744 | bmf_set = False 745 | except KeyError: 746 | bmf_set = False 747 | else: 748 | bmf_set = True 749 | if bmf_set and wdg["toggle_mappings"].active: 750 | LOGGER.info("Plotting mappings") 751 | # set padding manually 752 | # lower_pad = (y_max - y_min) * 0.1 / 2 753 | # upper_pad = (y_max - y_min) / 2 754 | # p.y_range = Range1d(y_min - lower_pad, y_max + upper_pad) 755 | # set mapping track midpoints 756 | # upper_mapping = upper_pad / 4 * 3 + y_max 757 | # lower_mapping = upper_pad / 4 + y_max 758 | lower_mapping = 
int(wdg["label_height"].value) + 750 759 | # Select only this channel 760 | slim_bmf = app_data["bmf"][ 761 | app_data["bmf"]["channel"] == app_vars["channel_num"] 762 | ] 763 | # Select the current viewed range 764 | slim_bmf = slim_bmf[ 765 | ( 766 | (slim_bmf["start_time"] > app_vars["start_time"]) 767 | & (slim_bmf["end_time"] < app_vars["end_time"]) 768 | ) 769 | | ( 770 | (slim_bmf["start_time"] < app_vars["start_time"]) 771 | & (slim_bmf["end_time"] < app_vars["end_time"]) 772 | & (slim_bmf["end_time"] > app_vars["start_time"]) 773 | ) 774 | | ( 775 | (slim_bmf["start_time"] > app_vars["start_time"]) 776 | & (slim_bmf["end_time"] > app_vars["end_time"]) 777 | & (slim_bmf["start_time"] < app_vars["end_time"]) 778 | ) 779 | ] 780 | slim_bmf["start_time"] = slim_bmf["start_time"].where( 781 | slim_bmf["start_time"] > app_vars["start_time"], app_vars["start_time"] 782 | ) 783 | slim_bmf["end_time"] = slim_bmf["end_time"].where( 784 | slim_bmf["end_time"] < app_vars["end_time"], app_vars["end_time"] 785 | ) 786 | 787 | slim_bmf["height"] = lower_mapping 788 | slim_bmf["offset"] = ( 789 | np.ones(len(slim_bmf)) * 5 790 | + slim_bmf.groupby(["start_time", "end_time"]).cumcount() * 15 791 | ) 792 | # Convert slim_bmf to ColDataSrc 793 | mapping_source = ColumnDataSource(data=slim_bmf.to_dict(orient="list")) 794 | # Add labels to LabelSet 795 | mapping_labels = LabelSet( 796 | x="start_time", 797 | y="height", 798 | text="label", 799 | level="glyph", 800 | x_offset=5, 801 | y_offset="offset", 802 | source=mapping_source, 803 | render_mode="canvas", 804 | ) 805 | p.add_layout(mapping_labels) 806 | # Add some colour here 807 | # Forward Vertical lines => blue 808 | p_x, p_y = vline( 809 | np.concatenate( 810 | [ 811 | slim_bmf["start_time"] 812 | .where(slim_bmf["strand"] == "+") 813 | .dropna() 814 | .values, 815 | slim_bmf["end_time"] 816 | .where(slim_bmf["strand"] == "+") 817 | .dropna() 818 | .values, 819 | ] 820 | ), 821 | lower_mapping + 20, 822 | lower_mapping - 20, 823 | ) 824 | p.multi_line(p_x, p_y, line_dash="solid", color="blue", line_width=1) 825 | # Reverse Vertical lines => red 826 | p_x, p_y = vline( 827 | np.concatenate( 828 | [ 829 | slim_bmf["start_time"] 830 | .where(slim_bmf["strand"] == "-") 831 | .dropna() 832 | .values, 833 | slim_bmf["end_time"] 834 | .where(slim_bmf["strand"] == "-") 835 | .dropna() 836 | .values, 837 | ] 838 | ), 839 | lower_mapping + 20, 840 | lower_mapping - 20, 841 | ) 842 | p.multi_line(p_x, p_y, line_dash="solid", color="red", line_width=1) 843 | # Horizontal lines 844 | p_x, p_y = hlines( 845 | lower_mapping, 846 | slim_bmf["start_time"].where(slim_bmf["strand"] == "+").dropna(), 847 | slim_bmf["end_time"].where(slim_bmf["strand"] == "+").dropna(), 848 | ) 849 | p.multi_line(p_x, p_y, line_dash="solid", color="blue", line_width=1) 850 | # Horizontal lines 851 | p_x, p_y = hlines( 852 | lower_mapping, 853 | slim_bmf["start_time"].where(slim_bmf["strand"] == "-").dropna(), 854 | slim_bmf["end_time"].where(slim_bmf["strand"] == "-").dropna(), 855 | ) 856 | p.multi_line(p_x, p_y, line_dash="solid", color="red", line_width=1) 857 | 858 | if wdg["toggle_y_axis"].active: 859 | p.y_range = Range1d(int(wdg["po_y_min"].value), int(wdg["po_y_max"].value)) 860 | if wdg["toggle_annotations"].active: 861 | # Map modal_classifications onto df 862 | app_data["label_df"]["mc_active_map"] = app_data["label_df"][ 863 | "modal_classification" 864 | ].map(app_data["label_mp"]) 865 | app_data["label_df"]["mc_label_map"] = app_data["label_df"][ 866 | 
"modal_classification" 867 | ].map(app_data["label_dt"]) 868 | # Here labels are thinned out 869 | slim_label_df = app_data["label_df"][ 870 | (app_data["label_df"]["read_start"] >= app_vars["start_time"]) 871 | & (app_data["label_df"]["read_start"] <= app_vars["end_time"]) 872 | ] 873 | # Use pd.isin to remove unwanted annotations from the slimmed df 874 | slim_label_df = slim_label_df[ 875 | slim_label_df["mc_active_map"].isin(wdg["label_filter"].active) == True 876 | ] 877 | # get coordinates and vstack them to produce [[x, x], [x, x]...] 878 | line_x_values = np.vstack( 879 | (slim_label_df["read_start"].values, slim_label_df["read_start"].values) 880 | ).T 881 | tmp_list = np.full((1, len(line_x_values)), -10000) 882 | line_y_values = np.vstack((tmp_list, tmp_list * -1)).T 883 | # Add all vertical lines as multi_line 884 | p.multi_line( 885 | line_x_values.tolist(), 886 | line_y_values.tolist(), 887 | line_dash="dashed", 888 | color="green", 889 | line_width=1, 890 | ) 891 | # combine series to form label 892 | slim_label_df["label"] = ( 893 | slim_label_df["mc_label_map"] 894 | + " - " 895 | + slim_label_df["read_id"].astype("str") 896 | ) 897 | # Create ColumnDataSource combining labels and coordinates 898 | label_source = ColumnDataSource( 899 | data=dict( 900 | x=slim_label_df["read_start"].values, 901 | y=np.full((len(slim_label_df), 1), int(wdg["label_height"].value)), 902 | t=slim_label_df["label"].values, 903 | ) 904 | ) 905 | # Add all labels as a label set 906 | labels = LabelSet( 907 | x="x", 908 | y="y", 909 | text="t", 910 | level="glyph", 911 | x_offset=0, 912 | y_offset=0, 913 | source=label_source, 914 | render_mode="canvas", 915 | angle=-270, 916 | angle_units="deg", 917 | ) 918 | p.add_layout(labels) 919 | 920 | return column(p, css_classes=["plot_div"]) 921 | 922 | 923 | def is_input_int(attr, old, new): 924 | try: 925 | int(new) 926 | for wdg in int_inputs: 927 | if (app_data["wdg_dict"][wdg].value == new) and ( 928 | "input-error" in app_data["wdg_dict"][wdg].css_classes 929 | ): 930 | input_error(app_data["wdg_dict"][wdg], "remove") 931 | except ValueError: 932 | for wdg in int_inputs: 933 | if app_data["wdg_dict"][wdg].value == new: 934 | input_error(app_data["wdg_dict"][wdg], "add") 935 | return 936 | 937 | new = new.lstrip("0") 938 | update() 939 | 940 | 941 | def toggle_button(state): 942 | layout.children[1] = create_figure( 943 | app_data["x_data"], 944 | app_data["y_data"], 945 | app_data["wdg_dict"], 946 | app_data["app_vars"], 947 | ) 948 | 949 | 950 | def input_error(widget, mode): 951 | """""" 952 | if mode == "add": 953 | widget.css_classes.append("input-error") 954 | elif mode == "remove": 955 | if widget.css_classes: 956 | del widget.css_classes[-1] 957 | else: 958 | print("mode not recognised") 959 | 960 | 961 | def update(): 962 | update_data(app_data["bulkfile"], app_data["app_vars"]) 963 | if app_data["INIT"]: 964 | build_widgets() 965 | layout.children[0] = column( 966 | list(app_data["wdg_dict"].values()), width=int(cfg_po["wdg_width"]) 967 | ) 968 | app_data["INIT"] = False 969 | app_data["wdg_dict"]["duration"].text = "Duration: {d} seconds".format( 970 | d=app_data["app_vars"]["duration"] 971 | ) 972 | app_data["wdg_dict"]["toggle_smoothing"].active = True 973 | layout.children[1] = create_figure( 974 | app_data["x_data"], 975 | app_data["y_data"], 976 | app_data["wdg_dict"], 977 | app_data["app_vars"], 978 | ) 979 | 980 | 981 | def update_other(attr, old, new): 982 | update() 983 | 984 | 985 | def update_toggle(attr, old, new): 986 | if 
new == 0: 987 | app_data["wdg_dict"]["label_filter"].active = list( 988 | np.arange(0, len(app_data["wdg_dict"]["label_filter"].labels), 1) 989 | ) 990 | elif new == 1: 991 | app_data["wdg_dict"]["label_filter"].active = [] 992 | update() 993 | 994 | 995 | def update_checkboxes(attr, old, new): 996 | if len(new) != len(app_data["wdg_dict"]["label_filter"].labels) and len(new) != 0: 997 | app_data["wdg_dict"]["filter_toggle_group"].active = None 998 | update() 999 | 1000 | 1001 | def next_update(value): 1002 | value = int(value.item) 1003 | jump_start = app_data["label_df"][ 1004 | (app_data["label_df"]["read_start"] > app_data["app_vars"]["start_time"] + 1) 1005 | & (app_data["label_df"]["modal_classification"] == value) 1006 | ] 1007 | try: 1008 | app_data["app_vars"]["start_time"] = int( 1009 | math.floor(jump_start["read_start"].iloc[0]) 1010 | ) 1011 | except IndexError: 1012 | app_data["wdg_dict"]["duration"].text += "\n{ev} event not found".format( 1013 | ev=app_data["label_dt"][value] 1014 | ) 1015 | return 1016 | except Exception as e: 1017 | print(type(e)) 1018 | print(e) 1019 | app_data["app_vars"]["end_time"] = ( 1020 | app_data["app_vars"]["start_time"] + app_data["app_vars"]["duration"] 1021 | ) 1022 | app_data["wdg_dict"]["position"].value = "{ch}:{start}-{end}".format( 1023 | ch=app_data["app_vars"]["channel_num"], 1024 | start=app_data["app_vars"]["start_time"], 1025 | end=app_data["app_vars"]["end_time"], 1026 | ) 1027 | layout.children[1] = create_figure( 1028 | app_data["x_data"], 1029 | app_data["y_data"], 1030 | app_data["wdg_dict"], 1031 | app_data["app_vars"], 1032 | ) 1033 | 1034 | 1035 | def prev_update(value): 1036 | value = int(value.item) 1037 | jump_start = app_data["label_df"][ 1038 | (app_data["label_df"]["read_start"] < app_data["app_vars"]["start_time"]) 1039 | & (app_data["label_df"]["modal_classification"] == value) 1040 | ] 1041 | try: 1042 | app_data["app_vars"]["start_time"] = int( 1043 | math.floor(jump_start["read_start"].iloc[-1]) 1044 | ) 1045 | except IndexError: 1046 | app_data["wdg_dict"]["duration"].text += "\n{ev} event not found".format( 1047 | ev=app_data["label_dt"][value] 1048 | ) 1049 | return 1050 | except Exception as e: 1051 | print(type(e)) 1052 | print(e) 1053 | app_data["app_vars"]["end_time"] = ( 1054 | app_data["app_vars"]["start_time"] + app_data["app_vars"]["duration"] 1055 | ) 1056 | app_data["wdg_dict"]["position"].value = "{ch}:{start}-{end}".format( 1057 | ch=app_data["app_vars"]["channel_num"], 1058 | start=app_data["app_vars"]["start_time"], 1059 | end=app_data["app_vars"]["end_time"], 1060 | ) 1061 | layout.children[1] = create_figure( 1062 | app_data["x_data"], 1063 | app_data["y_data"], 1064 | app_data["wdg_dict"], 1065 | app_data["app_vars"], 1066 | ) 1067 | 1068 | 1069 | def export_data(): 1070 | try: 1071 | start_val = math.floor( 1072 | app_data["app_vars"]["start"] * app_data["app_vars"]["sf"] 1073 | ) 1074 | end_val = math.ceil(app_data["app_vars"]["end"] * app_data["app_vars"]["sf"]) 1075 | except KeyError: 1076 | start_val = app_data["app_vars"]["start_squiggle"] 1077 | end_val = app_data["app_vars"]["end_squiggle"] 1078 | if ( 1079 | export_read_file( 1080 | app_data["app_vars"]["channel_num"], 1081 | start_val, 1082 | end_val, 1083 | app_data["bulkfile"], 1084 | cfg_dr["out"], 1085 | ) 1086 | == 0 1087 | ): 1088 | app_data["wdg_dict"]["duration"].text += "\nread file created" 1089 | else: 1090 | app_data["wdg_dict"]["duration"].text += "\nError: read file not created" 1091 | 1092 | 1093 | app_data = { 1094 | 
"file_src": None, # bulkfile path (string) 1095 | "bulkfile": None, # bulkfile object 1096 | "bmf": None, # bmf dataframe 1097 | "x_data": None, # numpy ndarray time points 1098 | "y_data": None, # numpy ndarray signal data 1099 | "label_df": None, # pandas df of signal labels 1100 | "label_dt": None, # dict of signal enumeration 1101 | "label_mp": None, # dict matching labels to widget filter 1102 | "app_vars": { # dict of variables used in plots and widgets 1103 | "len_ds": None, # length of signal dataset 1104 | "start_time": None, # squiggle start time in seconds 1105 | "end_time": None, # squiggle end time in seconds 1106 | "duration": None, # squiggle duration in seconds 1107 | "start_squiggle": None, # squiggle start position (samples) 1108 | "end_squiggle": None, # squiggle end position (samples) 1109 | "channel_str": None, # 'Channel_NNN' (string) 1110 | "channel_num": None, # Channel number (int) 1111 | "sf": None, # sample frequency (int) 1112 | "attributes": None, # OrderedDict of bulkfile attr info 1113 | }, 1114 | "wdg_dict": None, # dictionary of widgets 1115 | "controls": None, # widgets added to widgetbox 1116 | "pore_plt": None, # the squiggle plot 1117 | "INIT": True, # Initial plot with bulkfile (bool) 1118 | } 1119 | 1120 | int_inputs = ["po_width", "po_height", "po_y_min", "po_y_max", "label_height"] 1121 | toggle_inputs = ["toggle_y_axis", "toggle_annotations", "toggle_smoothing"] 1122 | 1123 | app_data["app_vars"]["files"] = [] 1124 | p = Path(cfg_dr["dir"]) 1125 | app_data["app_vars"]["files"] = [ 1126 | (x.name, x.name) for x in p.iterdir() if x.suffix == ".fast5" 1127 | ] 1128 | m = Path(cfg_dr["map"]) 1129 | app_data["app_vars"]["map_files"] = [ 1130 | (x.name, x.name) for x in m.iterdir() if x.suffix == ".bmf" 1131 | ] 1132 | app_data["app_vars"]["map_files"].insert(0, ("", "--")) 1133 | # check files are useable by h5py 1134 | for index, file in enumerate(app_data["app_vars"]["files"]): 1135 | file = file[0] 1136 | try: 1137 | bulk_file = h5py.File(Path(Path(cfg_dr["dir"]) / file), "r") 1138 | except OSError: 1139 | app_data["app_vars"]["files"][index] = None 1140 | continue 1141 | try: 1142 | try_path = bulk_file["Raw"] 1143 | except KeyError: 1144 | app_data["app_vars"]["files"][index] = None 1145 | continue 1146 | for i, channel in enumerate(try_path): 1147 | if i == 0: 1148 | try: 1149 | try_path[channel]["Signal"][0] 1150 | except KeyError: 1151 | app_data["app_vars"]["files"][index] = None 1152 | break 1153 | bulk_file.flush() 1154 | bulk_file.close() 1155 | app_data["app_vars"]["files"] = list( 1156 | filter((None).__ne__, app_data["app_vars"]["files"]) 1157 | ) 1158 | app_data["app_vars"]["files"].insert(0, ("", "--")) 1159 | 1160 | app_data["wdg_dict"] = init_wdg_dict() 1161 | app_data["controls"] = column( 1162 | list(app_data["wdg_dict"].values()), width=int(cfg_po["wdg_width"]) 1163 | ) 1164 | 1165 | f = figure(toolbar_location=None) 1166 | f.line(x=[0], y=[0]) 1167 | f.outline_line_color = None 1168 | f.toolbar.logo = None 1169 | f.xaxis.visible = False 1170 | f.yaxis.visible = False 1171 | f.xgrid.visible = False 1172 | f.ygrid.visible = False 1173 | app_data["pore_plt"] = column(f, css_classes=["plot_div"]) 1174 | 1175 | layout = row(app_data["controls"], app_data["pore_plt"]) 1176 | 1177 | curdoc().add_root(layout) 1178 | curdoc().title = "bulkvis" 1179 | -------------------------------------------------------------------------------- /bulkvis/bulkvis_server/templates/index.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {{ bokeh_css }} 5 | 8 | 9 | {{ bokeh_js }} 10 | 11 | 12 | {{ plot_div|indent(8) }} 13 | {{ plot_script|indent(8) }} 14 | 15 | 16 | -------------------------------------------------------------------------------- /bulkvis/bulkvis_server/templates/styles.css: -------------------------------------------------------------------------------- 1 | @import url(https://fonts.googleapis.com/css?family=Noto+Sans); 2 | 3 | body { 4 | font-family: 'Noto Sans', sans-serif; 5 | -webkit-font-smoothing: antialiased; 6 | text-rendering: optimizeLegibility; 7 | display: block; 8 | margin: 0; 9 | } 10 | .plot_div { 11 | position: fixed !important; 12 | } 13 | .toggle_button_g_r > .bk-btn-group > .bk-active { 14 | background-color: #5cb85c; 15 | font-style: italic; 16 | } 17 | .toggle_button_o_r > .bk-btn-group > .bk-active { 18 | background-color: #ed9c28; 19 | font-style: italic; 20 | } 21 | .input-error > .bk-input-group > input { 22 | background-color: #d9534f; 23 | } 24 | label.bk > span.bk { 25 | font-family: monospace; 26 | } 27 | code { 28 | overflow-wrap: anywhere; 29 | } -------------------------------------------------------------------------------- /bulkvis/cite.py: -------------------------------------------------------------------------------- 1 | import textwrap 2 | 3 | _help = "Output the citation for this tool and exit" 4 | _cli = () 5 | 6 | 7 | def run(parser, args): 8 | cite = textwrap.fill( 9 | ( 10 | "Alexander Payne, Nadine Holmes, Vardhman Rakyan, Matthew Loose, " 11 | "BulkVis: a graphical viewer for Oxford nanopore bulk FAST5 files, " 12 | "Bioinformatics, Volume 35, Issue 13, 1 July 2019, Pages 2193–2198" 13 | ), 14 | width=70, 15 | subsequent_indent=" " * 10, 16 | ) 17 | url = "https://academic.oup.com/bioinformatics/article/35/13/2193/5193712" 18 | doi = "10.1093/bioinformatics/bty841" 19 | print("Thank you for using bulkvis!\n") 20 | print(f"Citation: {cite}") 21 | print(f"URL: {url}") 22 | print(f"DOI: {doi}") 23 | -------------------------------------------------------------------------------- /bulkvis/core.py: -------------------------------------------------------------------------------- 1 | """core.py 2 | """ 3 | from pathlib import Path 4 | import sys 5 | import numpy as np 6 | import pandas as pd 7 | import traceback 8 | 9 | 10 | def concat_files_to_df(file_list, **kwargs): 11 | """Return a pandas.DataFrame from a list of files 12 | Parameters 13 | ---------- 14 | file_list : list 15 | List of files to be concatenated 16 | kwargs 17 | Any parameter used by pandas.read_csv except 'filepath_or_buffer'. These will be applied to all 18 | files in 'file_list' 19 | Returns 20 | ------- 21 | pandas.DataFrame 22 | Raises 23 | ------ 24 | pandas.errors.ParserError 25 | Raises pandas.errors.ParserError if input file(s) do not match expected format or shape. 
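    Examples
    --------
    >>> # Hypothetical file names; every pandas.read_csv keyword except
    >>> # 'filepath_or_buffer' is applied to each file in the list
    >>> df = concat_files_to_df(["run1_summary.txt", "run2_summary.txt"], sep="\t")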
26 | """ 27 | kwargs = remove_kwargs(["filepath_or_buffer"], **kwargs) 28 | df_list = [] 29 | for f in file_list: 30 | try: 31 | df_list.append(pd.read_csv(filepath_or_buffer=f, **kwargs)) 32 | except pd.errors.ParserError as e: 33 | sys.exit( 34 | "ParserError\nUsually caused by an input file not being the expected format" 35 | ) 36 | except Exception as e: 37 | traceback.print_exc() 38 | sys.exit(1) 39 | return pd.concat(df_list, ignore_index=True) 40 | 41 | 42 | def remove_kwargs(remove_list, **kwargs): 43 | """Remove items from kwargs dict that may cause conflict with successive function calls""" 44 | # return {k: v for k, v in kwargs.items() if k not in remove_list} # This iterates the entire dict 45 | for item in remove_list: # This just iterates the remove_list 46 | _ = kwargs.pop(item, None) 47 | return kwargs 48 | 49 | 50 | def length_stats(lengths): 51 | """Return count [COUNT], minimum [MIN], maximum [MAX], mean [MEAN], and N50 [N50] of an array 52 | Parameters 53 | ---------- 54 | lengths : array_like 55 | List of integers 56 | Returns 57 | ------- 58 | dictionary 59 | """ 60 | return { 61 | "COUNT": int(len(lengths)), 62 | "MIN": int(np.min(lengths)), 63 | "MAX": int(np.max(lengths)), 64 | "MEAN": int(np.mean(lengths)), 65 | "N50": _get_n50(np.sort(lengths)), 66 | } 67 | 68 | 69 | def _get_n50(lengths): 70 | """Return N50 statistic for a list of read lengths 71 | Parameters 72 | ---------- 73 | lengths array_like 74 | List of sorted, ascending, integers 75 | Returns 76 | ------- 77 | integer 78 | """ 79 | return int(lengths[np.where(np.cumsum(lengths) >= np.sum(lengths) / 2)][0]) 80 | 81 | 82 | def readable_yield(num, suffix="B"): 83 | """Return a human readable string of yield using si/metric prefixes 84 | Parameters 85 | ---------- 86 | num : int (or float) 87 | Integer of total number of bases 88 | suffix : str 89 | String to append to si/metric prefixes ['', 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'] 90 | Returns 91 | ------- 92 | string 93 | """ 94 | for unit in ["", "k", "M", "G", "T", "P", "E", "Z"]: 95 | if abs(num) < 1000: 96 | return "{n:3.2f} {u}{s}".format(n=num, u=unit, s=suffix) 97 | num /= 1000 98 | return "{n:3.1f} {u}{s}".format(n=num, u="Y", s=suffix) 99 | 100 | 101 | def human_readable_yield(num: int, factor: int = 1000, suffix: str = "B") -> str: 102 | """Return a human readable string of a large number using SI unit prefixes 103 | Parameters 104 | ---------- 105 | num : int 106 | A number to convert to decimal form 107 | factor : int 108 | The SI factor, use 1000 for SI units and 1024 for binary multiples 109 | suffix : str 110 | The suffix to place after the SI prefix, for example use B for SI units and iB for binary multiples 111 | Returns 112 | ------- 113 | str 114 | Returns the input number formatted to two decimal places with the SI unit and suffix 115 | """ 116 | for unit in ["", "k", "M", "G", "T", "P", "E", "Z"]: 117 | if abs(num) < factor: 118 | return "{n:3.2f} {u}{s}".format(n=num, u=unit, s=suffix) 119 | num /= factor 120 | return "{n:3.2f} {u}{s}".format(n=num, u="Y", s=suffix) 121 | 122 | 123 | def top_n(df, field, n): 124 | """Print top N reads, by length, from a dataset 125 | Parameters 126 | ---------- 127 | df : pandas.DataFrame 128 | DataFrame containing a Series with length values 129 | field : str 130 | The key for the Series containing length values 131 | n : int 132 | The number of values to print 133 | Returns 134 | ------- 135 | None 136 | """ 137 | df = df.sort_values(by=field, ascending=False) 138 | rows = df.filter([field], 
axis=1).head(n=n).reset_index()
139 |     max_len = max(
140 |         [len(str(r[field])) + len(str(r[field])) // 3 for i, r in rows.iterrows()]
141 |     )
142 |     for idx, row in rows.iterrows():
143 |         print("{i}:\t{len:>{m},}".format(i=idx + 1, m=max_len, len=row[field]))
144 |     return
145 |
146 |
147 | def find_files_of_type(file_or_directory, file_extensions):
148 |     """Return a list of pathlib.Path of files with chosen extensions
149 |     Parameters
150 |     ----------
151 |     file_or_directory : str
152 |         filepath or a directory
153 |     file_extensions : list
154 |         A list of lowercase file extensions including '.' e.g. ['.txt', '.paf']
155 |     Returns
156 |     -------
157 |     list
158 |         If files with extension are found return list of pathlib.Path objects, otherwise return empty list
159 |     """
160 |     file_or_directory = Path(file_or_directory).expanduser()
161 |     if (
162 |         file_or_directory.is_file()
163 |         and "".join(file_or_directory.suffixes).lower() in file_extensions
164 |     ):
165 |         return [file_or_directory]
166 |     elif file_or_directory.is_dir():
167 |         return [
168 |             x
169 |             for x in file_or_directory.iterdir()
170 |             if "".join(x.suffixes).lower() in file_extensions
171 |         ]
172 |     else:
173 |         return []
174 |
175 |
176 | def fuse_reads(seq_sum_df, paf_df, distance=10000, alt=True):
177 |     """Find fused reads from sequencing_summary.txt and paf files
178 |     Parse sequencing_summary.txt and mapping.paf files to infer reads that may
179 |     have been incorrectly split by MinKNOW. This approach is based on read number
180 |     and mapping, therefore a _good_ mapping is required.
181 |     Parameters
182 |     ----------
183 |     seq_sum_df : pandas.DataFrame
184 |         A pandas.DataFrame from a sequencing_summary.txt file; this must contain
185 |         the columns `['channel', 'start_time', 'duration', 'run_id', 'read_id',
186 |         'sequence_length_template', 'filename']`.
187 |     paf_df : pandas.DataFrame
188 |         A pandas.DataFrame from a .paf file, these are generated by minimap2.
189 |         As this file type doesn't have headers, the following parameters are
190 |         the minimum required for using a file with this function:
191 |         `usecols=[0, 4, 5, 7, 8], names=['Qname', 'Strand', 'Tname', 'Tstart', 'Tend']`
192 |     distance : int
193 |         The distance, in bases, between the end coordinate of a read mapping and
194 |         the start coordinate of a successive read from the same channel. Defaults to 10000
195 |     alt : bool
196 |         Include alternate assemblies, default is True. If set to True (include
197 |         alternate assemblies) the 'new' dataset may have more bases than the 'original'
198 |         input dataset due to reads mapping to alternate contigs.
199 | Returns 200 | ------- 201 | fused_reads_df : pandas.DataFrame 202 | pandas.DataFrame containing fused reads 203 | un_fused_reads_df : pandas.DataFrame 204 | pandas.DataFrame containing un-fused reads 205 | to_be_fused_reads_df : pandas.DataFrame 206 | pandas.DataFrame containing reads that are fused 'parts' in the same 207 | format as un_fused_reads_df 208 | """ 209 | # TODO: raise error if required columns are not present 210 | # TODO: raise error if columns are not of correct types 211 | # Remove zero length reads and sort seq_sum_df 212 | seq_sum_df = seq_sum_df[seq_sum_df["sequence_length_template"] != 0].sort_values( 213 | by=["channel", "run_id", "start_time"] 214 | ) 215 | # Create extra Series for finding fused reads 216 | seq_sum_df["next_read_id"] = seq_sum_df["read_id"].shift(-1) 217 | seq_sum_df["next_start_time"] = seq_sum_df["start_time"].shift(-1) 218 | seq_sum_df["next_end"] = seq_sum_df["next_start_time"] + seq_sum_df[ 219 | "duration" 220 | ].shift(-1) 221 | seq_sum_df["next_sequence_length_template"] = seq_sum_df[ 222 | "sequence_length_template" 223 | ].shift(-1) 224 | seq_sum_df["combined_length"] = ( 225 | seq_sum_df["sequence_length_template"] 226 | + seq_sum_df["next_sequence_length_template"] 227 | ) 228 | seq_sum_df["next_sequence_length_template"] = ( 229 | seq_sum_df["next_sequence_length_template"].fillna(0).astype("int64") 230 | ) 231 | seq_sum_df["combined_length"] = ( 232 | seq_sum_df["combined_length"].fillna(0).astype("int64") 233 | ) 234 | 235 | # Merge seq_sum_df and paf_df on read_id/Qname; this aligns the read and the mapping information 236 | df = pd.merge(seq_sum_df, paf_df, left_on="read_id", right_on="Qname", how="outer") 237 | # Merge df with paf_df on next_read_id/Qname; this aligns each read with it's 238 | # successor giving suffix '_A' and '_B' respectively 239 | df2 = pd.merge( 240 | df, 241 | paf_df, 242 | left_on="next_read_id", 243 | right_on="Qname", 244 | how="outer", 245 | suffixes=("_A", "_B"), 246 | ) 247 | df2 = df2.dropna().reset_index() 248 | 249 | # If df2 had no rows, no merging has taken place 250 | if len(df2) == 0: 251 | return None, None, None 252 | 253 | # Condition where Qname (read_id) does NOT match 254 | not_qname = df2["Qname_A"] != df2["Qname_B"] 255 | # Condition where Strand matches 256 | yes_strand = df2["Strand_A"] == df2["Strand_B"] 257 | # Condition where Target (chromosome) matches 258 | yes_tname = df2["Tname_A"] == df2["Tname_B"] 259 | 260 | df2 = df2[not_qname & yes_strand & yes_tname] 261 | 262 | # End program if no rows 263 | if len(df2) == 0: 264 | return None, None, None 265 | 266 | df2["match_distance"] = np.where( 267 | df2["Strand_A"] == "+", # Where: Strand is '+' 268 | df2["Tstart_B"] - df2["Tend_A"], # True: read_2_start - read_1_end 269 | df2["Tstart_A"] - df2["Tend_B"], # False: read_1_start - read_2_end 270 | ) 271 | # Remove reads outside of the distance parameter 272 | df2 = df2[(df2["match_distance"] > 0) & (df2["match_distance"] < distance)] 273 | 274 | # End program if no rows 275 | if len(df2) == 0: 276 | return None, None, None 277 | 278 | df2 = df2.drop_duplicates( 279 | subset=[ 280 | "channel", 281 | "start_time", 282 | "duration", 283 | "next_start_time", 284 | "read_id", 285 | "next_read_id", 286 | "sequence_length_template", 287 | "next_sequence_length_template", 288 | "combined_length", 289 | ], 290 | keep="first", 291 | ) 292 | # separate df into read groups and set index to cs to allow grouping 293 | cond_1 = df2["next_read_id"] == df2["read_id"].shift(-1) 294 | cond_2 = 
df2["read_id"] == df2["next_read_id"].shift(-1) 295 | df2["COND"] = np.where(cond_1 | cond_2, True, False) 296 | df2["W"] = np.where(df2["COND"].shift(1) == False, 1, 0) 297 | df2["cs"] = df2["W"].cumsum() 298 | 299 | """UNDER HERE NOT REVISED OR COMMENTED WELL""" 300 | # TODO: finish commenting 301 | 302 | if alt: 303 | groupby_list = ["cs", "Tname_B"] 304 | else: 305 | groupby_list = ["cs"] 306 | df2 = df2.set_index(groupby_list) 307 | df2_groupby = df2.groupby(level=groupby_list) 308 | 309 | # group and concatenate read ids 310 | df2["all_but_last"] = df2_groupby["read_id"].apply("|".join) 311 | df2["last_read_id"] = df2_groupby["next_read_id"].last() 312 | 313 | # TODO: this is the failing point, can it be cut off sooner? 314 | 315 | df2["cat_read_id"] = df2["all_but_last"] + "|" + df2["last_read_id"] 316 | 317 | # group and combine length 318 | df2["combined_length"] = df2_groupby["sequence_length_template"].sum() 319 | df2["last_length"] = df2_groupby["next_sequence_length_template"].last() 320 | df2["combined_length"] = df2["combined_length"] + df2["last_length"] 321 | 322 | # take max/min for end/start match from grouped value list 323 | df2["start_match"] = ( 324 | df2_groupby[["Tstart_A", "Tstart_B", "Tend_A", "Tend_B"]] 325 | .transform("min") 326 | .min(axis=1) 327 | ) 328 | df2["end_match"] = ( 329 | df2_groupby[["Tstart_A", "Tstart_B", "Tend_A", "Tend_B"]] 330 | .transform("max") 331 | .max(axis=1) 332 | ) 333 | 334 | # group and add start and end times 335 | df2["start_time"] = df2_groupby["start_time"].first() 336 | df2["next_end"] = df2_groupby["next_end"].last() 337 | 338 | # add the duration (time between start and end) 339 | df2["duration"] = df2["next_end"] - df2["start_time"] 340 | 341 | # format and add coordinates 342 | df2["stime_floor"] = np.floor(df2["start_time"]).astype("int64").astype("str") 343 | df2["etime_ceil"] = np.ceil(df2["next_end"]).astype("int64").astype("str") 344 | df2["channel"] = df2["channel"].astype("int64").astype("str") 345 | df2["combined_length"] = df2["combined_length"].astype("int64") 346 | df2["start_match"] = df2["start_match"].astype("int64").astype("str") 347 | df2["end_match"] = df2["end_match"].astype("int64").astype("str") 348 | df2["duration"] = df2["duration"].map("{:.5f}".format) 349 | df2["coords"] = df2["channel"] + ":" + df2["stime_floor"] + "-" + df2["etime_ceil"] 350 | 351 | # rename cols for export 352 | df2.rename(columns={"Tname_A": "target_name", "Strand_A": "strand"}, inplace=True) 353 | 354 | # fused_read_ids is a pd.Series of all fused reads 355 | fused_read_ids = pd.concat([df2["read_id"], df2["next_read_id"]]) 356 | 357 | df2["count"] = df2_groupby.size() + 1 358 | 359 | # remove duplicate entries from df2 360 | df2 = df2.drop_duplicates( 361 | subset=[ 362 | "coords", 363 | "channel", 364 | "start_time", 365 | "duration", 366 | "combined_length", 367 | "start_match", 368 | "end_match", 369 | "cat_read_id", 370 | ], 371 | keep="first", 372 | ) 373 | fused_read_ids = fused_read_ids.unique() 374 | 375 | # un_fused_df contains reads that are correctly split 376 | un_fused_df = seq_sum_df[~seq_sum_df["read_id"].isin(fused_read_ids)].reset_index() 377 | # split_df is reads that have false starts (i.e 2->N) 378 | split_df = seq_sum_df[seq_sum_df["read_id"].isin(fused_read_ids)].reset_index() 379 | 380 | # TODO: CLEAN UP EXTRA SERIES FROM DFS 381 | return df2, un_fused_df, split_df 382 | 383 | 384 | def die(message, status=1): 385 | """Print an error message and call sys.exit with the given status, terminating the 
process""" 386 | print(message, file=sys.stderr) 387 | sys.exit(status) 388 | 389 | 390 | def print_args(args, label="Arguments"): 391 | """Print and format all arguments from the command line""" 392 | print(label + ":") 393 | dirs = dir(args) 394 | m = max([len(a) for a in dirs if a[0] != "_"]) 395 | for attr in dirs: 396 | if attr[0] != "_": 397 | print("{a:<{m}}\t{b}".format(a=attr, m=m, b=getattr(args, attr))) 398 | print("========================================") 399 | 400 | 401 | if __name__ == "__main__": 402 | sys.exit("ERROR: core is not directly executable") 403 | -------------------------------------------------------------------------------- /bulkvis/fuse.py: -------------------------------------------------------------------------------- 1 | from bulkvis.core import ( 2 | concat_files_to_df, 3 | fuse_reads, 4 | length_stats, 5 | human_readable_yield, 6 | top_n, 7 | ) 8 | from collections import OrderedDict 9 | import pandas as pd 10 | import numpy as np 11 | 12 | _help = "Find incorrectly split reads from ONT sequencing_summary.txt and minimap2 .paf files" 13 | _cli = ( 14 | ( 15 | "-d", 16 | "--distance", 17 | dict( 18 | help="Specify the maximum distance between consecutive mappings. This is the difference " 19 | "between 'Target Start' and 'Target End' in the paf file ", 20 | type=int, 21 | default=10000, 22 | metavar="", 23 | ), 24 | ), 25 | ( 26 | "-t", 27 | "--top", 28 | dict( 29 | help="Show top N reads, by length, for the original dataset, fused reads, and " 30 | "corrected dataset", 31 | # This could be written better 32 | type=int, 33 | default=10, 34 | metavar="", 35 | ), 36 | ), 37 | # The behaviour of 'alt' is confusing... it seems like a double negative 38 | ( 39 | "-a", 40 | "--alt", 41 | dict( 42 | help="""Exclude alternate assemblies""", action="store_false", default=True 43 | ), 44 | ), 45 | ( 46 | "-s", 47 | "--summary", 48 | dict( 49 | metavar="", 50 | required=True, 51 | nargs="+", 52 | help="Sequencing summary file(s) generated by albacore or guppy. Can be compressed " 53 | "using gzip, bzip2, xz, or zip", 54 | ), 55 | ), 56 | ( 57 | "-p", 58 | "--paf", 59 | dict( 60 | metavar="", 61 | required=True, 62 | nargs="+", 63 | help="paf file(s) generated by minimap2. Can be compressed using gzip, bzip2, " 64 | "xz, or zip", 65 | ), 66 | ), 67 | ( 68 | "-o", 69 | "--output", 70 | dict( 71 | help="Specify name for the output file. 
This file only contains chains of reads.", 72 | default="fused_reads.txt", 73 | metavar="output", 74 | ), 75 | ), 76 | ) 77 | 78 | 79 | def run(parser, args): 80 | """Input and output controller for bulkvis fuse""" 81 | # Open sequencing_summary_*.txt files into a single pd.DataFrame 82 | seq_sum_df = concat_files_to_df( 83 | file_list=args.summary, 84 | sep="\t", 85 | usecols=[ 86 | "channel", 87 | "start_time", 88 | "duration", 89 | "run_id", 90 | "read_id", 91 | "sequence_length_template", 92 | ], 93 | ) 94 | # Open minimap2 paf files into a single pd.DataFrame 95 | paf_df = concat_files_to_df( 96 | file_list=args.paf, 97 | sep="\t", 98 | header=None, 99 | usecols=[0, 4, 5, 7, 8], 100 | names=["Qname", "Strand", "Tname", "Tstart", "Tend"], 101 | engine="python", 102 | ) 103 | fused_df, un_fused_df, to_be_fused_df = fuse_reads( 104 | seq_sum_df, paf_df, distance=args.distance, alt=args.alt 105 | )
    # fuse_reads returns (None, None, None) when nothing survives filtering
    if fused_df is None:
        raise SystemExit("No fused reads found")
106 | # Get yield numbers 107 | original_bases = np.sum(seq_sum_df["sequence_length_template"]) 108 | new_lengths = pd.concat( 109 | [un_fused_df["sequence_length_template"], fused_df["combined_length"]] 110 | ) 111 | new_bases = np.sum(new_lengths) 112 | seq_sum_lengths = seq_sum_df[seq_sum_df["sequence_length_template"] != 0][ 113 | "sequence_length_template" 114 | ] 115 | # Initialize dictionary for holding metrics 116 | stats = OrderedDict() 117 | stats["Original reads:"] = length_stats(seq_sum_lengths) 118 | stats["Un-fused reads:"] = length_stats(un_fused_df["sequence_length_template"]) 119 | stats["To be fused reads:"] = length_stats( 120 | to_be_fused_df["sequence_length_template"] 121 | ) 122 | stats["Fused reads:"] = length_stats(fused_df["combined_length"]) 123 | stats["New reads:"] = length_stats(new_lengths) 124 | # Convert stats dict to pandas.DataFrame for easy display 125 | stats_df = pd.DataFrame(stats).T[["COUNT", "MIN", "MAX", "MEAN", "N50"]] 126 | print(stats_df) 127 | # TODO: display yield better 128 | print( 129 | "\nOriginal yield {y} ({b:,} bases)".format( 130 | y=human_readable_yield(original_bases), b=original_bases 131 | ) 132 | ) 133 | print( 134 | "Corrected yield {y} ({b:,} bases)\n".format( 135 | y=human_readable_yield(new_bases), b=new_bases 136 | ) 137 | ) 138 | top = abs(args.top) 139 | if top > 0: 140 | print("Top {n} original reads by length:".format(n=top)) 141 | top_n(seq_sum_df, "sequence_length_template", top) 142 | print("Top {n} fused reads by combined length:".format(n=top)) 143 | top_n(fused_df, "combined_length", top) 144 | print("Top {n} reads after correction:".format(n=top)) 145 | top_n(pd.DataFrame(data={"length": new_lengths}), "length", top) 146 | header = [ 147 | "coords", 148 | "run_id", 149 | "channel", 150 | "start_time", 151 | "duration", 152 | "combined_length", 153 | "target_name", 154 | "strand", 155 | "start_match", 156 | "end_match", 157 | "cat_read_id", 158 | "count", 159 | ] 160 | fused_df.to_csv(args.output, sep="\t", header=True, columns=header, index=False) 161 | print("Fused read summary file saved as {f}".format(f=args.output)) 162 | -------------------------------------------------------------------------------- /bulkvis/mappings.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from readpaf import parse_paf 4 | import gzip 5 | 6 | # from argparse import ArgumentParser 7 | from pathlib import Path 8 | 9 | 10 | def run(parser, args): 11 | # Open PAF mapping file with specified columns 12 | if args.paf.endswith(".gz"): 13 | fopen = gzip.open 14 | 
else: 15 | fopen = open 16 | with fopen(args.paf, "rt") as fh: 17 | pf = parse_paf(fh, dataframe=True) 18 | # Thin PAF file by 'Primary alignment type' and drop duplicates 19 | pf = pf[pf["tp"].eq("P")] 20 | pf = pf.sort_values( 21 | ["query_name", "target_name", "mapping_quality"], ascending=[True, True, False] 22 | ) 23 | # col_names = [ 24 | # "Qname", 25 | # "Strand", 26 | # "Tname", 27 | # "Tstart", 28 | # "Tend", 29 | # "mapping_quality", 30 | # "alignment_type", 31 | # ] 32 | # pf = pd.read_csv( 33 | # paf_path, 34 | # sep="\t", 35 | # header=None, 36 | # names=col_names, 37 | # usecols=[0, 4, 5, 7, 8, 11, 12], 38 | # ) 39 | pf = pf.drop_duplicates(["query_name"], keep="first") 40 | # Open sequencing_summary.txt file 41 | cols = ["read_id", "run_id", "channel", "start_time", "duration"] 42 | ss = pd.read_csv(args.summary, sep="\t", usecols=cols) 43 | # Merge seq_sum and paf files 44 | df = pd.merge(ss, pf, left_on="read_id", right_on="query_name", how="outer") 45 | df = df.dropna() 46 | 47 | df["end_time"] = df["start_time"] + df["duration"] 48 | df["start_mapping"] = ( 49 | df[["target_start", "target_end"]] 50 | .min(axis=1) 51 | .astype("int64") 52 | .map("{0:,d}".format) 53 | ) 54 | df["end_mapping"] = ( 55 | df[["target_start", "target_end"]] 56 | .max(axis=1) 57 | .astype("int64") 58 | .map("{0:,d}".format) 59 | ) 60 | df["label"] = ( 61 | df["target_name"].astype("str") 62 | + ": " 63 | + df["start_mapping"].astype("str") 64 | + " - " 65 | + df["end_mapping"].astype("str") 66 | ) 67 | 68 | # df = df.rename(columns={"Tname": "target_name", "Strand": "strand"}) 69 | # Export as .bmf 70 | header = [ 71 | "run_id", 72 | "read_id", 73 | "channel", 74 | "start_time", 75 | "end_time", 76 | "target_name", 77 | "strand", 78 | "start_mapping", 79 | "end_mapping", 80 | "label", 81 | ] 82 | i = 0 83 | for k, v in df.groupby("run_id"):  # group on the bare column so k is the run_id string 84 | # Join 'bmf' path, run_id, and file extension 85 | p = Path(args.bmf).joinpath(str(k) + ".bmf") 86 | v.to_csv(p, sep="\t", header=True, columns=header, index=False) 87 | i += 1 88 | 89 | print("{n} files written to {p}".format(n=i, p=args.bmf)) 90 | 91 | 
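# Note on the .bmf ("bulkvis mapping file") output written above (added
# commentary): run() emits one tab-separated file per run_id containing the
# columns in `header`, including a preformatted label such as
# 'chr7: 46,731,340 - 46,791,591' built from the mapping target and its
# comma-formatted start/end coordinates.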
92 | def full_path(file): 93 | return str(Path(file).expanduser().resolve()) 94 | 95 | 96 | _help = """Parse sequencing_summary.txt files and .paf files to format mapping info for bulkvis""" 97 | _cli = ( 98 | ( 99 | "-s", 100 | "--summary", 101 | dict( 102 | help="A sequencing summary file generated by albacore or guppy", 103 | type=full_path, 104 | default="", 105 | required=True, 106 | metavar="", 107 | ), 108 | ), 109 | ( 110 | "-p", 111 | "--paf", 112 | dict( 113 | help="A paf file generated by minimap2", 114 | type=full_path, 115 | default="", 116 | required=True, 117 | metavar="", 118 | ), 119 | ), 120 | ( 121 | "--bmf", 122 | dict( 123 | help="Specify the output folder, where files will be written as " 124 | ".bmf. This should be the same folder as the bulk FAST5 " 125 | "file for this experiment.", 126 | type=full_path, 127 | metavar="", 128 | required=True, 129 | ), 130 | ), 131 | ) 132 | -------------------------------------------------------------------------------- /bulkvis/merge.py: -------------------------------------------------------------------------------- 1 | """merge.py 2 | """ 3 | from bulkvis.core import die, fuse_reads, concat_files_to_df, find_files_of_type 4 | import pandas as pd 5 | from pathlib import Path 6 | from tqdm import tqdm 7 | 8 | 9 | _help = """Merge FASTQ files based on a fused_reads.txt or ONT 10 | sequencing_summary.txt and minimap2 .paf files""" 11 | _cli = ( 12 | ( 13 | "-d", 14 | "--distance", 15 | dict( 16 | help="Specify the maximum distance between consecutive mappings, only used with " 17 | "--summary and --paf options", 18 | type=int, 19 | default=10000, 20 | ), 21 | ), 22 | ( 23 | # The behaviour of 'alt' is confusing... it seems like a double negative 24 | "-a", 25 | "--alt", 26 | dict( 27 | help="""Exclude alternate assemblies""", action="store_false", default=True 28 | ), 29 | ), 30 | ( 31 | "-s", 32 | "--summary", 33 | dict( 34 | help="Sequencing summary file(s) generated by albacore or guppy", nargs="+", 35 | ), 36 | ), 37 | ( 38 | "-p", 39 | "--paf", 40 | dict(help="paf file(s) generated by minimap2", metavar="", nargs="+"), 41 | ), 42 | ( 43 | "--fused-reads", 44 | dict(help="fused_reads.txt file generated by `bulkvis fuse`", metavar=""), 45 | ), 46 | ( 47 | "-i", 48 | "--input", 49 | dict( 50 | help="FASTQ files or directories of input. If a directory is given, files with extension " 51 | "'.fastq' or '.fq' will be used", 52 | nargs="+", 53 | ), 54 | ), 55 | ( 56 | "-o", 57 | "--output-dir", 58 | dict( 59 | help="Reads will be grouped as fused or un-fused. Fused reads will be saved " 60 | "in this directory. 
If not set, uses the current working directory", 61 | ), 62 | ), 63 | ( 64 | "--format", 65 | dict( 66 | help="Output format for the reads", 67 | default="fastq", 68 | choices=["fastq", "fasta"], 69 | ), 70 | ), 71 | ( 72 | "--all-reads", 73 | dict( 74 | help="Write un-fused reads to 'un_fused_reads.fastq' in the output directory", 75 | action="store_true", 76 | ), 77 | ), 78 | ) 79 | 80 | 81 | # TODO: Simplify, remove summary/paf -> require fused_reads.txt 82 | def run(parser, args): 83 | """Find fused reads and merge fasta/q""" 84 | 85 | """ 86 | Minimum required files for this script to operate: 87 | - sequencing_summary.txt AND mapping.paf 88 | OR 89 | - fused_reads.txt 90 | 91 | If both sets are provided, or neither, a parser error is raised 92 | 93 | This code block provides fused_read_ids and fused_read_tuples 94 | """ 95 | if args.fused_reads and not (args.summary or args.paf): 96 | # Open fused_reads.txt file 97 | fused_df = pd.read_csv(args.fused_reads, sep='\t', usecols=['run_id', 'cat_read_id', 'count']) 98 | # Set fused_read_tuples and flatten into fused_read_ids; a set gives fast membership tests 99 | fused_read_tuples = fused_df['cat_read_id'].str.split('|').tolist() 100 | fused_read_ids = {item for sublist in fused_read_tuples for item in sublist} 101 | elif args.summary and args.paf and not args.fused_reads: 102 | # Open sequencing_summary file and paf file, and run bulkvis.fuse_reads 103 | seq_sum_df = concat_files_to_df(file_list=args.summary, 104 | sep='\t', 105 | usecols=['channel', 'start_time', 'duration', 106 | 'run_id', 'read_id', 'sequence_length_template', 107 | 'filename'] 108 | ) 109 | # Open minimap2 paf files into a single pd.DataFrame 110 | paf_df = concat_files_to_df(file_list=args.paf, 111 | sep='\t', 112 | header=None, 113 | usecols=[0, 4, 5, 7, 8], 114 | names=['Qname', 'Strand', 'Tname', 'Tstart', 'Tend'] 115 | ) 116 | fused_df, un_fused_df, to_be_fused_df = fuse_reads(seq_sum_df, paf_df, distance=args.distance, alt=False)
        # fuse_reads returns (None, None, None) when no fused reads are found
        if fused_df is None:
            die('No fused reads found', status=0)
117 | fused_read_tuples = fused_df['cat_read_id'].str.split('|').tolist() 118 | fused_read_ids = set(to_be_fused_df['read_id']) 119 | else: 120 | # Raise a parser error 121 | parser.error('Either a fused_reads.txt from bulkvis fuse, OR sequencing_summary.txt ' 122 | 'and .paf files, must be provided.') 123 | 124 | # Empty list for fastq file paths 125 | fastq_files = [] 126 | # TODO: Maybe consider gzip support 127 | # These should be lowercase and include the '.' 128 | endings = ['.fastq', '.fq'] 129 | 130 | for file_or_directory in args.input: 131 | fastq_files.extend(find_files_of_type(file_or_directory, endings)) 132 | # Remove None entries from fastq_files 133 | fastq_files = [f for f in fastq_files if f is not None] 134 | 135 | # End if no fastq files are found 136 | if not fastq_files: 137 | die('No FASTQ files found', status=0) 138 | 139 | print('{} fastq files found'.format(len(fastq_files))) 140 | # Create a read dictionary to hold all the fused reads that are found 141 | reads = {} 142 | # Run a loop over fastq_files, opening each file and adding only fused reads to the dictionary. 
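# Record layout note (added commentary): a FASTQ record is four lines — header,
# sequence, '+', qualities. The outer `for` consumes the header line and the
# `next(fastq)` calls consume the remaining three, so the loop always lands on
# the next record's header; only reads whose id is in fused_read_ids are stored.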
143 | for fastq_file in tqdm(fastq_files, desc='FASTQ processed'): 144 | # with open(fastq_file, 'r') as fastq: 145 | with fastq_file.open('r') as fastq: 146 | for line in fastq: 147 | header = line.strip()[1:] 148 | read_id = header.split()[0] 149 | if read_id in fused_read_ids: 150 | sequence = next(fastq).strip() 151 | next(fastq) 152 | qualities = next(fastq).strip() 153 | reads[read_id] = { 154 | 'header': header.split(), 155 | 'sequence': sequence, 156 | 'qualities': qualities 157 | } 158 | else: 159 | next(fastq) 160 | next(fastq) 161 | next(fastq) 162 | 163 | if args.output_dir is not None: 164 | Path(args.output_dir).mkdir(parents=True, exist_ok=True) 165 | p = Path(args.output_dir) 166 | else: 167 | print('No output directory specified, using current working directory') 168 | p = Path('.') 169 | # Set format and create folders for output 170 | if args.format == 'fastq': 171 | fused_read_file = p / 'fused_reads.fastq' 172 | else: 173 | fused_read_file = p / 'fused_reads.fasta' 174 | 175 | # Split out fused reads into new file 176 | write_counter = 0 177 | miss_counter = 0 178 | # with open(fused_read_file, 'w') as output_fused: 179 | with fused_read_file.open('w') as output_fused: 180 | for pair in tqdm(fused_read_tuples, desc='Fused reads written'): 181 | if _read_id_not_in_dict(pair, reads): 182 | miss_counter += 1 183 | continue 184 | if args.format == 'fastq': 185 | read_str = '@{read_id} {run_id} {number}\n{seq}\n+\n{qual}\n'.format( 186 | read_id='|'.join(pair), 187 | run_id=reads[pair[0]]['header'][1], 188 | number=reads[pair[0]]['header'][2], 189 | seq=''.join([reads[s]['sequence'] for s in pair]), 190 | qual=''.join([reads[s]['qualities'] for s in pair]) 191 | ) 192 | else: # fasta 193 | read_str = '>{read_id} {run_id} {number}\n{seq}\n'.format( 194 | read_id='|'.join(pair), 195 | run_id=reads[pair[0]]['header'][1], 196 | number=reads[pair[0]]['header'][2], 197 | seq=''.join([reads[s]['sequence'] for s in pair]) 198 | ) 199 | output_fused.write(read_str) 200 | write_counter += 1 201 | 202 | print('{} fused reads written'.format(write_counter)) 203 | if miss_counter > 0: 204 | print('{} fused reads missed. 
These reads are most likely in the "fail" folder.'.format(miss_counter)) 205 | 206 | # Exit here if --all-reads was not set 207 | if not args.all_reads: 208 | die('', status=0) 209 | 210 | print('Writing un-fused reads') 211 | 212 | # Set the un-fused output file path based on the requested format 213 | if args.format == 'fastq': 214 | un_fused_read_file = p / 'un_fused_reads.fastq' 215 | else: 216 | un_fused_read_file = p / 'un_fused_reads.fasta' 217 | 218 | # Set new write counter 219 | write_counter = 0 220 | # with open(un_fused_read_file, 'w') as output_un_fused: 221 | with un_fused_read_file.open('w') as output_un_fused: 222 | for file in tqdm(fastq_files, desc='FASTQ processed'): 223 | # with open(file, 'r') as fastq: 224 | with file.open('r') as fastq: 225 | for line in fastq: 226 | header = line.strip()[1:] 227 | read_id = header.split()[0] 228 | if read_id not in fused_read_ids: 229 | sequence = next(fastq).strip() 230 | next(fastq) 231 | qualities = next(fastq).strip() 232 | # Write the read out 233 | if args.format == 'fastq': 234 | read_str = '@{header}\n{seq}\n+\n{qual}\n'.format( 235 | header=header, 236 | seq=sequence, 237 | qual=qualities 238 | ) 239 | else: # fasta 240 | read_str = '>{header}\n{seq}\n'.format( 241 | header=header, 242 | seq=sequence 243 | ) 244 | output_un_fused.write(read_str) 245 | write_counter += 1 246 | else: 247 | next(fastq) 248 | next(fastq) 249 | next(fastq) 250 | print('{} un-fused reads written'.format(write_counter)) 251 | # TODO: Read and write FASTA, FASTQ, and gzip 252 | # TODO: Improve arguments, add required inputs 253 | 254 | 255 | def _read_id_not_in_dict(read_ids, read_dict): 256 | """Return True if any read_id in the list is missing from read_dict, otherwise False""" 257 | for read_id in read_ids: 258 | if read_id not in read_dict: 259 | return True 260 | return False 261 | -------------------------------------------------------------------------------- /bulkvis/serve.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import shutil 3 | import subprocess 4 | import sys 5 | 6 | from bokeh.command.subcommands.serve import Serve 7 | 8 | 9 | _help = "Serve the bulk FAST5 file viewer web app" 10 | # Patch the incoming bokeh serve arguments 11 | # Remove `files` and `--args` as these are 12 | # used in the internal call to bokeh serve 13 | # Prepend `dir`, which is the bulk file dir 14 | _cli = [ 15 | ( 16 | "dir", 17 | dict( 18 | help="bulk FAST5 directory (default: working directory)", 19 | default=None, 20 | metavar="BULK_DIRECTORY", 21 | ), 22 | ), 23 | ] + [arg for arg in Serve.args if arg[0] not in {"files", "--args"}] 24 | 25 | 26 | def run(parser, args): 27 | bokeh = shutil.which("bokeh") 28 | if not bokeh: 29 | sys.exit("Unable to find bokeh. Is it installed?") 30 | 31 | server = str(Path(__file__).parent / "bulkvis_server") 32 | 33 | flags = sys.argv[3:] 34 | 35 | command = [bokeh, "serve", server] + flags + ["--args", args.dir] 36 | 37 | try: 38 | subprocess.run(command) 39 | except KeyboardInterrupt: 40 | pass 41 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = bulkvis 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | .figure img { 2 | border: 1px solid black; 3 | } 4 | .custom-warn { 5 | color: #D63301; 6 | background-color: #FFCCBA; 7 | border: 1px solid; 8 | margin: 10px 0px; 9 | padding: 15px 40px; 10 | font-size: 30px; 11 | } -------------------------------------------------------------------------------- /docs/_static/icons/save.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/icons/save.png -------------------------------------------------------------------------------- /docs/_static/icons/xpan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/icons/xpan.png -------------------------------------------------------------------------------- /docs/_static/icons/zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/icons/zoom.png -------------------------------------------------------------------------------- /docs/_static/images/bulk_file/01_pop_up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/bulk_file/01_pop_up.png -------------------------------------------------------------------------------- /docs/_static/images/bulk_file/02_read_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/bulk_file/02_read_config.png -------------------------------------------------------------------------------- /docs/_static/images/bulk_file/03_bulk_config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/bulk_file/03_bulk_config.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/01_initial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/01_initial.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/02_position.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/02_position.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/03_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/03_plot.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/04_sidebar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/04_sidebar.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/05_annotations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/05_annotations.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/06_adjustments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/06_adjustments.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/07_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/07_plot.png -------------------------------------------------------------------------------- /docs/_static/images/quickstart/08_read_file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/quickstart/08_read_file.png -------------------------------------------------------------------------------- /docs/_static/images/utilities/01_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LooseLab/bulkvis/37d88db2b84ddcfa357a0c56148756ef14d3f837/docs/_static/images/utilities/01_plot.png -------------------------------------------------------------------------------- /docs/collecting_a_bulk_file.rst: -------------------------------------------------------------------------------- 1 | ######################## 2 | Editing Protocol Scripts 3 | ######################## 4 | 5 | .. container:: custom-warn 6 | 7 | This page is a copy of the `Nanopore Community Knowledge Page `_. 8 | Please use the latest instructions available on the Community pages. 9 | 10 | Introduction 11 | ============ 12 | 13 | From MinKNOW version 0.51.3 onwards, the protocol scripts are structured in a way that makes it 14 | easier for users to configure certain parameters without needing extensive programming knowledge. 
15 | There are three settings that can be configured for the 48 h runs from the scripts: 16 | 17 | run_time_changes allows the user to change: 18 | 19 | - the total run time in hours 20 | - the time in seconds between MUX changes 21 | - the starting voltage of the run 22 | - the time in seconds between global potential reversals 23 | 24 | If this configuration is not enabled, the run starts with the current default settings. 25 | 26 | read_file_configuration allows the user to: 27 | 28 | - turn raw data for reads on or off 29 | - turn event data for reads on or off 30 | 31 | bulk_file_configuration allows the user to: 32 | 33 | - turn raw data on in the bulk file 34 | - turn event data on in the bulk file 35 | 36 | Note on editing MinKNOW scripts 37 | ------------------------------- 38 | 39 | Prerequisites 40 | 41 | Changing the settings of the scripts affects the data collected during the run, so it is advised 42 | that such configuration is attempted by advanced users only. 43 | 44 | A note of caution 45 | 46 | The scripts that control the device, particularly the sequencing run scripts, are an area of constant 47 | development. Scripts are subject to rapid change and can be added, removed and overwritten by the 48 | automatic software updates from Oxford Nanopore. Consequently, it is highly recommended that copies 49 | of altered scripts are saved in a second location, with change notes, so that if necessary they can 50 | be restored quickly. 51 | 52 | Open the script of interest 53 | =========================== 54 | 55 | Open a text editor (e.g. Notepad++) with Administrator privileges. 56 | In the text editor, click Open and navigate to: 57 | 58 | ``C:\Program Files\OxfordNanopore\MinKNOW\ont-python\Lib\site-packages\bream\core\nc\cli\NC_Sequencing.py`` in Windows 59 | 60 | ``Applications/MinKNOW.app/Contents/Resources/ont-python/lib/python2.7/site-packages/bream/core/nc/cli/NC_Sequencing.py`` in Mac OS X 61 | 62 | ``/opt/ONT/MinKNOW/ont-python/lib/python2.7/site-packages/bream/core/nc/cli`` in Linux 63 | 64 | Navigate to 65 | =========== 66 | 67 | Navigate to line 349: 68 | ``popup_boxes=args.popup_boxes`` 69 | 70 | Replace 71 | ======= 72 | 73 | Replace this line with any combination of the three options below:: 74 | 75 | 'run_time_changes' 76 | 'read_file_configuration' 77 | 'bulk_file_configuration' 78 | 79 | For example, to enable all three, type:: 80 | 81 | popup_boxes=['run_time_changes', 'read_file_configuration', 'bulk_file_configuration'], 82 | 83 | **Note: the comma at the end of the line is essential for the script to function properly.** 84 | 85 | Save the script 86 | =============== 87 | 88 | Restart the MinKNOW service 89 | =========================== 90 | 91 | Open a command prompt window as administrator and navigate to the MinKNOW folder:: 92 | 93 | cd "Program Files\OxfordNanopore\MinKNOW" 94 | bin\mk_manager_client.exe --exit 95 | bin\mk_manager_svc.exe 96 | 97 | For Mac OS X users, open a terminal window:: 98 | 99 | cd /Applications/MinKNOW.app/Contents/Resources 100 | sudo bin/mk_manager_svc 101 | 102 | For Ubuntu users, open a terminal window:: 103 | 104 | cd /opt/ONT/MinKNOW 105 | sudo bin/mk_manager_svc 106 | 107 | Pop-up box 108 | ========== 109 | 110 | After the relevant lines in the script have been activated, a pop-up box will appear when a 48 h 111 | protocol is selected in the MinKNOW web GUI. Enter or check the appropriate information and click Update. 112 | 113 | .. 
figure:: _static/images/bulk_file/01_pop_up.png 114 | :class: figure 115 | :alt: run_time_changes 116 | 117 | run_time_changes 118 | 119 | .. figure:: _static/images/bulk_file/02_read_config.png 120 | :class: figure 121 | :alt: read_file_configuration 122 | 123 | read_file_configuration 124 | 125 | .. figure:: _static/images/bulk_file/03_bulk_config.png 126 | :class: figure 127 | :alt: bulk_file_configuration 128 | 129 | bulk_file_configuration 130 | 131 | 132 | **Bulk data acquisition is turned off by default; to enable it, check the 133 | debug_data box in addition to either the event or raw data.** -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/stable/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'bulkvis' 23 | copyright = '2018, Alex Payne' 24 | author = 'Alex Payne' 25 | 26 | # The short X.Y version 27 | version = '' 28 | # The full version, including alpha/beta/rc tags 29 | release = '' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The suffix(es) of source filenames. 48 | # You can specify multiple suffix as a list of string: 49 | # 50 | # source_suffix = ['.rst', '.md'] 51 | source_suffix = '.rst' 52 | 53 | # The master toctree document. 54 | master_doc = 'index' 55 | 56 | # The language for content autogenerated by Sphinx. Refer to documentation 57 | # for a list of supported languages. 58 | # 59 | # This is also used if you do content translation via gettext catalogs. 60 | # Usually you set "language" from the command line for these cases. 61 | language = None 62 | 63 | # List of patterns, relative to source directory, that match files and 64 | # directories to ignore when looking for source files. 65 | # This pattern also affects html_static_path and html_extra_path . 66 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 67 | 68 | # The name of the Pygments (syntax highlighting) style to use. 69 | pygments_style = 'sphinx' 70 | 71 | 72 | # -- Options for HTML output ------------------------------------------------- 73 | 74 | # The theme to use for HTML and HTML Help pages. See the documentation for 75 | # a list of builtin themes. 
76 | # 77 | html_theme = 'alabaster' 78 | 79 | # Theme options are theme-specific and customize the look and feel of a theme 80 | # further. For a list of options available for each theme, see the 81 | # documentation. 82 | # 83 | # html_theme_options = {} 84 | 85 | # Add any paths that contain custom static files (such as style sheets) here, 86 | # relative to this directory. They are copied after the builtin static files, 87 | # so a file named "default.css" will overwrite the builtin "default.css". 88 | html_static_path = ['_static'] 89 | 90 | # Custom sidebar templates, must be a dictionary that maps document names 91 | # to template names. 92 | # 93 | # The default sidebars (for documents that don't match any pattern) are 94 | # defined by theme itself. Builtin themes are using these templates by 95 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 96 | # 'searchbox.html']``. 97 | # 98 | # html_sidebars = { '***': ['globaltoc.html', 'relations.html', 'sourcelink.html', 'searchbox.html'], } 99 | 100 | 101 | # -- Options for HTMLHelp output --------------------------------------------- 102 | 103 | # Output file base name for HTML help builder. 104 | htmlhelp_basename = 'bulkvisdoc' 105 | 106 | 107 | # -- Options for LaTeX output ------------------------------------------------ 108 | 109 | latex_elements = { 110 | # The paper size ('letterpaper' or 'a4paper'). 111 | # 112 | # 'papersize': 'letterpaper', 113 | 114 | # The font size ('10pt', '11pt' or '12pt'). 115 | # 116 | # 'pointsize': '10pt', 117 | 118 | # Additional stuff for the LaTeX preamble. 119 | # 120 | # 'preamble': '', 121 | 122 | # Latex figure (float) alignment 123 | # 124 | # 'figure_align': 'htbp', 125 | } 126 | 127 | # Grouping the document tree into LaTeX files. List of tuples 128 | # (source start file, target name, title, 129 | # author, documentclass [howto, manual, or own class]). 130 | latex_documents = [ 131 | (master_doc, 'bulkvis.tex', 'bulkvis Documentation', 132 | 'Alex Payne', 'manual'), 133 | ] 134 | 135 | 136 | # -- Options for manual page output ------------------------------------------ 137 | 138 | # One entry per manual page. List of tuples 139 | # (source start file, name, description, authors, manual section). 140 | man_pages = [ 141 | (master_doc, 'bulkvis', 'bulkvis Documentation', 142 | [author], 1) 143 | ] 144 | 145 | 146 | # -- Options for Texinfo output ---------------------------------------------- 147 | 148 | # Grouping the document tree into Texinfo files. List of tuples 149 | # (source start file, target name, title, author, 150 | # dir menu entry, description, category) 151 | texinfo_documents = [ 152 | (master_doc, 'bulkvis', 'bulkvis Documentation', 153 | author, 'bulkvis', 'One line description of project.', 154 | 'Miscellaneous'), 155 | ] 156 | 157 | def setup(app): 158 | app.add_stylesheet('css/custom.css') -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. bulkvis documentation master file, created by 2 | sphinx-quickstart on Wed Apr 4 09:36:43 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ####### 7 | bulkvis 8 | ####### 9 | 10 | Introduction 11 | ============ 12 | 13 | bulkvis is an interactive bulk-fast5-file explorer built using python 3 and bokeh. 
It enables the visualisation of raw 'squiggle' data from Oxford Nanopore Technologies sequencers. 15 | 16 | .. toctree:: 17 | :maxdepth: 2 18 | :titlesonly: 19 | :caption: Contents: 20 | 21 | installation 22 | quickstart 23 | utilities 24 | collecting_a_bulk_file -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ############ 2 | Installation 3 | ############ 4 | 5 | We recommend running bulkvis from within a Python virtual environment so that there are no conflicts in dependencies. 6 | 7 | Installing pip 8 | ============== 9 | 10 | pip is most likely already installed; to find out, run:: 11 | 12 | pip --version 13 | 14 | If pip is not installed, use the official 15 | `get-pip.py `_ script. 16 | 17 | Create and activate a virtual environment 18 | ========================================= 19 | 20 | For Linux and macOS:: 21 | 22 | python3 -m venv bulkvis-env 23 | source bulkvis-env/bin/activate 24 | 25 | For Windows:: 26 | 27 | python3 -m venv bulkvis-env 28 | bulkvis-env\Scripts\activate 29 | 30 | If the virtual environment is successfully activated, the prefix ``(bulkvis-env)`` will be present. 31 | 32 | Running ``deactivate`` will exit the virtual environment. 33 | 34 | Clone bulkvis 35 | ============= 36 | 37 | bulkvis can be retrieved by cloning the git repository:: 38 | 39 | git clone https://github.com/LooseLab/bulkvis.git 40 | 41 | or by navigating to `bulkvis `_ and downloading a zip of the repository, 42 | which will then need to be unzipped. 43 | 44 | Installing dependencies 45 | ======================= 46 | 47 | Once the repository is cloned or downloaded, bulkvis' dependencies will need to be installed. This **must** be run from 48 | within the virtual environment to prevent conflicts. Run:: 49 | 50 | pip install -r bulkvis/requirements.txt 51 | 52 | This will fetch and install all the required packages. 53 | 54 | Creating config.ini 55 | =================== 56 | 57 | bulkvis uses a configuration file, config.ini, to provide global variables that are required for operation. A config 58 | file can either be generated by running ``utils/set_config.py`` (requires a bulkfile) or by copying and editing the 59 | example configuration from ``config.md``. 60 | 61 | Using ``set_config.py``:: 62 | 63 | cd bulkvis 64 | python utils/set_config.py -b <> -i /path/to/bulkfile/directory -e /path/to/readfile/directory -m /path/to/mapfile/directory -c config.ini 65 | 66 | Using ``config.md``:: 67 | 68 | cd bulkvis 69 | touch config.ini 70 | nano config.ini 71 | 72 | Then navigate to `config.md `_, copy and paste the example 73 | configuration settings into nano in the terminal, and finally change the directories (``dir``, ``out`` and ``map``) to point 74 | towards your bulk-fast5-files and a read directory. 75 | 76 | Starting bulkvis 77 | ================ 78 | 79 | To start bulkvis:: 80 | 81 | bokeh serve --show bulkvis 82 | 83 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=bulkvis 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 
19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | ########## 2 | Quickstart 3 | ########## 4 | 5 | This page provides a quick overview of the bulkvis features and how to use them. 6 | 7 | Start bulkvis 8 | ============= 9 | From the directory containing bulkvis, run:: 10 | 11 | bokeh serve --show bulkvis 12 | 13 | This will start the bulkvis app and open the page in your default web browser. The page should look like this: 14 | 15 | .. figure:: _static/images/quickstart/01_initial.png 16 | :class: figure 17 | :alt: Screenshot of bulkvis showing a blank screen with a drop-down box in the top left corner 18 | 19 | Screenshot of bulkvis on initial load 20 | 21 | Selecting a bulk-fast5-file 22 | =========================== 23 | Use the drop-down to select a bulk-fast5-file from your supplied directory. Once the file is loaded, the position can be 24 | entered. 25 | 26 | .. figure:: _static/images/quickstart/02_position.png 27 | :class: figure 28 | :alt: Screenshot of bulkvis showing, in the top left corner, a drop-down box with a file selected and a text box labeled position 29 | 30 | Screenshot of bulkvis waiting for position information 31 | 32 | Selecting a position 33 | ==================== 34 | The position can be selected by either supplying coordinates or a fastq read header. The contents of this input are submitted 35 | by clicking away from the text box or by pressing return/enter. If bulkvis cannot parse the input, the text box will turn 36 | red until valid input is detected. 37 | 38 | After a position is entered, bulkvis will fully load and the chart will be visible. 39 | 40 | Using coordinates 41 | ----------------- 42 | Coordinates refer to the channel, start time and end time. This is given in the format ``channel:start-end``. For 43 | example, to navigate to channel 42 and see the squiggle from 30 seconds to 90 seconds:: 44 | 45 | 42:30-90 46 | 47 | Using a fastq read header 48 | ------------------------- 49 | Alternatively, the position can be given as a fastq read header from the run associated with this bulk-fast5-file. 50 | This can be copied and pasted into the text box, e.g.:: 51 | 52 | @b45a4b09-6f22-40f6-afd9-aa7fca8e89f3 runid=f9291b45b0c66faa77755e51738d193fcfafffc7 read=234 ch=391 start_time=2018-01-18T21:59:40Z 53 | 54 | 55 | After entering valid input the chart and other elements will load: 56 | 57 | .. 
figure:: _static/images/quickstart/03_plot.png 58 | :class: figure 59 | :alt: Screenshot of bulkvis showing a 'squiggle' plot of raw nanopore signal and a left-hand sidebar containing information about the plot 60 | 61 | Screenshot of bulkvis fully loaded with both plot and sidebar 62 | 63 | Navigating 64 | ========== 65 | The bulk-fast5-file can be navigated by jumping to the next or previous event and by using the xpan (|xpan_icon|) to drag 66 | the plot along the x-axis or zoom (|zoom_icon|) to take a closer look at a section of the plot. 67 | 68 | The jump-to action is available for any event type that is listed as ``True`` in config.ini and is available even when the 69 | event is not currently being displayed. 70 | 71 | 72 | Bulkfile information 73 | ==================== 74 | The bulkfile information panel shows information that is present in the bulk-fast5-file but not necessarily 75 | displayed in MinKNOW. 76 | 77 | .. figure:: _static/images/quickstart/04_sidebar.png 78 | :class: figure 79 | :alt: Screenshot of the sidebar from bulkvis, showing the file selections drop-down, position input, jump-to buttons, export button, information panel, and two hidden sections ('Select annotations' and 'Plot adjustments') 80 | 81 | Screenshot of bulkvis sidebar 82 | 83 | Annotations 84 | =========== 85 | Annotations are added to the plot based on state data and intermediate data from the bulkfile. These represent the labels 86 | computed by MinKNOW at specific time points in the experiment. 87 | 88 | Selecting a checkbox will allow that specific label to be rendered on the plot. The 'Display annotations' or 'Display mappings' 89 | button will toggle these annotations on/off. The 'Select all' button will turn all annotations on, while the 'Select none' 90 | button will de-select all the annotations. 91 | 92 | .. figure:: _static/images/quickstart/05_annotations.png 93 | :class: figure 94 | :alt: Screenshot of 'Select annotations' section of sidebar showing buttons labeled 'Display annotations', 'Display mappings', 'Select all', 'Select none' and a list of checkboxes 95 | 96 | Screenshot of the annotation selection panel 97 | 98 | Plot adjustments 99 | ================ 100 | The plot adjustments are infrequently used options for tweaking the appearance of the plot without having to modify the configuration file. 101 | Here the width and height of the plot can be set to match the current screen, the height that annotations are rendered at can be adjusted, and the Y-axis can 102 | be fixed to a given range. 103 | 104 | Plot smoothing is on by default; as raw signal data can quickly become massive, this reduces the number of points plotted while maintaining the shape of the data. 105 | Smoothing will automatically turn on whenever the position is changed. 106 | 107 | .. figure:: _static/images/quickstart/06_adjustments.png 108 | :class: figure 109 | :alt: Screenshot of 'Plot adjustments' section of the sidebar showing inputs for width, height, annotation height, y max, and y min as well as buttons for 'Fixed Y-axis' and 'Smoothing' 110 | 111 | Screenshot of the plot adjustments panel 112 | 113 | Exporting images 114 | ================ 115 | bulkvis can export images of plots, as below. This is done using the save function (|save_icon|), which will either download 116 | the current plot view as ``bokeh_plot.png`` or, in Safari, open the image in a new tab where it can be saved by right-clicking and selecting save. 117 | 118 | .. 
figure:: _static/images/quickstart/07_plot.png 119 | :class: figure 120 | :alt: Example plot of raw signal data from an Oxford Nanopore bulk-fast5-file 121 | 122 | Example plot from bulkvis 123 | 124 | Exporting read files 125 | ==================== 126 | bulkvis can export arbitrary read files from bulk-fast5-files. The data range is determined by the current position as set in the text input. 127 | When a read file is generated, it is written to the folder set in the configuration file. 128 | 129 | .. figure:: _static/images/quickstart/08_read_file.png 130 | :class: figure 131 | :alt: Screenshot of bulkvis sidebar showing position, duration, 'read file created', jump buttons and export button 132 | 133 | Screenshot of export button and success message (below 'duration') 134 | 135 | .. |zoom_icon| image:: /_static/icons/zoom.png 136 | :height: 11pt 137 | .. |xpan_icon| image:: /_static/icons/xpan.png 138 | :height: 11pt 139 | .. |save_icon| image:: /_static/icons/save.png 140 | :height: 11pt 141 | -------------------------------------------------------------------------------- /docs/utilities.rst: -------------------------------------------------------------------------------- 1 | Utilities 2 | ========= 3 | 4 | An overview of the utility scripts provided to conduct analysis on fused reads. 5 | 6 | whale_watch.py 7 | -------------- 8 | .. code-block:: bash 9 | 10 | Parse sequencing_summary.txt files and .paf files to find split reads in an 11 | Oxford Nanopore Dataset 12 | 13 | General options: 14 | -h, --help Show this help and exit 15 | -d , --distance Specify the maximum distance between consecutive 16 | mappings. This is the difference between 'Target Start' 17 | and 'Target End' in the paf file. Defaults to 10000 18 | -t , --top Specify how many top processed reads to display. Default 19 | is 10 20 | -D, --debug Write debug file 21 | 22 | Input sources: 23 | -s , --summary A sequencing summary file generated by albacore 24 | -p , --paf A paf file generated by minimap2 25 | 26 | Output files: 27 | -F , --out-fused Specify name of the fused_read file. This file only 28 | contains chains of reads. Defaults to 'fused_reads.txt' 29 | 30 | 31 | Output format 32 | ^^^^^^^^^^^^^ 33 | .. csv-table:: 34 | :header: "Field", "Description", "Example" 35 | 36 | "coords", "bulkvis position coordinates", "231:30782-32296" 37 | "run_id", "The run that these reads came from", "8093748fc82dc4c5cc441125d76432dd658c27c8" 38 | "channel", "Channel that sequenced these reads", "231" 39 | "start_time", "Time, in seconds, that the (first) incorrectly split read started sequencing", "30782.8425" 40 | "duration", "Time, in seconds, it took for the incorrectly split read to pass through the channel", "1512.46425" 41 | "combined_length", "Number of bases in the combined reads", "611531" 42 | "target_name", "The mapping target, determined by minimap", "chr7" 43 | "strand", "'+' if query and target on the same strand; '-' if opposite", "\+" 44 | "start_match", "Start coordinate on the original strand", "46731340" 45 | "end_match", "End coordinate on the original strand", "46791591" 46 | "cat_read_id", "Read ids of all the reads in this group", "82eed45a-7774-4778-8f8a-eb17d7010116|6e9c7720-b7a3-47cc-8f42-30e2219add4b" 47 | "count", "Number of reads in this group", "2" 48 | 49 | 
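The fused read summary is plain tab-separated text, so it can be inspected with standard tools. A minimal sketch (added here for illustration; the filename and columns follow the defaults and the table above):

.. code-block:: python

    import pandas as pd

    # Load the whale_watch.py / bulkvis fuse output
    fused_df = pd.read_csv("fused_reads.txt", sep="\t")
    # cat_read_id holds the '|'-separated ids of every read in a chain
    fused_df["read_ids"] = fused_df["cat_read_id"].str.split("|")
    print(fused_df[["coords", "combined_length", "count"]].head())
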
50 | whale_merge.py 51 | -------------- 52 | .. code-block:: bash 53 | 54 | Parse sequencing_summary.txt files and .paf files to find chained reads in an 55 | Oxford Nanopore Dataset and output fused fastq files 56 | 57 | General options: 58 | -h, --help Show this help and exit 59 | -d , --distance Specify the maximum distance between consecutive 60 | mappings. This is the difference between 'Target Start' 61 | and 'Target End' in the paf file. Defaults to 10000 62 | 63 | Input sources: 64 | -s , --summary A sequencing summary file generated by albacore 65 | -p , --paf A paf file generated by minimap2 66 | -f , --readfiles Full path to the folder containing fastq files you wish 67 | to join 68 | 69 | Output files: 70 | -o , --out-fused Specify name of the fused_read fastq file. This file will 71 | contain fused reads and the remaining singleton reads. 72 | Defaults to 'fused_reads.fastq' 73 | -W Outputs just the fused reads 74 | 75 | 76 | 77 | set_config.py 78 | ------------- 79 | .. code-block:: bash 80 | 81 | Generate a configuration file required for bulkvis to run 82 | 83 | General options: 84 | -h, --help Show this help and exit 85 | 86 | Input sources: 87 | -b , --bulkfile A bulk-fast5 file to get labels from 88 | -i , --input-dir The path to the folder containing bulk-files for 89 | visualisation 90 | -e , --export-dir The path to the folder where read-files will be written 91 | by bulkvis 92 | 93 | Output: 94 | -c , --config Path to the config.ini file in your bulkvis installation 95 | 96 | 97 | Figure scripts 98 | -------------- 99 | whale_plot.py 100 | ^^^^^^^^^^^^^ 101 | .. code-block:: bash 102 | 103 | Parse sequencing_summary.txt, .paf, and bulk fast5 files to generate CSV files 104 | containing the distributions of MinKNOW events around read starts and ends. 105 | These are divided into unique reads, split reads and internal reads. The R 106 | script, whale.R, is called to generate the plot; this requires the packages: 107 | ggplot2, tidyr, and dplyr. Note: of the MinKNOW classifications only above, 108 | adapter, pore, transition, unblocking, and unclassified are included. 109 | 110 | General options: 111 | -h, --help Show this help and exit 112 | -d DISTANCE, --distance DISTANCE 113 | Specify the maximum distance, in bases, between 114 | consecutive mappings. 
This is the difference between 115 | 'Target Start' and 'Target End' in a paf file 116 | (default: 10000) 117 | -V, --verbose Print verbose output to terminal (default: False) 118 | 119 | Input sources: 120 | -b BULK_FILE, --bulk-file BULK_FILE 121 | An ONT bulk fast5 file containing raw signal (default: 122 | None) 123 | -s SUMMARY, --summary SUMMARY 124 | A sequencing summary file generated by albacore 125 | (default: None) 126 | -p PAF, --paf PAF A paf file generated by minimap2 (default: None) 127 | -t TIME, --time TIME +/- time around a strand event in seconds (default: 128 | 10) 129 | 130 | Output files: 131 | --no-generate-plot If set, do not generate density plot (default: False) 132 | -A A CSV of MinKNOW events occurring before and after 133 | correctly called read starts (default: 134 | unique_read_start.csv) 135 | -B B CSV of MinKNOW events occurring before and after 136 | correctly called read ends (default: 137 | unique_read_end.csv) 138 | -C C CSV of MinKNOW events occurring before and after the 139 | start of the first incorrectly split read in a group 140 | (default: split_read_start.csv) 141 | -D D CSV of MinKNOW events occurring before and after 142 | incorrectly called read starts, within a group of 143 | incorrectly split reads (default: 144 | internal_read_start.csv) 145 | -E E CSV of MinKNOW events occurring before and after 146 | incorrectly called read ends, within a group of 147 | incorrectly split reads (default: 148 | internal_read_end.csv) 149 | -F F CSV of MinKNOW events occurring before and after the 150 | end of the first incorrectly split read in a group 151 | (default: split_read_end.csv) 152 | --out OUT Specify the output filename for the plot. File 153 | extension must be one of [.eps, .ps, .tex, .pdf, 154 | .jpeg, .tiff, .png, .bmp, .svg, .wmf] (default: 155 | classification_count.pdf) 156 | 157 | 158 | Example plot: 159 | """"""""""""" 160 | .. figure:: _static/images/utilities/01_plot.png 161 | :class: figure 162 | :alt: Example whale_plot.py output, showing six columns: unique read start, unique read end, split read start, internal read start, internal read end, split read end. Each column shows the count of different classifications (above, adapter, pore, transition, unblocking, unclassified) around read starts and ends. 163 | 164 | Example plot from whale_plot.py 165 | 166 | whale.R 167 | ^^^^^^^ 168 | 169 | This R script is called by whale_plot.py to produce the above plot; it requires `Rscript` and can also be run independently. To run: 170 | 171 | .. code-block:: bash 172 | 173 | $ Rscript whale.R col_A.csv col_B.csv col_C.csv col_D.csv col_E.csv col_F.csv <> <> 174 | 175 | The order in which arguments are given is essential in this script; otherwise labels will not match. 176 | The output filename must include a file extension from `[.eps, .ps, .tex, .pdf, .jpeg, .tiff, .png, .bmp, .svg, .wmf]`. 177 | Run id is not required to execute this script. 178 | 179 | pod_plot.py 180 | ^^^^^^^^^^^ 181 | .. code-block:: bash 182 | 183 | Generate plots for all reads in a fused_reads.txt file. This uses bokeh to 184 | render a plot and requires selenium, phantomjs, and Pillow to be installed. 185 | These are available via conda/pip. 186 | 187 | General options: 188 | -h, --help Show this help and exit 189 | 190 | Input sources: 191 | -f , --fused A fused read file generated by whale_watch.py 192 | -b , --bulk-file An ONT bulk-fast5-file 193 | 194 | Output files: 195 | -D , --out-dir Specify the output directory where plots will be saved. 
Defaults to current working directory 197 | 198 | gen_bmf.py 199 | ---------- 200 | .. code-block:: bash 201 | 202 | Parse sequencing_summary.txt files and .paf files to format mapping info for 203 | bulkvis 204 | 205 | General options: 206 | -h, --help Show this help and exit 207 | 208 | Input sources: 209 | -s , --summary A sequencing summary file generated by albacore 210 | -p , --paf A paf file generated by minimap2 211 | 212 | Output: 213 | --bmf Specify the output folder, where files will be written as 214 | .bmf. This should be the 'map' path specified in 215 | the config.ini 216 | 217 | bulk_info.py 218 | ------------- 219 | .. code-block:: bash 220 | 221 | Given a directory containing bulk fast5 files, output a csv containing the run 222 | information for them 223 | 224 | General options: 225 | -h, --help Show this help and exit 226 | 227 | Input sources: 228 | -d , --dir A directory containing bulk-fast5-files 229 | 230 | Output sources: 231 | -o , --out Output csv filename 232 | 233 | Other scripts 234 | ------------- 235 | 236 | channelmaps.py 237 | ^^^^^^^^^^^^^^ 238 | `channelmaps.py` is a utility script that is designed to be called by other scripts. It contains the physical layout of 239 | ONT MinION flowcells and allows lookup by channel number, reverse lookup by coordinates, and can return a list of 240 | surrounding channels. 241 | 242 | stitch.py 243 | ^^^^^^^^^ 244 | `stitch.py` is a utility script that is called from bulkvis; it produces the read fast5 file from the squiggle data. 245 | -------------------------------------------------------------------------------- /env.yml: -------------------------------------------------------------------------------- 1 | name: bulkvis 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - python=3.11 8 | - pip 9 | - pip: 10 | - numpy==1.26.4 11 | - git+https://github.com/LooseLab/bulkvis.git 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | version = {} 4 | with open("bulkvis/_version.py") as fh: 5 | exec(fh.read(), version) 6 | 7 | install_requires = [ 8 | "bokeh>=2.1.0,<2.4.0", 9 | "h5py", 10 | "pandas>1.0,<2.0", 11 | "tornado", 12 | "tqdm", 13 | "readpaf", 14 | ] 15 | 16 | setup( 17 | name="bulkvis", 18 | version=version["__version__"], 19 | author="Alexander Payne", 20 | install_requires=install_requires, 21 | entry_points={ 22 | "console_scripts": [ 23 | "bulkvis=bulkvis.bulkvis:main", 24 | ], 25 | }, 26 | packages=["bulkvis", "bulkvis.bulkvis_server"], 27 | python_requires=">=3.6", 28 | include_package_data=True, 29 | ) 30 | --------------------------------------------------------------------------------