├── .circleci └── config.yml ├── .gitignore ├── .gitlab-ci.yml ├── COPYRIGHT ├── LICENSE.md ├── MANIFEST.in ├── Makefile ├── ONT_logo.png ├── README.md ├── docs ├── .gitignore ├── Makefile ├── _static │ └── .gitignore ├── cmd_tools.py ├── conf.py ├── index.rst ├── make.bat ├── requirements.txt └── usage.rst ├── pytest.ini ├── scripts ├── __init__.py ├── _template_script.py ├── add_errors.py ├── annotate_length.py ├── bam_accuracy.py ├── bam_alignment_length.py ├── bam_alignment_qc.py ├── bam_alignments_compare.py ├── bam_count_reads.py ├── bam_cov.py ├── bam_fill_unaligned.py ├── bam_frag_coverage.py ├── bam_gc_vs_qual.py ├── bam_multi_qc.py ├── bam_ref_base_coverage.py ├── bam_ref_tab.py ├── bam_score_filter.py ├── bam_soft_clips_tab.py ├── bias_explorer.py ├── calculate_coverage.py ├── compare_genomes_dnadiff.py ├── compare_genomes_lastal.py ├── convert_alphabet.py ├── correlate_counts.py ├── fasta_to_mock_fastq.py ├── fastq_qual_tab.py ├── fastq_time_slice.py ├── fastq_time_tab.py ├── fastx_ends_tab.py ├── fastx_grep.py ├── fastx_length_tab.py ├── length_normalise_counts.py ├── merge_tsvs.py ├── multi_length_hist.py ├── pickle_cat.py ├── plot_counts_correlation.py ├── plot_gffcmp_stats.py ├── plot_qualities.py ├── plot_sequence_properties.py ├── reads_across_time.py ├── reads_stats.py ├── reverse_fastq.py ├── sequence_filter.py ├── sequence_subtract.py ├── simulate_errors.py ├── simulate_genome.py ├── simulate_sequences.py ├── simulate_sequencing_simple.py └── split_fastx.py ├── setup.cfg ├── setup.py └── wub ├── __init__.py ├── bam ├── __init__.py ├── common.py ├── compare.py ├── filter.py ├── read_counter.py ├── sam_writer.py └── stats.py ├── mappers ├── __init__.py └── lastal.py ├── parsers ├── __init__.py ├── blastn.py └── mummer.py ├── read_stats ├── __init__.py └── contig_stats.py ├── simulate ├── __init__.py ├── dist.py ├── genome.py └── seq.py ├── tests ├── __init__.py ├── data │ ├── test_bam_stats │ │ ├── stat_ref.fas │ │ ├── stat_test.bam │ │ └── stat_test.bam.bai │ ├── test_blastn_parse │ │ └── blastn_test.coords │ └── test_nucmer_parse │ │ └── nucmer_test.coords ├── test_bam_compare.py ├── test_bam_read_counter.py ├── test_bam_stats.py ├── test_blastn_coord_parse.py ├── test_contig_stats.py ├── test_example.py ├── test_mappers_lastal.py ├── test_nucmer_coord_parse.py ├── test_simulate_genome.py ├── test_simulate_seq.py ├── test_util_parse.py ├── test_util_seq.py └── test_wrappers_dnadiff.py ├── util ├── __init__.py ├── cmd.py ├── misc.py ├── parse.py └── seq.py ├── vis ├── __init__.py └── report.py └── wrappers ├── __init__.py └── dnadiff.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # This configuration was automatically generated from a CircleCI 1.0 config. 2 | # It should include any build commands you had along with commands that CircleCI 3 | # inferred from your project structure. We strongly recommend you read all the 4 | # comments in this file to understand the structure of CircleCI 2.0, as the idiom 5 | # for configuration has changed substantially in 2.0 to allow arbitrary jobs rather 6 | # than the prescribed lifecycle of 1.0. In general, we recommend using this generated 7 | # configuration as a reference rather than using it in production, though in most 8 | # cases it should duplicate the execution of your original 1.0 config. 
9 | version: 2 10 | jobs: 11 | build: 12 | working_directory: ~/nanoporetech/wub 13 | parallelism: 1 14 | shell: /bin/bash --login 15 | # CircleCI 2.0 does not support environment variables that refer to each other the same way as 1.0 did. 16 | # If any of these refer to each other, rewrite them so that they don't or see https://circleci.com/docs/2.0/env-vars/#interpolating-environment-variables-to-set-other-environment-variables . 17 | environment: 18 | CIRCLE_ARTIFACTS: /tmp/circleci-artifacts 19 | CIRCLE_TEST_REPORTS: /tmp/circleci-test-results 20 | # In CircleCI 1.0 we used a pre-configured image with a large number of languages and other packages. 21 | # In CircleCI 2.0 you can now specify your own image, or use one of our pre-configured images. 22 | # The following configuration line tells CircleCI to use the specified docker image as the runtime environment for you job. 23 | # We have selected a pre-built image that mirrors the build environment we use on 24 | # the 1.0 platform, but we recommend you choose an image more tailored to the needs 25 | # of each job. For more information on choosing an image (or alternatively using a 26 | # VM instead of a container) see https://circleci.com/docs/2.0/executor-types/ 27 | # To see the list of pre-built images that CircleCI provides for most common languages see 28 | # https://circleci.com/docs/2.0/circleci-images/ 29 | docker: 30 | - image: circleci/python:3.6.1 31 | command: /sbin/init 32 | steps: 33 | # Machine Setup 34 | # If you break your build into multiple jobs with workflows, you will probably want to do the parts of this that are relevant in each 35 | # The following `checkout` command checks out your code to your working directory. In 1.0 we did this implicitly. In 2.0 you can choose where in the course of a job your code should be checked out. 36 | - checkout 37 | # Prepare for artifact and test results collection equivalent to how it was done on 1.0. 38 | # In many cases you can simplify this from what is generated here. 39 | # 'See docs on artifact collection here https://circleci.com/docs/2.0/artifacts/' 40 | - run: mkdir -p $CIRCLE_ARTIFACTS $CIRCLE_TEST_REPORTS 41 | # Dependencies 42 | # This would typically go in either a build or a build-and-test job when using workflows 43 | # Restore the dependency cache 44 | - restore_cache: 45 | keys: 46 | # This branch if available 47 | - v1-dep-{{ .Branch }}- 48 | # Default branch if not 49 | - v1-dep-master- 50 | # Any branch if there are none on the default branch - this should be unnecessary if you have your default branch configured correctly 51 | - v1-dep- 52 | # The following line was run implicitly in your 1.0 builds based on what CircleCI inferred about the structure of your project. In 2.0 you need to be explicit about which commands should be run. In some cases you can discard inferred commands if they are not relevant to your project. 
53 | # You can remove the conditional and simply install the requirements file you use 54 | #- run: #if [ -e requirements.txt ]; then pip install -r requirements.txt; else pip install -r requirements.pip; fi 55 | - run: sudo apt-get update 56 | - run: sudo apt-get install python3 python3-pip 57 | - run: sudo pip install -e ./ 58 | # Save dependency cache 59 | - save_cache: 60 | key: v1-dep-{{ .Branch }}-{{ epoch }} 61 | paths: 62 | # This is a broad list of cache paths to include many possible development environments 63 | # You can probably delete some of these entries 64 | - vendor/bundle 65 | - ~/virtualenvs 66 | - ~/.m2 67 | - ~/.ivy2 68 | - ~/.bundle 69 | - ~/.go_workspace 70 | - ~/.gradle 71 | - ~/.cache/bower 72 | # Test 73 | # This would typically be a build job when using workflows, possibly combined with build 74 | # The following line was run implicitly in your 1.0 builds based on what CircleCI inferred about the structure of your project. In 2.0 you need to be explicit about which commands should be run. In some cases you can discard inferred commands if they are not relevant to your project. 75 | # Python test inference is not supported by the config translator. 76 | # Put the commands that you use to run your tests here. 77 | # If run your tests in parallel containers you can use this third party nose plugin 78 | # https://github.com/dlanger/nose-parallel 79 | # If you use django you can switch to the nose django runner to make use of this. 80 | - run: sudo make test 81 | # Teardown 82 | # If you break your build into multiple jobs with workflows, you will probably want to do the parts of this that are relevant in each 83 | # Save test results 84 | - store_test_results: 85 | path: /tmp/circleci-test-results 86 | # Save artifacts 87 | - store_artifacts: 88 | path: /tmp/circleci-artifacts 89 | - store_artifacts: 90 | path: /tmp/circleci-test-results 91 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #misc 62 | .env/ 63 | .idea/ 64 | ._.DS_Store 65 | *.DS_Store 66 | *.pyc -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: ubuntu:xenial 2 | 3 | stages: 4 | - test 5 | - pages 6 | - release 7 | 8 | before_script: 9 | - apt-get update 10 | - apt-get install -y software-properties-common 11 | - apt-add-repository universe 12 | - apt-get update 13 | - apt-get install -y python3-pip make python3-numpy python3-matplotlib python3-biopython python3-pandas mummer last-align cython zlib1g-dev lbzip2 libbz2-dev liblzma-dev libhdf5-serial-dev 14 | - pip3 install --upgrade sphinx sphinx-argparse sphinx_rtd_theme pytest pycmd futures packaging appdirs pysam 15 | - pip3 install -e ./ 16 | 17 | 18 | do_testing: 19 | stage: test 20 | script: 21 | - alias python=python3;make test 22 | except: 23 | - tags 24 | 25 | pages: 26 | stage: pages 27 | script: 28 | - make docs 29 | - mv docs/_build/html public 30 | artifacts: 31 | paths: 32 | - public/ 33 | only: 34 | - master 35 | except: 36 | - tags 37 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | 2 | (c) 2016 Oxford Nanopore Technologies Ltd. 3 | 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | include README.md 3 | include LICENSE 4 | 5 | recursive-exclude * __pycache__ 6 | recursive-exclude * *.py[co] 7 | 8 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | MODULE=wub 2 | 3 | .PHONY: clean clean-test clean-pyc clean-build docs com help 4 | 5 | .DEFAULT_GOAL := help 6 | 7 | define PRINT_HELP_PYSCRIPT 8 | import re, sys 9 | 10 | for line in sys.stdin: 11 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 12 | if match: 13 | target, help = match.groups() 14 | print("%-20s %s" % (target, help)) 15 | endef 16 | export PRINT_HELP_PYSCRIPT 17 | 18 | help: 19 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 20 | 21 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 22 | 23 | 24 | clean-build: ## remove build artifacts 25 | rm -fr build/ 26 | rm -fr dist/ 27 | rm -fr .eggs/ 28 | find . -name '*.egg-info' -exec rm -fr {} + 29 | find . -name '*.egg' -exec rm -f {} + 30 | 31 | clean-pyc: ## remove Python file artifacts 32 | find . -name '*.pyc' -exec rm -f {} + 33 | find . -name '*.pyo' -exec rm -f {} + 34 | find . -name '*~' -exec rm -f {} + 35 | find . 
-name '__pycache__' -exec rm -fr {} + 36 | 37 | clean-test: ## remove test and coverage artifacts 38 | rm -f .coverage 39 | rm -fr htmlcov/ 40 | 41 | lint: ## check style with flake8 42 | @(flake8 --max-line-length=120 $(MODULE) | grep -v "E501 line too long") || true 43 | @(flake8 --max-line-length=120 scripts/*.py | grep -v "E501 line too long") || true 44 | 45 | test: ## run tests quickly with the default Python 46 | py.test -s 47 | 48 | coverage: ## check code coverage quickly with the default Python 49 | coverage run --source $(MODULE) --omit="*/tests/*,*__init__.py" `which py.test` 50 | coverage report -m --omit="*/tests/*,*__init__.py" 51 | coverage html 52 | 53 | docs: ## generate Sphinx HTML documentation, including API docs 54 | @cd docs; make clean html 55 | 56 | servedocs: docs ## compile the docs watching for changes 57 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . 58 | 59 | release: clean ## package and upload a release 60 | python setup.py sdist upload 61 | python setup.py bdist_wheel upload 62 | 63 | dist: clean ## builds source and wheel package 64 | python setup.py sdist 65 | python setup.py bdist_wheel 66 | ls -l dist 67 | 68 | install: clean ## install the package to the active Python's site-packages 69 | python setup.py install 70 | 71 | com: ## commit all changes to git 72 | git commit -a 73 | -------------------------------------------------------------------------------- /ONT_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/ONT_logo.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![ONT_logo](/ONT_logo.png) 2 | ----------------------------- 3 | 4 | This repository is now unsupported and we do not recommend its use. Please contact Oxford Nanopore: support@nanoporetech.com for help with your application. 5 | 6 | Wub 7 | ================================================================== 8 | 9 | [![CircleCI](https://circleci.com/gh/nanoporetech/wub.svg?style=svg)](https://circleci.com/gh/nanoporetech/wub) [![Documentation Status](https://readthedocs.org/projects/wub/badge/?version=latest)](http://wub.readthedocs.io/en/latest/?badge=latest) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square)](http://bioconda.github.io/recipes/wub/README.html) 10 | 11 | Tools and software library developed by the Oxford Nanopore Technologies Applications group. 12 | 13 | ## Features: 14 | 15 | - Simple sequence and error simulation tools. 16 | - Tools to visualise basic sequence properties. 17 | - Fastq and fasta utilities. 18 | - Tools to calculate read and genome assembly accuracy. 19 | - Transcriptome alignment QC tools. 20 | - Read counting and related utilities. 21 | - BAM utilities. 22 | - Miscellaneous utilities.
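As a brief illustration of the library side of the package, the sketch below chains together the same calls that `scripts/add_errors.py` (shown further down in this listing) uses to read records, inject errors and write them back out. It is only a sketch: the input and output file names are placeholders, and the exact signatures should be checked against `wub/util/seq.py` and `wub/simulate/seq.py`.

```python
# Illustrative sketch only -- mirrors the calls made in scripts/add_errors.py.
# The file names below are placeholders, not files shipped with the repository.
from Bio.Seq import Seq

from wub.simulate import seq as sim_seq
from wub.util import seq as seq_util


def add_fixed_errors(records, nr_errors=5, error_type='substitution'):
    """Yield each SeqRecord with a fixed number of random errors added."""
    for record in records:
        record.seq = Seq(sim_seq.add_errors(record.seq, nr_errors, error_type))
        yield record


with open('reads.fasta') as in_fh, open('reads_with_errors.fasta', 'w') as out_fh:
    records = seq_util.read_seq_records(in_fh, format='fasta')
    seq_util.write_seq_records(add_fixed_errors(records), out_fh, format='fasta')
```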
23 | 24 | Getting Started 25 | =============== 26 | 27 | ## Installation 28 | 29 | Set up a virtual environment: 30 | 31 | ``` 32 | virtualenv --system-site-packages wub_env 33 | source wub_env/bin/activate 34 | pip install --upgrade pip 35 | pip install requests[security] 36 | ``` 37 | 38 | Then install the package via pip: 39 | 40 | ``` 41 | pip install git+https://github.com/nanoporetech/wub.git 42 | ``` 43 | 44 | If you installed the package in a virtual environment then do not forget to 45 | activate it before using the package: 46 | 47 | ``` 48 | source wub_env/bin/activate 49 | ``` 50 | 51 | Run the following to leave the virtual environment: 52 | 53 | ``` 54 | deactivate 55 | ``` 56 | 57 | You can also clone the repository and install using `setup.py`: 58 | 59 | ``` 60 | git clone https://github.com/nanoporetech/wub.git 61 | cd wub 62 | python setup.py install 63 | ``` 64 | 65 | Install the package in developer mode: 66 | 67 | ``` 68 | python setup.py develop 69 | ``` 70 | 71 | Run the tests: 72 | 73 | ``` 74 | make test 75 | ``` 76 | 77 | Build the documentation: 78 | 79 | ``` 80 | make docs 81 | ``` 82 | 83 | Issue `make help` to get a list of `make` targets. 84 | 85 | Documentation 86 | ----------------- 87 | 88 | Online documentation is available at [wub.readthedocs.io](http://wub.readthedocs.io/en/latest/). 89 | 90 | Contributing 91 | ---------------- 92 | 93 | - Please fork the repository and create a merge request to contribute. 94 | - Please respect the structure outlined in `scripts/_template_script.py` for command line tools so documentation can be generated automatically. 95 | - All non-trivial functions should have at least one test (with the exception of plotting functions). 96 | - Use your best judgement when deciding whether to put a piece of code in a script or make it more reusable by incorporating it into the module. 97 | - Use [bumpversion](http://bit.ly/2cSUryt) to manage package versioning. 98 | - The code should be [PEP8](https://www.python.org/dev/peps/pep-0008) compliant, which can be tested by `make lint`. 99 | 100 | Help 101 | ==== 102 | 103 | ## Licence and Copyright 104 | 105 | (c) 2016 Oxford Nanopore Technologies Ltd. 106 | 107 | This Source Code Form is subject to the terms of the Mozilla Public 108 | License, v. 2.0. If a copy of the MPL was not distributed with this 109 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 110 | 111 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | *.rst 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | MODULE = wub 10 | 11 | # User-friendly check for sphinx-build 12 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 13 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 14 | endif 15 | 16 | # Internal variables.
17 | PAPEROPT_a4 = -D latex_paper_size=a4 18 | PAPEROPT_letter = -D latex_paper_size=letter 19 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 20 | # the i18n builder cannot share the environment and doctrees with the others 21 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 22 | 23 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 24 | 25 | help: 26 | @echo "Please use \`make ' where is one of" 27 | @echo " html to make standalone HTML files" 28 | @echo " dirhtml to make HTML files named index.html in directories" 29 | @echo " singlehtml to make a single large HTML file" 30 | @echo " pickle to make pickle files" 31 | @echo " json to make JSON files" 32 | @echo " htmlhelp to make HTML files and a HTML help project" 33 | @echo " qthelp to make HTML files and a qthelp project" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | rm -f $(MODULE).rst 54 | rm -f modules.rst 55 | 56 | html: 57 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 58 | @echo 59 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 60 | 61 | dirhtml: 62 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 63 | @echo 64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 65 | 66 | singlehtml: 67 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 68 | @echo 69 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 70 | 71 | pickle: 72 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 73 | @echo 74 | @echo "Build finished; now you can process the pickle files." 75 | 76 | json: 77 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 78 | @echo 79 | @echo "Build finished; now you can process the JSON files." 80 | 81 | htmlhelp: 82 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 83 | @echo 84 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 85 | ".hhp project file in $(BUILDDIR)/htmlhelp." 86 | 87 | qthelp: 88 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 89 | @echo 90 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 91 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 92 | @echo "# 93 | @echo "To view the help file:" 94 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/wub.qhc" 95 | 96 | devhelp: 97 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 98 | @echo 99 | @echo "Build finished." 
100 | @echo "To view the help file:" 101 | @echo "# mkdir -p $$HOME/.local/share/devhelp/wub" 102 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/wub" 103 | @echo "# devhelp" 104 | 105 | epub: 106 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 107 | @echo 108 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 109 | 110 | latex: 111 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 112 | @echo 113 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 114 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 115 | "(use \`make latexpdf' here to do that automatically)." 116 | 117 | latexpdf: 118 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 119 | @echo "Running LaTeX files through pdflatex..." 120 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 121 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 122 | 123 | latexpdfja: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through platex and dvipdfmx..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | text: 130 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 131 | @echo 132 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 133 | 134 | man: 135 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 136 | @echo 137 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 138 | 139 | texinfo: 140 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 141 | @echo 142 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 143 | @echo "Run \`make' in that directory to run these through makeinfo" \ 144 | "(use \`make info' here to do that automatically)." 145 | 146 | info: 147 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 148 | @echo "Running Texinfo files through makeinfo..." 149 | make -C $(BUILDDIR)/texinfo info 150 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 151 | 152 | gettext: 153 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 154 | @echo 155 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 156 | 157 | changes: 158 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 159 | @echo 160 | @echo "The overview file is in $(BUILDDIR)/changes." 161 | 162 | linkcheck: 163 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 164 | @echo 165 | @echo "Link check complete; look for any errors in the above output " \ 166 | "or in $(BUILDDIR)/linkcheck/output.txt." 167 | 168 | doctest: 169 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 170 | @echo "Testing of doctests in the sources finished, look at the " \ 171 | "results in $(BUILDDIR)/doctest/output.txt." 172 | 173 | xml: 174 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 175 | @echo 176 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 177 | 178 | pseudoxml: 179 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 180 | @echo 181 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
182 | -------------------------------------------------------------------------------- /docs/_static/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/docs/_static/.gitignore -------------------------------------------------------------------------------- /docs/cmd_tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Helper script to generate cmd_line.rst file for all scripts in bin which 4 | have and parser object defined in their global scope - taken from tang 5 | 6 | """ 7 | from __future__ import print_function 8 | import sys 9 | import os 10 | import imp 11 | 12 | scripts_rel = 'scripts' 13 | attr_name = 'parser' 14 | blacklist = ['__init__.py'] 15 | 16 | location = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..') 17 | scripts_abs = os.path.join(location, scripts_rel) 18 | scripts = sorted(filter(lambda s: s[0] != '.' and s not in blacklist, os.listdir(scripts_abs))) 19 | 20 | sys.stderr.write("Found following scripts:\n{}\n{}\n{}\n".format( 21 | location, scripts_abs, scripts 22 | )) 23 | 24 | 25 | print (""" 26 | .. _command_line_tools: 27 | 28 | Command line tools 29 | ================== 30 | """) 31 | 32 | for script in scripts: 33 | script_name, script_ext = os.path.splitext(script) 34 | if script_ext == '.pyc': 35 | continue 36 | 37 | try: 38 | mod_name = '{}.{}'.format(scripts_rel, script_name) 39 | # mod = __import__(mod_name, globals(), locals(), [attr_name]) 40 | mod = imp.load_source(script_name, os.path.join(scripts_abs, script)) 41 | script = script.replace('.py', '') 42 | 43 | print ('.. _{}:\n\n{}\n{}'.format(script, script, '-' * len(script))) 44 | if hasattr(mod, attr_name): 45 | print (""" 46 | .. argparse:: 47 | :ref: {}.{} 48 | :prog: {} 49 | """.format(mod_name, attr_name, script_name)) 50 | else: 51 | print ('No documentation available') 52 | 53 | except Exception as e: 54 | # Wha' yer' gonna do? 55 | sys.stderr.write('Error making docs for {}:\n{}\n'.format(script_name, e)) 56 | pass 57 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | Welcome to the documentation of the wub package! 3 | ================================================ 4 | 5 | 6 | Command line tools 7 | ------------------ 8 | 9 | .. toctree:: 10 | 11 | cmd_list 12 | 13 | 14 | Full API reference 15 | ------------------ 16 | 17 | .. toctree:: 18 | :maxdepth: 4 19 | 20 | usage 21 | modules 22 | 23 | 24 | Indices and tables 25 | ================== 26 | 27 | * :ref:`genindex` 28 | * :ref:`modindex` 29 | * :ref:`search` 30 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 
116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\wub.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\wub.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 
231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mock 2 | sphinx-argparse 3 | pysam 4 | tqdm 5 | biopython 6 | numpy 7 | pandas 8 | editdistance 9 | matplotlib 10 | h5py 11 | futures 12 | git+https://github.com/nanoporetech/wub.git 13 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | Usage 3 | ===== 4 | 5 | To use wub package in a project:: 6 | 7 | import wub 8 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = wub 3 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /scripts/_template_script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | # Parse command line arguments: 7 | parser = argparse.ArgumentParser( 8 | description='Template script.') 9 | parser.add_argument( 10 | '-i', metavar='input', type=str, help="Input.") 11 | 12 | 13 | if __name__ == '__main__': 14 | args = parser.parse_args() 15 | -------------------------------------------------------------------------------- /scripts/add_errors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from Bio.Seq import Seq 8 | 9 | from wub.simulate import seq as sim_seq 10 | from wub.util import seq as seq_util 11 | 12 | # Parse command line arguments: 13 | parser = argparse.ArgumentParser( 14 | description="""Add a specified number of errors to random sites for each input sequence.""") 15 | parser.add_argument('-n', metavar='nr_errors', type=int, 16 | help="Number of errors to introduce (0).", default=0) 17 | parser.add_argument('-t', metavar='error_type', type=str, 18 | help="Error type: substitution, insertion or deletion.", choices=['substitution', 'insertion', 'deletion'], default='substitution') 19 | parser.add_argument('input_fasta', nargs='?', help='Input fasta (default: stdin).', 20 | type=argparse.FileType('r'), default=sys.stdin) 21 | parser.add_argument('output_fasta', nargs='?', help='Output fasta (default: stdout)', 22 | type=argparse.FileType('w'), default=sys.stdout) 23 | 24 | 25 | def add_fixed_errors(input_iter, nr_errors, error_type): 26 | """Simulate sequencing errors for each SeqRecord object in the input iterator. 27 | 28 | :param input_iter: Iterator of SeqRecord objects. 29 | :para nr_errors: Number of errors to introduce. 30 | :param error_type: Error type: substitution, insertion or deletion. 31 | :returns: Generator of SeqRecord objects. 
32 | :rtype: generator 33 | """ 34 | for record in input_iter: 35 | mutated_seq = sim_seq.add_errors(record.seq, nr_errors, error_type) 36 | record.seq = Seq(mutated_seq) 37 | yield record 38 | 39 | 40 | if __name__ == '__main__': 41 | args = parser.parse_args() 42 | 43 | input_iterator = seq_util.read_seq_records( 44 | args.input_fasta, format='fasta') 45 | 46 | simulation_iterator = add_fixed_errors(input_iterator, args.n, args.t) 47 | 48 | seq_util.write_seq_records( 49 | simulation_iterator, args.output_fasta, format='fasta') 50 | -------------------------------------------------------------------------------- /scripts/annotate_length.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Add sequence length to sequence record descriptions.') 12 | parser.add_argument( 13 | '-i', metavar='in_format', type=str, help="Input format (fastq).", default='fastq') 14 | parser.add_argument( 15 | '-o', metavar='out_format', type=str, help="Output format (fastq).", default='fastq') 16 | parser.add_argument('input_fastx', nargs='?', help='Input file (default: stdin).', 17 | type=argparse.FileType('r'), default=sys.stdin) 18 | parser.add_argument('output_fastx', nargs='?', help='Output file (default: stdout).', 19 | type=argparse.FileType('w'), default=sys.stdout) 20 | 21 | 22 | def _record_annotate_length(input_iter): 23 | """ Add sequence length to record description. 24 | """ 25 | for record in input_iter: 26 | record.description = record.description + " seq_length={}".format(len(record.seq)) 27 | yield record 28 | 29 | 30 | if __name__ == '__main__': 31 | args = parser.parse_args() 32 | 33 | if args.i == 'fasta' and args.o == 'fastq': 34 | sys.stderr.write( 35 | "Cannot produce fastq output from fasta! Use fasta_to_mock_fastq.py instead.\n") 36 | sys.exit(1) 37 | 38 | input_iterator = seq_util.read_seq_records( 39 | args.input_fastx, format=args.i) 40 | 41 | output_iterator = _record_annotate_length(input_iterator) 42 | 43 | seq_util.write_seq_records(output_iterator, args.output_fastx, format=args.o) 44 | -------------------------------------------------------------------------------- /scripts/bam_accuracy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | import os 7 | import sys 8 | import pandas as pd 9 | from collections import OrderedDict 10 | from scipy.stats import gaussian_kde 11 | from scipy.optimize import minimize_scalar 12 | 13 | from wub.util import misc 14 | from wub.vis import report 15 | from wub.bam import stats 16 | 17 | # Parse command line arguments: 18 | parser = argparse.ArgumentParser( 19 | description="""Produce accuracy statistics of the input BAM file. Calculates global accuracy and identity and various per-read statistics. 20 | The input BAM file must be sorted by coordinates and indexed. 
21 | """) 22 | parser.add_argument( 23 | '-c', metavar='region', type=str, help="BAM region (None).", required=False, default=None) 24 | parser.add_argument( 25 | '-g', metavar='global_tsv', type=str, default=None, help="Tab separated file to save global statistics (None).", required=False) 26 | parser.add_argument( 27 | '-l', metavar='read_tsv', type=str, default=None, help="Tab separated file to save per-read statistics (None).", required=False) 28 | parser.add_argument( 29 | '-t', metavar='bam_tag', type=str, default=None, help="Dataset tag (BAM basename).", required=False) 30 | parser.add_argument( 31 | '-q', metavar='aqual', type=int, default=0, help="Minimum alignment quality (0).") 32 | parser.add_argument( 33 | '-e', action="store_true", default=False, help="Include hard and soft clipps in alignment length when calculating accuracy (False).") 34 | parser.add_argument( 35 | '-r', metavar='report_pdf', type=str, help="Report PDF (bam_accuracy.pdf).", default="bam_accuracy.pdf") 36 | parser.add_argument( 37 | '-p', metavar='results_pickle', type=str, help="Save pickled results in this file (None).", default=None) 38 | parser.add_argument( 39 | '-Q', action="store_true", help="Be quiet and do not print progress bar (False).", default=False) 40 | parser.add_argument( 41 | 'bam', metavar='bam', type=str, help="Input BAM file.") 42 | 43 | 44 | def estimate_mode(acc): 45 | """ Estimate the mode of a set of float values between 0 and 1. 46 | 47 | :param acc: Data. 48 | :returns: The mode of the sample 49 | :rtype: float 50 | """ 51 | # Taken from sloika. 52 | if len(acc) > 1: 53 | da = gaussian_kde(acc) 54 | optimization_result = minimize_scalar(lambda x: -da(x), bounds=(0, 1), method='brent') 55 | if optimization_result.success: 56 | try: 57 | mode = optimization_result.x[0] 58 | except IndexError: 59 | mode = optimization_result.x 60 | except TypeError: 61 | mode = optimization_result.x 62 | else: 63 | sys.stderr.write("Mode computation failed") 64 | mode = 0 65 | else: 66 | mode = acc[0] 67 | return mode 68 | 69 | 70 | def base_stats_qc(st, report): 71 | """ Plot base statistics. 72 | 73 | :param st: Statistics dict. 74 | :param report: Plotter object. 75 | :returns: None 76 | """ 77 | 78 | bs = st.copy() 79 | del bs['accuracy'] 80 | del bs['identity'] 81 | plotter.plot_bars_simple( 82 | bs, title="Basewise statistics", xlab="Type", ylab="Count") 83 | plotter.plot_bars_simple(OrderedDict([('Identity ({})'.format(st['identity']), st['identity']), ('Accuracy ({})'.format( 84 | st['accuracy']), st['accuracy'])]), title="Precision statistics: length weighted means", xlab="Type", ylab="Count") 85 | 86 | 87 | def read_precision_qc(st, report): 88 | """ Plot read precision statistics. 89 | 90 | :param st: Statistics dict. 91 | :param report: Plotter object. 92 | :returns: Mode of accuracy and identity. 
93 | :rtype: dict 94 | """ 95 | accuracy_mode = estimate_mode(st['accuracy']) 96 | report.plot_histograms(OrderedDict([('Accuracy', st[ 97 | 'accuracy'])]), title="Distribution of per-read accuracies", xlab="Accuracy", ylab="Count", legend=True, 98 | vlines={'Mode:{0:.4f}'.format(accuracy_mode): accuracy_mode}) 99 | 100 | identity_mode = estimate_mode(st['identity']) 101 | report.plot_histograms(OrderedDict([('Identity', st[ 102 | 'identity'])]), title="Distribution of per-read identitities", xlab="Identity", ylab="Count", legend=True, 103 | vlines={'Mode:{0:.4f}'.format(identity_mode): identity_mode}) 104 | 105 | modes = {'accuracy_mode': accuracy_mode, 'identity_mode': identity_mode} 106 | return modes 107 | 108 | 109 | if __name__ == '__main__': 110 | args = parser.parse_args() 111 | tag = args.t if args.t is not None else os.path.basename(args.bam) 112 | 113 | plotter = report.Report(args.r) 114 | 115 | read_stats = stats.read_stats( 116 | args.bam, region=args.c, min_aqual=args.q, with_clipps=args.e, verbose=not args.Q) 117 | read_stats['tag'] = tag 118 | base_stats = read_stats['base_stats'] 119 | precision_stats = read_stats['read_stats'] 120 | 121 | base_stats_qc(base_stats, plotter) 122 | modes = read_precision_qc(precision_stats, plotter) 123 | 124 | plotter.close() 125 | 126 | global_stats = OrderedDict([ 127 | ('Accuracy', [read_stats['base_stats']['accuracy']]), 128 | ('AccuracyMode', modes['accuracy_mode']), 129 | ('Identity', [read_stats['base_stats']['identity']]), 130 | ('IdentityMode', modes['identity_mode']), 131 | ('Mapped', [read_stats['mapped']]), 132 | ('Unmapped', [read_stats['unmapped']]), 133 | ('Tag', [read_stats['tag']]), ]) 134 | global_stats = pd.DataFrame(global_stats) 135 | 136 | if args.g is not None: 137 | global_stats.to_csv(args.g, sep="\t", index=False) 138 | 139 | if args.l is not None: 140 | read_df = pd.DataFrame(precision_stats) 141 | read_df.to_csv(args.l, sep="\t", index=False) 142 | 143 | if args.p is not None: 144 | misc.pickle_dump(read_stats, args.p) 145 | -------------------------------------------------------------------------------- /scripts/bam_alignment_length.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | import sys 7 | import pandas as pd 8 | from collections import OrderedDict 9 | import tqdm 10 | 11 | from wub.bam import common as bam_common 12 | 13 | # Parse command line arguments: 14 | parser = argparse.ArgumentParser( 15 | description="""Produce a tab separated file of alignment lengths and other information. 16 | Rows are sorted by number of aligned reference bases unless the -x option is specified. 
17 | """) 18 | parser.add_argument( 19 | '-t', metavar='tsv_file', type=str, help="Tab separated file to save alignment lengths (bam_alignment_length.tsv).", required=False, default="bam_alignment_length.tsv") 20 | parser.add_argument( 21 | '-q', metavar='aqual', type=int, default=0, help="Minimum alignment quality (0).") 22 | parser.add_argument( 23 | '-x', action="store_true", help="Sort by number of read bases instead of number of aligned reference bases.", default=False) 24 | parser.add_argument( 25 | '-Q', action="store_true", help="Be quiet and do not print progress bar (False).", default=False) 26 | parser.add_argument( 27 | 'bam', metavar='bam', type=str, help="Input BAM file.") 28 | 29 | 30 | if __name__ == '__main__': 31 | args = parser.parse_args() 32 | verbose = not args.Q 33 | 34 | bam_reader = bam_common.pysam_open(args.bam, in_format='BAM') 35 | 36 | if verbose: 37 | sys.stdout.write( 38 | "Gathering read and alignment lengths from file: {}\n".format(args.bam)) 39 | try: 40 | total_reads = bam_reader.mapped + bam_reader.unmapped 41 | except: 42 | total_reads = None 43 | bam_reader = tqdm.tqdm(bam_reader, total=total_reads) 44 | 45 | read_names = [] 46 | ref_names = [] 47 | ref_lengths = [] 48 | read_lengths = [] 49 | aln_lengths = [] 50 | mapping_quals = [] 51 | 52 | # Gather alignment information: 53 | for record in bam_reader: 54 | if (not record.is_unmapped) and (record.mapping_quality > args.q): 55 | read_names.append(record.query_name) 56 | ref_names.append(record.reference_name) 57 | read_lengths.append(len(record.query_sequence)) 58 | aln_lengths.append(record.query_alignment_length) 59 | ref_lengths.append(record.reference_length) 60 | mapping_quals.append(record.mapping_quality) 61 | 62 | # Construct data frame: 63 | data = OrderedDict([('read_name', read_names), 64 | ('aligned_ref_bases', ref_lengths), 65 | ('aligned_read_bases', aln_lengths), 66 | ('read_length', read_lengths), 67 | ('reference', ref_names), 68 | ('mapping_quality', mapping_quals) 69 | ]) 70 | 71 | df = pd.DataFrame(data) 72 | del data, read_names, ref_names, mapping_quals 73 | del read_lengths, aln_lengths, ref_lengths 74 | 75 | # Sort data frame and save tsv: 76 | sort_by = 'aligned_ref_bases' 77 | if args.x: 78 | sort_by = 'aligned_read_bases' 79 | 80 | df.sort_values([sort_by], ascending=[0], inplace=True) 81 | df.to_csv(args.t, sep="\t", index=False) 82 | -------------------------------------------------------------------------------- /scripts/bam_alignments_compare.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import six 5 | import argparse 6 | import os 7 | import pandas as pd 8 | from collections import OrderedDict 9 | from wub.util import misc 10 | from wub.bam import compare as bam_compare 11 | from wub.vis import report 12 | 13 | # Parse command line arguments: 14 | parser = argparse.ArgumentParser( 15 | description="""Compare alignments stored in two BAM files. 16 | The two BAM files must have the same set of reads in the same order (name sorted). 
17 | """) 18 | parser.add_argument( 19 | '-w', metavar='coarse_tolerance', type=int, help="Tolerance when performing coarse comparison of alignments (50).", default=50) 20 | parser.add_argument( 21 | '-g', action="store_true", help="Do strict comparison of alignment flags.", default=False) 22 | parser.add_argument( 23 | '-r', metavar='report_pdf', type=str, help="Report PDF (bam_alignments_compare.pdf).", default="bam_alignments_compare.pdf") 24 | parser.add_argument( 25 | '-p', metavar='results_pickle', type=str, help="Save pickled results in this file (bam_alignments_compare.pk).", default="bam_alignments_compare.pk") 26 | parser.add_argument( 27 | '-t', metavar='tsv_file', type=str, help="Save results in tsv format in this file (None).", default=None) 28 | parser.add_argument( 29 | '-f', metavar='format', type=str, help="Input format (BAM).", default='BAM') 30 | parser.add_argument( 31 | '-Q', action="store_true", help="Be quiet and do not print progress bar (False).", default=False) 32 | parser.add_argument( 33 | 'bam_one', metavar='bam_one', type=str, help="First input BAM file.") 34 | parser.add_argument( 35 | 'bam_two', metavar='bam_two', type=str, help="Second input BAM file.") 36 | 37 | if __name__ == '__main__': 38 | args = parser.parse_args() 39 | 40 | stats = bam_compare.bam_compare( 41 | args.bam_one, args.bam_two, coarse_tolerance=args.w, strict_flags=args.g, in_format=args.f, verbose=not args.Q) 42 | 43 | plotter = report.Report(args.r) 44 | 45 | query_stats = OrderedDict((field, stats[field]) for field in ( 46 | 'TotalQueries', 'CoarseMatches', 'DirectionMismatch', 'SeqMismatch')) 47 | plotter.plot_bars_simple( 48 | query_stats, title="Per query statistics", xlab="Field", ylab="Count", auto_limit=False) 49 | 50 | query_stats = OrderedDict((field, stats[field]) for field in ( 51 | 'TotalQueries', 'StrictFlagMismatch', 'RefMismatch')) 52 | plotter.plot_bars_simple( 53 | query_stats, title="Per query statistics (continued)", xlab="Field", ylab="Count", auto_limit=False) 54 | 55 | aligned_bases = OrderedDict( 56 | (os.path.basename(bam), stats[bam]['AlignedBases']) for bam in stats['BamFiles']) 57 | plotter.plot_bars_simple( 58 | aligned_bases, title="Aligned bases", xlab="BAM", ylab="Bases", auto_limit=False) 59 | 60 | aligned_queries = OrderedDict( 61 | (os.path.basename(bam), stats[bam]['AlignedQueries']) for bam in stats['BamFiles']) 62 | plotter.plot_bars_simple( 63 | aligned_queries, title="Aligned queries", xlab="BAM", ylab="Bases", auto_limit=False) 64 | 65 | unaligned_queries = OrderedDict( 66 | (os.path.basename(bam), stats[bam]['UnalignedQueries']) for bam in stats['BamFiles']) 67 | plotter.plot_bars_simple( 68 | unaligned_queries, title="Unaligned queries", xlab="BAM", ylab="Bases", auto_limit=False) 69 | 70 | unaligned_queries = OrderedDict( 71 | (os.path.basename(bam), stats[bam]['HardClippedBases']) for bam in stats['BamFiles']) 72 | plotter.plot_bars_simple( 73 | unaligned_queries, title="Hard clipped bases", xlab="BAM", ylab="Bases", auto_limit=False) 74 | 75 | base_stats = OrderedDict((field, stats[field]) 76 | for field in ('CommonAlignedBases', 'CommonMatchingBases')) 77 | plotter.plot_bars_simple( 78 | base_stats, title="Common aligned base statistics", xlab="Field", ylab="Count", auto_limit=False) 79 | 80 | sim_stats = OrderedDict((field, stats[field]) for field in ['AlignedSimilarity']) 81 | plotter.plot_bars_simple( 82 | sim_stats, title="Proportion of bases with matching alignment ({})".format(sim_stats.values()[0]), xlab="Field", ylab="Count", 
auto_limit=False) 83 | 84 | plotter.plot_histograms({'PerQuerySim': stats[ 85 | 'PerQueryBaseSim']}, title="Distribution of percent bases with matched alignment", 86 | xlab="Percent bases with matched alignment", ylab="Count", legend=False) 87 | 88 | plotter.plot_histograms({'PerQuerySimClipped': stats[ 89 | 'PerQueryBaseSimClipped']}, title="Distribution of percent bases with matched alignment (with clipping)", 90 | xlab="Percent bases with matched alignment", ylab="Count", legend=False) 91 | 92 | plotter.close() 93 | 94 | if args.p is not None: 95 | misc.pickle_dump(dict(stats), args.p) 96 | 97 | if args.t is not None: 98 | data_map = stats.copy() 99 | del data_map['PerQueryBaseSim'] 100 | del data_map['PerQueryBaseSimClipped'] 101 | for bam in data_map['BamFiles']: 102 | del data_map[bam] 103 | del data_map['BamFiles'] 104 | data_map = OrderedDict((key, [value]) for key, value in six.iteritems(data_map)) 105 | data_frame = pd.DataFrame(data_map) 106 | data_frame.to_csv(args.t, sep="\t", index=False) 107 | -------------------------------------------------------------------------------- /scripts/bam_count_reads.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import six 5 | import argparse 6 | import sys 7 | import pandas as pd 8 | from collections import OrderedDict, defaultdict 9 | from wub.bam import read_counter 10 | from wub.util import misc 11 | from wub.util import seq as seq_util 12 | 13 | import tqdm 14 | 15 | # Parse command line arguments: 16 | parser = argparse.ArgumentParser( 17 | description="""Count reads mapping to each reference in a BAM file.""") 18 | parser.add_argument( 19 | '-a', metavar='min_aqual', type=int, help="Minimum mapping quality (0).", default=0) 20 | parser.add_argument( 21 | '-f', metavar='in_format', type=str, help="Input format (BAM).", default='BAM') 22 | parser.add_argument( 23 | '-z', metavar='ref_fasta', type=str, help="Reference fasta. GC content and length columns are added if present (None).", default=None) 24 | parser.add_argument( 25 | '-k', metavar="words", type=str, help="Include word frequencies of specifed length in output (None).", default=None) 26 | parser.add_argument( 27 | '-g', action="store_true", help="Include mean GC content of reads mapped to each reference (False).", default=False) 28 | parser.add_argument( 29 | '-p', metavar='results_pickle', type=str, help="Save pickled results in this file (None).", default=None) 30 | parser.add_argument( 31 | '-t', metavar='tsv_file', type=str, help="Save results in tsv format in this file (bam_count_reads.tsv).", default="bam_count_reads.tsv") 32 | parser.add_argument( 33 | '-Q', action="store_true", help="Be quiet and do not print progress bar (False).", default=False) 34 | parser.add_argument( 35 | '-R', action="store_true", help="Count reads from SAM stream in stdin. Only read count fields are written. Header required! (False).", default=False) 36 | parser.add_argument( 37 | '-F', metavar='yield_freq', type=int, help="Yield counts after every -Fth mapped record when doing online counting (100).", default=100) 38 | parser.add_argument('bam', nargs='?', help='Input file (default: stdin).', 39 | type=argparse.FileType('r'), default=sys.stdin) 40 | 41 | 42 | def _offline_counter(args): 43 | """ Offline counting from SAM/BAM file. 
""" 44 | # Offline counting from SAM/BAM file: 45 | counts, gc_means = read_counter.count_reads( 46 | args.bam.name, in_format=args.f, min_aln_qual=args.a, verbose=not args.Q, reads_gc=args.g) 47 | counts = OrderedDict(six.iteritems(counts)) 48 | 49 | if args.k is not None: 50 | calc_words = [int(k) for k in args.k.split(",")] 51 | 52 | data = OrderedDict() 53 | 54 | # Calculate sequence properties: 55 | if args.z is not None: 56 | lengths, gc_contents, word_freqs = {}, {}, defaultdict( 57 | lambda: defaultdict(dict)) 58 | ref_iter = seq_util.read_seq_records(args.z) 59 | if not args.Q: 60 | sys.stderr.write("Calculating sequence features:\n") 61 | ref_iter = tqdm.tqdm(ref_iter) 62 | 63 | for ref in ref_iter: 64 | # Augment counts dictionary with missing reference entries: 65 | if ref.id not in counts: 66 | counts[ref.id] = 0 67 | lengths[ref.id] = len(ref) 68 | gc_contents[ref.id] = seq_util.gc_content(str(ref.seq)) 69 | if args.k is not None: 70 | for word_size in calc_words: 71 | bf = seq_util.word_composition(ref.seq, word_size) 72 | for word, count in six.iteritems(bf): 73 | word_freqs[word_size][ref.id][ 74 | word] = float(count) / len(ref) 75 | 76 | data['Length'] = [lengths[tr] for tr in six.iterkeys(counts)] 77 | data['GC_content'] = [gc_contents[tr] for tr in six.iterkeys(counts)] 78 | 79 | data['Reference'] = list(counts.keys()) 80 | data['Count'] = list(counts.values()) 81 | 82 | # Calculate word frequencies: 83 | if args.k is not None and args.z: 84 | for ks in calc_words: 85 | for word in next(iter((word_freqs[ks].values()))).keys(): 86 | tmp = [] 87 | for ref in counts.keys(): 88 | tmp.append(word_freqs[ks][ref][word]) 89 | data[word] = tmp 90 | 91 | data_frame = pd.DataFrame(data) 92 | 93 | if args.g: 94 | gc_frame = pd.DataFrame({'Reference': list(gc_means.keys()), 'ReadGC': list(gc_means.values())}) 95 | data_frame = pd.merge(data_frame, gc_frame, how='inner', on='Reference') 96 | 97 | data_frame = data_frame.sort_values(['Count', 'Reference'], ascending=False) 98 | data_frame = data_frame[data_frame.Count > 0] 99 | 100 | if args.t is not None: 101 | data_frame.to_csv(args.t, sep='\t', index=False) 102 | 103 | if args.p is not None: 104 | misc.pickle_dump(data, args.p) 105 | 106 | 107 | def _online_counter(args): 108 | """ Online counting from SAM stream. """ 109 | # Open counts stream: 110 | counts_iter = read_counter.count_reads_realtime( 111 | alignment_file='-', in_format=args.f, min_aln_qual=args.a, verbose=not args.Q, yield_freq=args.F) 112 | 113 | for counts in counts_iter: 114 | data_frame = pd.DataFrame( 115 | OrderedDict([('Reference', list(counts.keys())), ('Count', list(counts.values()))])) 116 | data_frame = data_frame.sort_values(['Count', 'Reference'], ascending=False) 117 | 118 | if args.t is not None: 119 | data_frame.to_csv(args.t, sep='\t', index=False) 120 | if args.p is not None: 121 | misc.pickle_dump(counts, args.p) 122 | 123 | 124 | if __name__ == '__main__': 125 | args = parser.parse_args() 126 | 127 | if not args.R: 128 | # Offline counting from SAM/BAM file: 129 | if args.bam == sys.stdin: 130 | raise Exception("Input file not specified!") 131 | _offline_counter(args) 132 | else: 133 | # Online counting from SAM on stdin. 
134 | _online_counter(args) 135 | -------------------------------------------------------------------------------- /scripts/bam_cov.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import tqdm 6 | import sys 7 | from Bio import SeqIO 8 | from wub.util import misc 9 | from wub.bam import common as bam_common 10 | 11 | # Parse command line arguments: 12 | parser = argparse.ArgumentParser( 13 | description="""Produce refrence coverage table.""") 14 | parser.add_argument( 15 | '-f', metavar='reference', type=str, help="Reference fasta.", required=True) 16 | parser.add_argument( 17 | '-c', metavar='region', type=str, help="BAM region (None).", required=False, default=None) 18 | parser.add_argument( 19 | '-t', metavar='tsv', type=str, default="bam_cov.tsv", help="Output TSV (bam_cov.tsv).", required=False) 20 | parser.add_argument( 21 | '-q', metavar='aqual', type=int, default=0, help="Minimum alignment quality (0).") 22 | parser.add_argument( 23 | '-Q', action="store_true", help="Be quiet and do not show progress bars.", default=False) 24 | parser.add_argument('bam', metavar='bam', type=str, help="Input BAM file.") 25 | 26 | 27 | def _process_bam(bam, out_tsv, chrom_lengths, region=None, min_aqual=0, verbose=True): 28 | bam_reader = bam_common.pysam_open(bam, in_format='BAM') 29 | ue = True 30 | if region is not None: 31 | ue = False 32 | bam_iter = bam_reader.fetch(region=region, until_eof=ue) 33 | 34 | try: 35 | total_reads = bam_reader.mapped + bam_reader.unmapped 36 | except: 37 | total_reads = None 38 | if verbose and region is None: 39 | sys.stdout.write( 40 | "Gathering fragment statistics from file: {}\n".format(bam)) 41 | bam_iter = tqdm.tqdm(bam_iter, total=total_reads) 42 | 43 | tsv = open(out_tsv, "w") 44 | tsv.write( 45 | "Read\tRef\tStrand\tRefCov\tReadCov\tReadLength\tReadAlnLength\tRefLength\tRefAlnLength\tMapQual\n") 46 | 47 | for r in bam_iter: 48 | # Skip unmapped reads: 49 | if r.is_unmapped: 50 | continue 51 | # Skip if mapping quality is too low: 52 | if r.mapq < min_aqual: 53 | continue 54 | strand = '-' if r.is_reverse else '+' 55 | ref = r.reference_name 56 | ref_cov = r.reference_length / float(chrom_lengths[ref]) 57 | read = r.query_name 58 | read_length = r.infer_read_length() 59 | mapq = r.mapping_quality 60 | read_aln_len = r.query_alignment_length 61 | read_cov = read_aln_len / float(read_length) 62 | ref_aln_length = r.reference_length 63 | 64 | tsv.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(read, ref, strand, ref_cov, 65 | read_cov, read_length, read_aln_len, chrom_lengths[ref], ref_aln_length, mapq)) 66 | 67 | tsv.flush() 68 | tsv.close() 69 | 70 | 71 | if __name__ == '__main__': 72 | args = parser.parse_args() 73 | verbose = not args.Q 74 | 75 | # Load reference lengths: 76 | references = SeqIO.index(args.f, format='fasta') 77 | chrom_lengths = {name: len(so) for name, so in references.items()} 78 | 79 | # Parse fragments: 80 | _process_bam(args.bam, args.t, chrom_lengths, args.c, args.q, verbose=verbose) 81 | -------------------------------------------------------------------------------- /scripts/bam_fill_unaligned.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | from wub.bam import common as bam_common 7 | from wub.bam import sam_writer 8 | from wub.util import seq as seq_util 9 | 10 | # Parse command line 
arguments: 11 | parser = argparse.ArgumentParser( 12 | description="""Generate SAM records for the reads present in the input fastq but missing from 13 | the input SAM/BAM. 14 | """) 15 | parser.add_argument( 16 | '-f', metavar='format', type=str, help="Input/output format (SAM).", default='SAM') 17 | parser.add_argument( 18 | '-q', metavar='fastq', type=str, help="Input fastq.", required=True) 19 | parser.add_argument( 20 | 'infile', metavar='input_file', type=str, help="Input file.") 21 | parser.add_argument( 22 | 'outfile', metavar='output_file', type=str, help="Output SAM file.") 23 | 24 | if __name__ == '__main__': 25 | args = parser.parse_args() 26 | 27 | input_iter = bam_common.pysam_open(args.infile, args.f).fetch(until_eof=True) 28 | 29 | # Get SAM record names: 30 | sam_names = [record.query_name for record in input_iter] 31 | 32 | writer = sam_writer.SamWriter(args.outfile) 33 | 34 | for read in seq_util.read_seq_records(args.q, 'fastq'): 35 | if read.id not in sam_names: 36 | qual = seq_util.quality_array_to_string(read.letter_annotations["phred_quality"]) 37 | sam_record = writer.new_sam_record(qname=read.id, flag=4, rname="*", pos=0, mapq=0, cigar="*", rnext="*", 38 | pnext=0, tlen=0, seq=str(read.seq), qual=qual, tags="AS:i:0") 39 | writer.write(sam_record) 40 | 41 | writer.close() 42 | -------------------------------------------------------------------------------- /scripts/bam_gc_vs_qual.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import print_function 5 | import argparse 6 | import tqdm 7 | 8 | import os 9 | import sys 10 | import pandas as pd 11 | 12 | import pysam 13 | from wub.vis import report 14 | from wub.util import seq as seq_util 15 | 16 | import warnings 17 | warnings.simplefilter("ignore") 18 | import seaborn as sns 19 | 20 | # Parse command line arguments: 21 | parser = argparse.ArgumentParser( 22 | description=""" 23 | Produce a plot of GC content of aligned read and reference portion versus their mean quality values. 24 | """) 25 | parser.add_argument('-f', metavar='reference', type=str, help="Reference fasta.", required=True) 26 | parser.add_argument( 27 | '-q', metavar='aqual', type=int, default=0, help="Minimum alignment quality (0).") 28 | parser.add_argument( 29 | '-r', metavar='report_pdf', type=str, help="Report PDF (bam_gc_vs_qual.pdf).", default="bam_gc_vs_qual.pdf") 30 | parser.add_argument( 31 | '-t', metavar='tsv', type=str, help="Tab separated file to save results (bam_gc_vs_qual.tsv).", default="bam_gc_vs_qual.tsv") 32 | parser.add_argument( 33 | '-Q', action="store_true", help="Be quiet and do not show progress bars.", default=False) 34 | parser.add_argument( 35 | 'bam', metavar='bam', type=str, help="Input BAM file.") 36 | 37 | 38 | def _process_reads(alignment_file, refs, in_format='BAM', min_aln_qual=0, verbose=False): 39 | """ 40 | Gather information about the GC content and mean quality value of aligned portions of the reads. 41 | """ 42 | if in_format == 'BAM': 43 | mode = "rb" 44 | elif in_format == 'SAM': 45 | mode = "r" 46 | else: 47 | raise Exception("Invalid format: {}".format(in_format)) 48 | 49 | aln_iter = pysam.AlignmentFile(alignment_file, mode) 50 | 51 | if verbose and in_format == "BAM": 52 | try: 53 | total_reads = aln_iter.mapped + aln_iter.unmapped 54 | except: 55 | total_reads = None 56 | sys.stdout.write( 57 | "Gathering GC content vs. 
quality information from file: {}\n".format(alignment_file)) 58 | if in_format == "BAM": 59 | aln_iter = tqdm.tqdm(aln_iter, total=total_reads) 60 | 61 | rgcs, gcs, quals = [], [], [] 62 | ref_lengths = [] 63 | for segment in aln_iter: 64 | if segment.is_unmapped: 65 | continue 66 | if segment.mapping_quality >= min_aln_qual: 67 | # Calculate GC content of aligned read portion: 68 | aln_seq = segment.query_alignment_sequence 69 | gcs.append(seq_util.gc_content(aln_seq)) 70 | 71 | # Calculate GC content of aligned reference: 72 | ref_seq = refs[segment.reference_name].seq[segment.reference_start:segment.reference_end] 73 | rgcs.append(seq_util.gc_content(ref_seq)) 74 | ref_lengths.append(segment.reference_length) 75 | 76 | # Calculate mean quality score of aligned read portion: 77 | aln_quals = segment.query_alignment_qualities 78 | quals.append(seq_util.mean_qscore(aln_quals, qround=False)) 79 | 80 | aln_iter.close() 81 | 82 | df = pd.DataFrame({'GC_content': gcs, 'MeanQuality': quals, 'GC_content_ref': rgcs, 'RefAlnLength': ref_lengths}) 83 | 84 | return df 85 | 86 | 87 | if __name__ == '__main__': 88 | args = parser.parse_args() 89 | verbose = not args.Q 90 | tag = os.path.basename(args.bam) 91 | 92 | references = seq_util.read_seq_records_dict(args.f) 93 | data = _process_reads(args.bam, references, min_aln_qual=args.q, verbose=verbose) 94 | 95 | data.to_csv(args.t, sep="\t", index=False) 96 | 97 | # Plot GC content of aligned read portion vs. mean quality. 98 | plotter = report.Report(args.r) 99 | sns.jointplot("GC_content", "MeanQuality", kind="reg", data=data) 100 | plotter.plt.tight_layout() 101 | plotter.pages.savefig() 102 | plotter.plt.clf() 103 | 104 | # Plot GC content of aligned reference portion vs. mean quality. 105 | sns.jointplot("GC_content_ref", "MeanQuality", kind="reg", data=data) 106 | plotter.plt.tight_layout() 107 | plotter.pages.savefig() 108 | plotter.plt.clf() 109 | 110 | plotter.close() 111 | -------------------------------------------------------------------------------- /scripts/bam_ref_base_coverage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import six 5 | import argparse 6 | import os 7 | import pandas as pd 8 | from Bio import SeqIO 9 | from wub.bam import stats as bam_stats 10 | 11 | # Parse command line arguments: 12 | parser = argparse.ArgumentParser( 13 | description="""Calculate percent covered reference lengths.""") 14 | parser.add_argument( 15 | '-f', metavar='reference', type=str, help="Reference fasta.", required=True) 16 | parser.add_argument( 17 | '-c', metavar='region', type=str, help="BAM region (None).", required=False, default=None) 18 | parser.add_argument( 19 | '-t', metavar='tsv', type=str, default="bam_ref_base_coverage.tsv", help="Output tab separated file (bam_ref_base_coverage.tsv).", required=False) 20 | parser.add_argument( 21 | '-m', metavar='min_cov', type=int, default=1, help="Minimum base coverage for a position to be counted (1).") 22 | parser.add_argument( 23 | '-Q', action="store_true", help="Be quiet and do not show progress bars.", default=False) 24 | parser.add_argument( 25 | 'bam', metavar='bam', type=str, help="Input BAM file.") 26 | 27 | 28 | if __name__ == '__main__': 29 | args = parser.parse_args() 30 | verbose = not args.Q 31 | tag = args.t 32 | if tag is None: 33 | tag = os.path.basename(args.bam) 34 | 35 | # Load reference lengths: 36 | references = SeqIO.index(args.f, format='fasta') 37 | chrom_lengths = {name: 
len(so) for name, so in six.iteritems(references)} 38 | # Parse fragments: 39 | st = bam_stats.pileup_stats(args.bam, region=args.c, verbose=verbose, with_quals=False)['coverage'] 40 | 41 | res = {} 42 | for chrom, chrom_length in six.iteritems(chrom_lengths): 43 | # No coverage: 44 | if chrom not in st: 45 | res[chrom] = 0.0 46 | else: 47 | nr_hits = 0 48 | # Iterate over covered positions and count valid hits: 49 | for pos, cov in six.iteritems(st[chrom]): 50 | if cov >= args.m: 51 | nr_hits += 1 52 | # Calculate percent covered reference length: 53 | res[chrom] = float(nr_hits * 100) / chrom_length 54 | 55 | # Convert results to sorted data frame: 56 | df = pd.DataFrame({'Chrom': list(res.keys()), 'Percent_cov': list(res.values())}) 57 | df.sort_values(['Percent_cov'], ascending=[0], inplace=True) 58 | df.to_csv(args.t, sep="\t", index=False) 59 | -------------------------------------------------------------------------------- /scripts/bam_ref_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | import pandas as pd 8 | import pysam 9 | import tqdm 10 | from collections import OrderedDict 11 | 12 | # Parse command line arguments: 13 | parser = argparse.ArgumentParser( 14 | description="""Produce a tab separated file with read identifiers and the corresponding references, sorted by reference.""") 15 | parser.add_argument( 16 | '-t', metavar='read_tsv', type=str, default="bam_ref_tab.tsv", help="Tab separated file to save reference table.", required=False) 17 | parser.add_argument( 18 | '-Q', action="store_true", help="Be quiet and do not print progress bar (False).", default=False) 19 | parser.add_argument( 20 | '-s', action="store_true", help="Save read strand in output (False).", default=False) 21 | parser.add_argument( 22 | 'bam', metavar='bam', type=str, help="Input BAM file.") 23 | 24 | 25 | def process_reads(alignment_file, in_format='BAM', save_strand=False, verbose=False): 26 | """Process reads and extract the corresponding reference. 27 | 28 | :param alignment_file: BAM file. 29 | :param verbose: Verbosity flag. 30 | :returns: pandas dataframe with reads and references. 
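    :param in_format: Input format, 'BAM' or 'SAM'.
    :param save_strand: If True, also record the mapping strand of each read.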
31 | :rtype: dict 32 | """ 33 | reads, refs, strands = [], [], [] 34 | if in_format == 'BAM': 35 | mode = "rb" 36 | elif in_format == 'SAM': 37 | mode = "r" 38 | else: 39 | raise Exception("Invalid format: {}".format(in_format)) 40 | 41 | aln_iter = pysam.AlignmentFile(alignment_file, mode) 42 | 43 | if verbose and in_format == "BAM": 44 | try: 45 | total_reads = aln_iter.mapped + aln_iter.unmapped 46 | except: 47 | total_reads = None 48 | sys.stdout.write( 49 | "Gathering read statistics from file: {}\n".format(alignment_file)) 50 | if in_format == "BAM": 51 | aln_iter = tqdm.tqdm(aln_iter, total=total_reads) 52 | 53 | for segment in aln_iter: 54 | if segment.is_unmapped: 55 | continue 56 | refs.append(segment.reference_name) 57 | reads.append(segment.query_name) 58 | if save_strand: 59 | strand = "-" if segment.is_reverse else "+" 60 | strands.append(strand) 61 | 62 | aln_iter.close() 63 | 64 | data = OrderedDict([('Read', reads), ('Reference', refs)]) 65 | if save_strand: 66 | data['Strand'] = strands 67 | df = pd.DataFrame(data) 68 | 69 | return df 70 | 71 | 72 | if __name__ == '__main__': 73 | args = parser.parse_args() 74 | verbose = not args.Q 75 | 76 | df = process_reads(args.bam, save_strand=args.s, verbose=verbose) 77 | df.sort_values(['Reference'], ascending=[0], inplace=True) 78 | df.to_csv(args.t, sep="\t", index=False) 79 | -------------------------------------------------------------------------------- /scripts/bam_score_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | import pysam 7 | from wub.bam import filter as bam_filter 8 | from wub.bam import common as bam_common 9 | 10 | # Parse command line arguments: 11 | parser = argparse.ArgumentParser( 12 | description="""Filter SAM/BAM records by score or other criteria. 13 | WARNING: the input records must be sorted by name or the filtering will not work 14 | as expected. 
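    Name-sorted input can be produced with, for example, 'samtools sort -n'.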
15 | """) 16 | parser.add_argument( 17 | '-f', metavar='format', type=str, help="Input/output format (SAM).", default='SAM') 18 | parser.add_argument( 19 | '-s', metavar='strategy', type=str, help="Filtering strategy: top_per_query, query_coverage, ref_coverage (top_per_query).", 20 | default="top_per_query", choices=['top_per_query', 'query_coverage', 'ref_coverage']) 21 | parser.add_argument( 22 | '-q', metavar='query_cover', type=float, help="Minimum query coverage fraction (0.8).", default=0.8) 23 | parser.add_argument( 24 | 'infile', metavar='input_file', type=str, help="Input file.") 25 | parser.add_argument( 26 | 'outfile', metavar='output_file', type=str, help="Output SAM file.") 27 | 28 | if __name__ == '__main__': 29 | args = parser.parse_args() 30 | 31 | input_iter = bam_common.pysam_open(args.infile, args.f) 32 | 33 | if args.s == 'top_per_query': 34 | output_iter = bam_filter.filter_top_per_query(input_iter.fetch(until_eof=True)) 35 | elif args.s == 'query_coverage': 36 | output_iter = bam_filter.filter_query_coverage(input_iter.fetch(until_eof=True), args.q) 37 | elif args.s == 'ref_coverage': 38 | output_iter = bam_filter.filter_ref_coverage(input_iter.fetch(until_eof=True), args.q, input_iter.header) 39 | else: 40 | raise Exception('Filtering strategy not implemented!') 41 | 42 | writer = pysam.AlignmentFile(args.outfile, "wh", template=input_iter, header=input_iter.header) 43 | for record in output_iter: 44 | writer.write(record) 45 | 46 | writer.close() 47 | -------------------------------------------------------------------------------- /scripts/bam_soft_clips_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | import pandas as pd 8 | import pysam 9 | import tqdm 10 | from collections import OrderedDict 11 | 12 | # Parse command line arguments: 13 | parser = argparse.ArgumentParser( 14 | description="""Produce a tab separated file with read identifiers and number of soft clipped bases at each end (relative to the original sequence in the fastq).""") 15 | parser.add_argument( 16 | '-t', metavar='tsv', type=str, default="bam_soft_clips_tab.tsv", help="Output tab separated file.", required=False) 17 | parser.add_argument( 18 | '-Q', action="store_true", help="Be quiet and do not print progress bar (False).", default=False) 19 | parser.add_argument( 20 | 'bam', metavar='bam', type=str, help="Input BAM file.") 21 | 22 | 23 | def _get_clips(cigar, is_reverse): 24 | """ Get clips at the start and end relative to the original sequence. """ 25 | clip_start, clip_end = 0, 0 26 | 27 | # Consider the first CIGAR tuple: 28 | if cigar[0][0] == 4: 29 | clip_start = cigar[0][1] 30 | 31 | # Consider the last CIGAR tuple: 32 | if cigar[-1][0] == 4: 33 | clip_end = cigar[-1][1] 34 | 35 | # Reverse orientation if necessary: 36 | if is_reverse: 37 | clip_start, clip_end = clip_end, clip_start 38 | return clip_start, clip_end 39 | 40 | 41 | def process_reads(alignment_file, in_format='BAM', verbose=False): 42 | """Process reads and extract the corresponding information. 43 | 44 | :param alignment_file: BAM file. 45 | :param verbose: Verbosity flag. 46 | :returns: pandas dataframe with reads and soft clip lengths. 
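    :param in_format: Input format, 'BAM' or 'SAM'.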
47 | :rtype: pandas.DataFrame 48 | """ 49 | reads, strand, clip_start, clip_end = [], [], [], [] 50 | if in_format == 'BAM': 51 | mode = "rb" 52 | elif in_format == 'SAM': 53 | mode = "r" 54 | else: 55 | raise Exception("Invalid format: {}".format(in_format)) 56 | 57 | aln_iter = pysam.AlignmentFile(alignment_file, mode) 58 | 59 | if verbose and in_format == "BAM": 60 | try: 61 | total_reads = aln_iter.mapped + aln_iter.unmapped 62 | except: 63 | total_reads = None 64 | sys.stdout.write( 65 | "Gathering read statistics from file: {}\n".format(alignment_file)) 66 | if in_format == "BAM": 67 | aln_iter = tqdm.tqdm(aln_iter, total=total_reads) 68 | 69 | for segment in aln_iter: 70 | if segment.is_unmapped: 71 | continue 72 | reads.append(segment.query_name) 73 | strand.append('-' if segment.is_reverse else '+') 74 | cs, ce = _get_clips(segment.cigartuples, segment.is_reverse) 75 | clip_start.append(cs) 76 | clip_end.append(ce) 77 | 78 | aln_iter.close() 79 | 80 | data = OrderedDict([('Read', reads), ('Strand', strand), ('ClipStart', clip_start), ('ClipEnd', clip_end)]) 81 | df = pd.DataFrame(data) 82 | 83 | return df 84 | 85 | 86 | if __name__ == '__main__': 87 | args = parser.parse_args() 88 | verbose = not args.Q 89 | 90 | df = process_reads(args.bam, verbose=verbose) 91 | df.to_csv(args.t, sep="\t", index=False) 92 | -------------------------------------------------------------------------------- /scripts/bias_explorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import numpy as np 6 | import pandas as pd 7 | from wub.vis import report 8 | import warnings 9 | with warnings.catch_warnings(): 10 | warnings.simplefilter("ignore") 11 | import seaborn as sns 12 | warnings.resetwarnings() 13 | _ = sns 14 | 15 | # Parse command line arguments: 16 | parser = argparse.ArgumentParser( 17 | description=""" 18 | Simple tool for exploring biases in transcript counts. Takes as input count files generated by bam_count_reads.py (with the -z flag) 19 | and performs linear regression of log counts against transcript length and GC content. 
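    The input is expected to contain the Count, Length and GC_content columns that
    bam_count_reads.py adds when a reference fasta is supplied with -z.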
20 | """) 21 | parser.add_argument( 22 | '-r', metavar='report_pdf', type=str, help="Report PDF (bias_explorer.pdf).", default="bias_explorer.pdf") 23 | parser.add_argument('-x', action="store_true", 24 | help="Exclude transcripts with zero counts.", default=False) 25 | parser.add_argument( 26 | 'count_file', metavar='count_file', type=str, help="Input counts file with length ang GC content features.") 27 | 28 | 29 | if __name__ == '__main__': 30 | args = parser.parse_args() 31 | 32 | data = pd.read_csv(args.count_file, sep="\t") 33 | data["logCount"] = np.log(np.array(data["Count"]) + 1.0) 34 | 35 | if args.x: 36 | data = data[data.Count > 0] 37 | 38 | plotter = report.Report(args.r) 39 | 40 | sns.jointplot("GC_content", "logCount", kind="reg", data=data) 41 | plotter.plt.tight_layout() 42 | plotter.pages.savefig() 43 | plotter.plt.clf() 44 | 45 | sns.jointplot("Length", "logCount", kind="reg", data=data) 46 | plotter.plt.tight_layout() 47 | plotter.pages.savefig() 48 | plotter.plt.clf() 49 | 50 | plotter.close() 51 | -------------------------------------------------------------------------------- /scripts/calculate_coverage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | from wub.util import misc 9 | 10 | 11 | # Parse command line arguments: 12 | parser = argparse.ArgumentParser( 13 | description='Calculate total number of bases and genome coverage if genome size is given.') 14 | parser.add_argument( 15 | '-f', metavar='format', type=str, help="Input format (fastq).", default='fastq') 16 | parser.add_argument( 17 | '-s', metavar='genome_size', type=int, help="Genome size (None).", default=None) 18 | parser.add_argument( 19 | '-p', metavar='results_pickle', type=str, help="Save pickled results in this file.", default=None) 20 | parser.add_argument('input_fastx', nargs='?', help='Input (default: stdin).', 21 | type=argparse.FileType('r'), default=sys.stdin) 22 | 23 | 24 | if __name__ == '__main__': 25 | args = parser.parse_args() 26 | 27 | in_format = args.f 28 | input_iterator = seq_util.read_seq_records( 29 | args.input_fastx, format=in_format) 30 | 31 | total_bases = 0 32 | for record in input_iterator: 33 | total_bases += len(record) 34 | results = {'total_bases': total_bases} 35 | print("Total bases\t{}".format(total_bases)) 36 | 37 | if args.s is not None: 38 | results['genome_size'] = args.s 39 | results['coverage'] = float(total_bases) / args.s 40 | print("Genome size\t{}".format(results['genome_size'])) 41 | print("Coverage\t{}".format(results['coverage'])) 42 | 43 | if args.p is not None: 44 | misc.pickle_dump(results, args.p) 45 | -------------------------------------------------------------------------------- /scripts/compare_genomes_dnadiff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import print_function 5 | import six 6 | import sys 7 | import argparse 8 | from wub.util import cmd as cmd_util 9 | from wub.wrappers import dnadiff 10 | from wub.util import misc 11 | 12 | # Parse command line arguments: 13 | parser = argparse.ArgumentParser( 14 | description="""Compare a set of reference sequences (genome) to another set (target assembly) using mummer's dnadiff. 15 | It prints the alignment results to stdout. All parsed results can be saved in a pickle file. 
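    The MUMmer tools (dnadiff, nucmer, delta-filter, show-diff, show-snps and
    show-coords) must be available on the PATH.
    Example invocation (hypothetical file names):
        compare_genomes_dnadiff.py -r report.txt reference.fas assembly.fas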
16 | """) 17 | parser.add_argument( 18 | '-p', metavar='results_pickle', type=str, help="Save pickled results in this file (None).", default=None) 19 | parser.add_argument( 20 | '-r', metavar='raw_file', type=str, help="Save dnadiff report in this file (None).", default=None) 21 | parser.add_argument( 22 | '-d', metavar='work_dir', type=str, help="Use this working directory instead of a temporary directory (None).", default=None) 23 | parser.add_argument( 24 | '-k', action="store_true", help="Keep dnadiff result files (False).", default=False) 25 | parser.add_argument( 26 | '-v', action="store_true", help="Print out dnadiff output (False).", default=False) 27 | parser.add_argument( 28 | 'ref', metavar='reference_fasta', type=str, help="Reference fasta.") 29 | parser.add_argument( 30 | 'target', metavar='target_fasta', type=str, help="Target fasta.") 31 | 32 | if __name__ == '__main__': 33 | args = parser.parse_args() 34 | 35 | cmd_util.ensure_executable('dnadiff') 36 | cmd_util.ensure_executable('delta-filter') 37 | cmd_util.ensure_executable('show-diff') 38 | cmd_util.ensure_executable('show-snps') 39 | cmd_util.ensure_executable('show-coords') 40 | cmd_util.ensure_executable('nucmer') 41 | 42 | results, raw_report, log = dnadiff.dnadiff(args.ref, args.target, args.d, not args.k) 43 | 44 | if args.v: 45 | sys.stdout.write(log) 46 | 47 | if args.r is not None: 48 | with open(args.r, 'w') as out_handle: 49 | out_handle.write(raw_report) 50 | 51 | if args.p is not None: 52 | misc.pickle_dump(results, args.p) 53 | 54 | for section, properties in six.iteritems(results['Alignments']): 55 | print(section, ":\t\tref\tquery") 56 | for name, prop in properties.iteritems(): 57 | print("\t{}\t{}\t{}".format(name, prop.ref, prop.query)) 58 | -------------------------------------------------------------------------------- /scripts/compare_genomes_lastal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | 7 | from wub.mappers import lastal 8 | from wub.util import parse 9 | from wub.util import cmd as cmd_util 10 | from wub.vis import report 11 | from wub.util import misc 12 | 13 | import warnings 14 | with warnings.catch_warnings(): 15 | warnings.simplefilter("ignore") 16 | import seaborn as sns 17 | warnings.resetwarnings() 18 | _ = sns 19 | 20 | # Parse command line arguments: 21 | parser = argparse.ArgumentParser( 22 | description="""Compare a set of reference sequences (genome) to another set (target assembly) using lastal alignment. 23 | Accuracy is the total number of matched bases divided by total alignment length. Coverage is total reference covered 24 | by alignment divided by total length of reference. 25 | 26 | Caveats: 27 | - The lastal alignments are filtered by default (use -f to disable) so only the best scoring alignment is kept per query. Hence some shorter valid 28 | alignments might be discarded causing an underestimation of coverage. 29 | - The estimated accuracy is dependent on the scoring of gaps and mismatches. By default gap open and gap extend penalties are set to equal. 30 | """) 31 | parser.add_argument( 32 | '-p', metavar='results_pickle', type=str, help="Save pickled results in this file (None).", default=None) 33 | parser.add_argument( 34 | '-l', metavar='lastal_args', type=str, help="Parameters passed to lastal in the :value,... 
format (a:1,b:1).", default="a:1,b:1") 35 | parser.add_argument( 36 | '-t', metavar='details_tsv', type=str, help="Save details of lastal alignment in this tab-separated file (None).", default=None) 37 | parser.add_argument( 38 | '-f', help="Do *not* filter for best alignment per query.", default=False, action="store_true") 39 | parser.add_argument( 40 | '-r', metavar='report_pdf', type=str, help="Report with alignment details plot (None).", default=None) 41 | parser.add_argument( 42 | 'ref', metavar='reference_fasta', type=str, help="Reference fasta.") 43 | parser.add_argument( 44 | 'target', metavar='target_fasta', type=str, help="Target fasta.") 45 | 46 | if __name__ == '__main__': 47 | args = parser.parse_args() 48 | 49 | cmd_util.ensure_executable('lastal') 50 | cmd_util.ensure_executable('lastdb') 51 | 52 | filter_alignments = not args.f 53 | lastal_args = parse.args_string_to_dict(args.l) 54 | stats = lastal.compare_genomes_lastal( 55 | args.ref, args.target, lastal_options=lastal_args, filter_alns=filter_alignments, cleanup=True) 56 | 57 | global_accuracy = (stats['aln_length'].sum() - stats['substitutions'].sum() - 58 | stats['deletions'].sum() - stats['insertions'].sum()) / float(stats['aln_length'].sum()) 59 | global_coverage = stats[ 60 | 'ref_aln_len'].sum() / float(stats['ref_len'].sum()) 61 | 62 | sys.stdout.write("Accuracy\tCoverage\n") 63 | sys.stdout.write("{}\t{}\n".format(global_accuracy, global_coverage)) 64 | 65 | if args.t is not None: 66 | stats.to_csv(args.t, sep='\t', index=False) 67 | 68 | if args.r is not None: 69 | plotter = report.Report(args.r) 70 | data = {'': (stats['coverage'], stats['accuracy'])} 71 | plotter.plot_arrays( 72 | data, title="Alignment properties", xlab='Coverage', ylab='Accuracy', legend=False) 73 | plotter.close() 74 | 75 | if args.p is not None: 76 | res_data = {'Accuracy': global_accuracy, 'Coverage': global_coverage} 77 | misc.pickle_dump(res_data, args.p) 78 | -------------------------------------------------------------------------------- /scripts/convert_alphabet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Convert between DNA and RNA alphabets.') 12 | parser.add_argument( 13 | '-i', metavar='in_format', type=str, help="Input format (fastq).", default='fastq') 14 | parser.add_argument( 15 | '-o', metavar='out_format', type=str, help="Output format (fastq).", default='fastq') 16 | parser.add_argument( 17 | '-D', action='store_true', help="RNA->DNA alphabet conversion.", default=False) 18 | parser.add_argument( 19 | '-R', action='store_true', help="DNA->RNA alphabet conversion.", default=False) 20 | parser.add_argument('input_fastx', nargs='?', help='Input file (default: stdin).', 21 | type=argparse.FileType('r'), default=sys.stdin) 22 | parser.add_argument('output_fastx', nargs='?', help='Output file (default: stdout).', 23 | type=argparse.FileType('w'), default=sys.stdout) 24 | 25 | 26 | def record_filter(input_iter, in_format, to_alphabet): 27 | """ Filter SeqRecord objects by length and mean quality. 28 | 29 | :param input_iter: Iterator of SeqRecord objects. 30 | :param in_format: Input format. 31 | :param to_alphabet: Convert to this alphabet. 32 | :returns: SeqRecord object. 
33 | :rtype: generator 34 | """ 35 | for record in input_iter: 36 | if to_alphabet == 'DNA': 37 | yield seq_util.rna_record_to_dna(record) 38 | elif to_alphabet == 'RNA': 39 | yield seq_util.dna_record_to_rna(record) 40 | else: 41 | raise Exception('Invalid alphabet type') 42 | 43 | 44 | if __name__ == '__main__': 45 | args = parser.parse_args() 46 | 47 | input_iterator = seq_util.read_seq_records( 48 | args.input_fastx, format=args.i) 49 | 50 | to_alphabet = None 51 | if args.D and args.R: 52 | sys.stderr.write("-D and -R are mutually exclusive!\n") 53 | sys.exit(1) 54 | elif not args.D and not args.R: 55 | sys.stderr.write("Either -D or -R must be specified!\n") 56 | sys.exit(1) 57 | elif args.D: 58 | to_alphabet = 'DNA' 59 | elif args.R: 60 | to_alphabet = 'RNA' 61 | 62 | output_iterator = record_filter(input_iterator, args.i, to_alphabet) 63 | 64 | seq_util.write_seq_records(output_iterator, args.output_fastx, format=args.o) 65 | -------------------------------------------------------------------------------- /scripts/correlate_counts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import six 5 | import argparse 6 | import sys 7 | import numpy as np 8 | from scipy import stats 9 | from collections import OrderedDict 10 | import pandas as pd 11 | from os import path 12 | from wub.vis import report 13 | from functools import reduce 14 | 15 | import warnings 16 | with warnings.catch_warnings(): 17 | warnings.simplefilter("ignore") 18 | import seaborn as sns 19 | warnings.resetwarnings() 20 | _ = sns 21 | 22 | # Parse command line arguments: 23 | parser = argparse.ArgumentParser( 24 | description="""Correlate counts produced by multiple runs of bam_count_reads.py.""") 25 | parser.add_argument( 26 | '-r', metavar='report_pdf', type=str, help="Report PDF (bam_multi_qc.pdf).", default="correlate_counts.pdf") 27 | parser.add_argument( 28 | '-c', metavar='corr_type', type=str, help="Correlation statistic - spearman or pearson (spearman).", default="spearman") 29 | parser.add_argument( 30 | '-L', action="store_true", help="Log transform data.", default=False) 31 | parser.add_argument( 32 | '-o', action="store_true", help="Omit lower diagonal.", default=False) 33 | parser.add_argument( 34 | 'counts', metavar='input_counts', nargs='*', type=str, help="Input counts as tab separated files.") 35 | 36 | 37 | def load_counts(counts, log_transform): 38 | """Load statistics from tsv files. 39 | 40 | :param counts: List of count files. 41 | :returns: OrderedDict of count data frames per dataset. 42 | :rtype: OrderedDict 43 | """ 44 | stats = OrderedDict() 45 | for count_file in counts: 46 | name = path.basename(count_file).rsplit('.', 1)[0] 47 | if log_transform: 48 | name = 'log(' + name + '+1)' 49 | 50 | tmp = pd.read_csv(count_file, sep="\t")[["Reference", "Count"]] 51 | tmp = tmp[tmp.Count > 0] 52 | tmp = tmp.rename(columns={"Count": name}) 53 | stats[name] = tmp 54 | if log_transform: 55 | stats[name][name] = np.log(stats[name][name] + 1) 56 | return stats 57 | 58 | 59 | def _get_reference_set(dfs): 60 | """Get list of all references.""" 61 | references = set() 62 | for df in six.itervalues(dfs): 63 | references = references.union(set(df['Reference'])) 64 | return sorted(list(references)) 65 | 66 | 67 | def join_counts(counts): 68 | """Join count data frames. 69 | :param counts: Dictionary of data frames. 70 | :returns: Merged data frame. 
71 | :rtype: DataFrame 72 | """ 73 | df_merged = reduce(lambda left, right: pd.merge(left, right, how="outer", on=["Reference"]), counts.values()) 74 | df_merged = df_merged.fillna(0.0) 75 | return df_merged 76 | 77 | 78 | def _corrfunc(x, y, **kws): 79 | """ Annotate grid with correaltion coefficient. 80 | Solution from http://stackoverflow.com/a/30942817 81 | """ 82 | if args.c == 'spearman': 83 | r, _ = stats.spearmanr(x, y) 84 | corr_type = 'Rho' 85 | elif args.c == 'pearson': 86 | r, _ = stats.pearsonr(x, y) 87 | corr_type = 'r' 88 | else: 89 | raise Exception('Invalid correlation statistic.') 90 | correlations.append(r) 91 | ax = plotter.plt.gca() 92 | ax.annotate("{} = {:.2f}".format(corr_type, r), 93 | xy=(.1, .9), xycoords=ax.transAxes) 94 | 95 | 96 | if __name__ == '__main__': 97 | args = parser.parse_args() 98 | plotter = report.Report(args.r) 99 | 100 | if len(args.counts) == 0: 101 | sys.stderr.write("No count files given!\n") 102 | sys.exit(1) 103 | 104 | counts = load_counts(args.counts, args.L) 105 | joint_df = join_counts(counts) 106 | correlations = [] 107 | 108 | # Solution from http://stackoverflow.com/a/30942817 109 | g = sns.PairGrid(joint_df, palette=["red"]) 110 | g.map_upper(plotter.plt.scatter, s=10) 111 | g.map_diag(sns.distplot, kde=False) 112 | if not args.o: 113 | g.map_lower(sns.kdeplot, cmap="Blues_d") 114 | g.map_lower(_corrfunc) 115 | g.map_upper(_corrfunc) 116 | plotter.plt.tight_layout() 117 | plotter.pages.savefig() 118 | 119 | plotter.plt.clf() 120 | correlations = pd.DataFrame( 121 | {"Distribution of correlation coefficients": correlations}) 122 | sns.boxplot(data=correlations) 123 | plotter.plt.tight_layout() 124 | plotter.pages.savefig() 125 | 126 | plotter.close() 127 | -------------------------------------------------------------------------------- /scripts/fasta_to_mock_fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq 8 | from Bio import SeqIO 9 | 10 | # Parse command line arguments: 11 | parser = argparse.ArgumentParser( 12 | description='Convert fasta file to fastq with mock qualities.') 13 | parser.add_argument( 14 | '-q', metavar='mock_quals', type=int, help="Mock quality value (40).", default=40) 15 | parser.add_argument('input_fasta', nargs='?', help='Input fasta (default: stdin).', 16 | type=argparse.FileType('r'), default=sys.stdin) 17 | parser.add_argument('output_fastq', nargs='?', help='Output fastq (default: stdout)', 18 | type=argparse.FileType('w'), default=sys.stdout) 19 | 20 | 21 | if __name__ == '__main__': 22 | args = parser.parse_args() 23 | 24 | mock_qual = args.q 25 | 26 | input_iterator = SeqIO.parse(args.input_fasta, 'fasta') 27 | output_iterator = (seq.mock_qualities(record, mock_qual) for record in input_iterator) 28 | SeqIO.write(output_iterator, args.output_fastq, 'fastq') 29 | -------------------------------------------------------------------------------- /scripts/fastq_qual_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | import pandas as pd 9 | from collections import OrderedDict 10 | 11 | # Parse command line arguments: 12 | parser = argparse.ArgumentParser( 13 | description='Generate a table of read names and mean quality values.') 14 | parser.add_argument( 15 | '-t', 
metavar='tsv', type=str, help="Output tab separated file.", default='fastq_qual_tab.tsv') 16 | parser.add_argument('input_fastq', nargs='?', help='Input fastq (default: stdin).', 17 | type=argparse.FileType('r'), default=sys.stdin) 18 | 19 | 20 | if __name__ == '__main__': 21 | args = parser.parse_args() 22 | 23 | input_iterator = seq_util.read_seq_records( 24 | args.input_fastq, format='fastq') 25 | 26 | read = [] 27 | mean_qualities = [] 28 | 29 | for record in input_iterator: 30 | read.append(record.id) 31 | mean_quality = seq_util.mean_qscore(record.letter_annotations["phred_quality"], qround=False) 32 | mean_qualities.append(mean_quality) 33 | 34 | df = pd.DataFrame(OrderedDict([('Read', read), ('MeanQual', mean_qualities)])) 35 | df = df.set_index("Read") 36 | df.to_csv(args.t, sep="\t") 37 | -------------------------------------------------------------------------------- /scripts/fastq_time_slice.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | import pandas as pd 8 | from wub.util import seq as seq_util 9 | import datetime 10 | 11 | # Parse command line arguments: 12 | parser = argparse.ArgumentParser( 13 | description="""Filter a fastq file by starting time.""") 14 | parser.add_argument( 15 | '-t', metavar='time_tsv', type=str, help="Tab separeted file produced by fastq_time_tab.py.", required=True) 16 | parser.add_argument( 17 | '-s', metavar='start_perc', type=float, help="Start of slice as percent of total time.", required=False, default=0.0) 18 | parser.add_argument( 19 | '-e', metavar='end_perc', type=float, help="End of slice as percent of total time.", required=False, default=100.0) 20 | parser.add_argument('input_fastq', nargs='?', help='Input fastq (default: stdin).', 21 | type=argparse.FileType('r'), default=sys.stdin) 22 | parser.add_argument('output_fastq', nargs='?', help='Output fastq (default: stdout)', 23 | type=argparse.FileType('w'), default=sys.stdout) 24 | 25 | 26 | def _time_slice(input_iter, start_perc, end_perc, time_df): 27 | """ Filter for fastq records falling in the specified time range. 
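    The time window is derived from the StartTime index of the table written by
    fastq_time_tab.py: start_perc and end_perc are interpreted as percentages of
    the interval between the earliest and latest start times, and only records
    whose start time (parsed from the read description) falls inside the window
    are yielded.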
""" 28 | first = time_df.index.min() 29 | last = time_df.index.max() 30 | 31 | s = first + ((last - first) * start_perc) / 100.0 32 | e = first + ((last - first) * end_perc) / 100.0 33 | 34 | for rec in input_iter: 35 | desc = rec.description.split() 36 | tmp_start = desc[4].split("=")[1] 37 | start_time = datetime.datetime.strptime(tmp_start, "%Y-%m-%dT%H:%M:%SZ") 38 | if start_time >= s and start_time <= e: 39 | yield rec 40 | 41 | 42 | if __name__ == '__main__': 43 | args = parser.parse_args() 44 | time_df = pd.read_csv(args.t, sep="\t", parse_dates=True, index_col="StartTime") 45 | 46 | input_iterator = seq_util.read_seq_records(args.input_fastq, format='fastq') 47 | 48 | output_iterator = _time_slice(input_iterator, args.s, args.e, time_df) 49 | 50 | seq_util.write_seq_records(output_iterator, args.output_fastq, format='fastq') 51 | -------------------------------------------------------------------------------- /scripts/fastq_time_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | import pandas as pd 7 | from collections import OrderedDict 8 | from wub.util import seq as seq_util 9 | import datetime 10 | 11 | # Parse command line arguments: 12 | parser = argparse.ArgumentParser( 13 | description="""Produce a tab separated file with read start times, read and channel numbers sorted by start time.""") 14 | parser.add_argument( 15 | '-t', metavar='read_tsv', type=str, default="fastq_time_tab.tsv", help="Tab separated file to save read time table.", required=False) 16 | parser.add_argument( 17 | 'fastq', metavar='fastq', type=str, help="Input fastq file.") 18 | 19 | 20 | if __name__ == '__main__': 21 | args = parser.parse_args() 22 | 23 | input_iterator = seq_util.read_seq_records(args.fastq, format='fastq') 24 | 25 | read, read_nr, channel, start, length = [], [], [], [], [] 26 | 27 | for rec in input_iterator: 28 | read.append(rec.id) 29 | length.append(len(rec.seq)) 30 | desc = rec.description.split() 31 | 32 | # Parse out read number: 33 | tmp_read_nr = int(desc[2].split("=")[1]) 34 | read_nr.append(tmp_read_nr) 35 | 36 | # Parse out channel: 37 | tmp_channel = int(desc[3].split("=")[1]) 38 | channel.append(tmp_channel) 39 | 40 | # Parse out start time: 41 | tmp_start = desc[4].split("=")[1] 42 | tmp_start = datetime.datetime.strptime(tmp_start, "%Y-%m-%dT%H:%M:%SZ") 43 | start.append(tmp_start) 44 | 45 | df = pd.DataFrame(OrderedDict([('Read', read), ('Channel', channel), 46 | ('ReadNumber', read_nr), ('StartTime', start), ("ReadLength", length)])) 47 | 48 | df.sort_values(by="StartTime", inplace=True) 49 | df = df.set_index("StartTime") 50 | 51 | df.to_csv(args.t, sep="\t") 52 | -------------------------------------------------------------------------------- /scripts/fastx_ends_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Generate a tab separated file with the first and last -n bases of the sequences.') 12 | parser.add_argument( 13 | '-i', metavar='in_format', type=str, help="Input format (fastq).", default='fastq') 14 | parser.add_argument( 15 | '-n', metavar='nr_bases', type=int, help=".", default=100) 16 | parser.add_argument('input_fastx', nargs='?', help='Input file (default: 
stdin).', 17 | type=argparse.FileType('r'), default=sys.stdin) 18 | parser.add_argument('output_tsv', nargs='?', help='Output file (default: stdout).', 19 | type=argparse.FileType('w'), default=sys.stdout) 20 | 21 | 22 | if __name__ == '__main__': 23 | args = parser.parse_args() 24 | 25 | input_iterator = seq_util.read_seq_records( 26 | args.input_fastx, format=args.i) 27 | 28 | args.output_tsv.write("Read\tStartSeq\tEndSeq\n") 29 | for rec in input_iterator: 30 | args.output_tsv.write("{}\t{}\t{}\n".format(rec.id, rec.seq[0:args.n], rec.seq[-args.n:])) 31 | 32 | args.output_tsv.flush() 33 | args.output_tsv.close() 34 | -------------------------------------------------------------------------------- /scripts/fastx_grep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Filter sequence files by read name.') 12 | parser.add_argument( 13 | '-i', metavar='in_format', type=str, help="Input format (fastq).", default='fastq') 14 | parser.add_argument( 15 | '-o', metavar='out_format', type=str, help="Output format (fastq).", default='fastq') 16 | parser.add_argument( 17 | '-n', metavar='read_names', type=str, help="Comma separated list of read names to select.", default="") 18 | parser.add_argument('input_fastx', nargs='?', help='Input file (default: stdin).', 19 | type=argparse.FileType('r'), default=sys.stdin) 20 | parser.add_argument('output_fastx', nargs='?', help='Output file (default: stdout).', 21 | type=argparse.FileType('w'), default=sys.stdout) 22 | 23 | 24 | def record_filter(input_iter, in_format, read_names): 25 | """ Filter SeqRecord objects by length and mean quality. 26 | 27 | :param input_iter: Iterator of SeqRecord objects. 28 | :param in_format: Input format. 29 | :param to_alphabet: Convert to this alphabet. 30 | :returns: SeqRecord object. 
31 | :rtype: generator 32 | """ 33 | for record in input_iter: 34 | if record.id in read_names: 35 | yield record 36 | 37 | 38 | if __name__ == '__main__': 39 | args = parser.parse_args() 40 | 41 | input_iterator = seq_util.read_seq_records( 42 | args.input_fastx, format=args.i) 43 | 44 | names = args.n.split(',') 45 | 46 | output_iterator = record_filter(input_iterator, args.i, names) 47 | 48 | seq_util.write_seq_records(output_iterator, args.output_fastx, format=args.o) 49 | -------------------------------------------------------------------------------- /scripts/fastx_length_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Generate a tab separated file with the sequence lengths in the input file.') 12 | parser.add_argument( 13 | '-i', metavar='in_format', type=str, help="Input format (fasta).", default='fasta') 14 | parser.add_argument('input_fastx', nargs='?', help='Input file (default: stdin).', 15 | type=argparse.FileType('r'), default=sys.stdin) 16 | parser.add_argument('output_tsv', nargs='?', help='Output file (default: stdout).', 17 | type=argparse.FileType('w'), default=sys.stdout) 18 | 19 | if __name__ == '__main__': 20 | args = parser.parse_args() 21 | 22 | input_iterator = seq_util.read_seq_records( 23 | args.input_fastx, format=args.i) 24 | 25 | args.output_tsv.write("Reference\tLength\n") 26 | for rec in input_iterator: 27 | args.output_tsv.write("{}\t{}\n".format(rec.id, len(rec.seq))) 28 | 29 | args.output_tsv.flush() 30 | args.output_tsv.close() 31 | -------------------------------------------------------------------------------- /scripts/length_normalise_counts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import pandas as pd 6 | import numpy as np 7 | from collections import OrderedDict 8 | from wub.util import seq as seq_util 9 | 10 | # Parse command line arguments: 11 | parser = argparse.ArgumentParser( 12 | description="""Calculate RPKM values from raw counts and a transcriptome reference.""") 13 | parser.add_argument( 14 | '-f', metavar='in_trs', type=str, help="Input transcriptome.", required=True) 15 | parser.add_argument('input_counts', nargs=1, help='Input count file.', 16 | type=str, default=None) 17 | parser.add_argument('output_count', nargs=1, help='Output RPKM file.', 18 | type=str, default=None) 19 | 20 | 21 | def _load_transcript_lengths(fasta): 22 | """ Load transcript lengths. 
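    :param fasta: Transcriptome fasta file.
    :returns: Dictionary mapping transcript identifiers to their lengths.
    :rtype: dict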
""" 23 | res = {} 24 | for record in seq_util.read_seq_records(fasta): 25 | res[record.id] = len(record.seq) 26 | return res 27 | 28 | 29 | if __name__ == '__main__': 30 | args = parser.parse_args() 31 | 32 | # Load transcript lengths: 33 | trs_lens = _load_transcript_lengths(args.f) 34 | 35 | # Load input counts: 36 | in_df = pd.read_csv(args.input_counts[0], sep="\t") 37 | 38 | # Calculate scaling factor: 39 | million_factor = np.sum(in_df["Count"]) / float(10**6) 40 | 41 | # Normalise counts: 42 | refs, rpkms = [], [] 43 | for row in in_df.itertuples(): 44 | refs.append(row.Reference) 45 | rpkms.append(row.Count / (million_factor * 46 | (trs_lens[row.Reference] / 1000.0))) 47 | 48 | out_data = OrderedDict([('Reference', refs), ('Count', rpkms)]) 49 | out_df = pd.DataFrame(out_data) 50 | out_df.to_csv(args.output_count[0], sep="\t", index=False) 51 | -------------------------------------------------------------------------------- /scripts/merge_tsvs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import pandas as pd 6 | from functools import reduce 7 | 8 | # Parse command line arguments: 9 | parser = argparse.ArgumentParser( 10 | description="""Merge tab separated files on a given field using pandas.""") 11 | parser.add_argument( 12 | '-j', metavar='join', type=str, help="Join type (outer).", default="outer") 13 | parser.add_argument( 14 | '-f', metavar='field', type=str, help="Join on this field (Read).", default="Read") 15 | parser.add_argument( 16 | '-o', metavar='out_tsv', type=str, help="Output tsv (merge_tsvs.tsv).", default="merge_tsvs.tsv") 17 | parser.add_argument( 18 | '-z', action="store_true", help="Fill NA values with zero.", default=False) 19 | parser.add_argument( 20 | 'tsvs', metavar='input_tsvs', nargs='*', type=str, help="Input tab separated files.") 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parser.parse_args() 25 | 26 | dfs = [pd.read_csv(x, sep="\t") for x in args.tsvs] 27 | 28 | df_merged = reduce(lambda left, right: pd.merge(left, right, on=args.f, how=args.j), dfs) 29 | if args.z: 30 | df_merged = df_merged.fillna(0) 31 | 32 | df_merged.to_csv(args.o, sep="\t", index=False) 33 | -------------------------------------------------------------------------------- /scripts/multi_length_hist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | import numpy as np 7 | from os import path 8 | from wub.vis import report 9 | from wub.util import seq as seq_util 10 | 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | # Parse command line arguments: 15 | parser = argparse.ArgumentParser( 16 | description="""Plot histograms of length distributions from multiple sequence files.""") 17 | parser.add_argument( 18 | '-r', metavar='report_pdf', type=str, help="Report PDF.", default="multi_length_hist.pdf") 19 | parser.add_argument( 20 | '-f', metavar='in_format', type=str, help="Input format (fastq).", default="fastq") 21 | parser.add_argument( 22 | '-b', metavar='nr_bins', type=int, help="Number of bins (50).", default=50) 23 | parser.add_argument( 24 | '-l', metavar='min_len', type=int, help="Minimum read length (None).", default=None) 25 | parser.add_argument( 26 | '-u', metavar='max_len', type=int, help="Maximum read length (None).", default=None) 27 | parser.add_argument( 28 | '-L', action="store_true", 
help="Log transform lengths.", default=False) 29 | parser.add_argument( 30 | 'in_files', metavar='input_counts', nargs='*', type=str, help="Input sequence files.") 31 | 32 | 33 | def _get_lengths(in_file, in_format, min_length, max_length, do_log): 34 | """ Iterate over input and accumulate sequence lengths. """ 35 | input_iterator = seq_util.read_seq_records(in_file, format=in_format) 36 | lengths = [] 37 | for record in input_iterator: 38 | length = len(record) 39 | # Filter for minimum read length: 40 | if (min_length is not None) and (length < min_length): 41 | continue 42 | # Filter for maximum read length: 43 | if (max_length is not None) and (length > max_length): 44 | continue 45 | if do_log: 46 | length = np.log(length) 47 | lengths.append(length) 48 | input_iterator.close() 49 | return lengths 50 | 51 | 52 | if __name__ == '__main__': 53 | args = parser.parse_args() 54 | plotter = report.Report(args.r) 55 | 56 | if len(args.in_files) == 0: 57 | sys.stderr.write("No input files given!\n") 58 | sys.exit(1) 59 | 60 | data_map = {} 61 | for in_file in args.in_files: 62 | name = path.basename(in_file).rsplit('.', 1)[0] 63 | data_map[name] = _get_lengths(in_file, args.f, args.l, args.u, args.L) 64 | 65 | if args.L: 66 | xlab = 'log(read length)' 67 | else: 68 | xlab = 'read length' 69 | 70 | plotter.plot_histograms(data_map, title='Read length distributions', xlab=xlab, ylab='Count', bins=args.b, alpha=0.7, legend_loc='best', legend=True, vlines=None) 71 | 72 | plotter.close() 73 | -------------------------------------------------------------------------------- /scripts/pickle_cat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import pprint 6 | 7 | from wub.util import misc 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description="""Pretty print the contents of a pickle file.""") 12 | parser.add_argument( 13 | 'pickle', metavar='pickle_file', type=str, help="Input pickle file.") 14 | 15 | if __name__ == '__main__': 16 | args = parser.parse_args() 17 | 18 | pprint.pprint(misc.pickle_load(args.pickle)) 19 | -------------------------------------------------------------------------------- /scripts/plot_counts_correlation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import pandas as pd 6 | import os 7 | from collections import OrderedDict 8 | from wub.vis import report 9 | from matplotlib import pyplot as plt 10 | import seaborn as sns 11 | from scipy.stats import spearmanr 12 | 13 | # Parse command line arguments: 14 | parser = argparse.ArgumentParser( 15 | description='Scatter plot of two set of counts.') 16 | parser.add_argument( 17 | '-r', metavar='report_pdf', type=str, help="Report PDF.", required=False, default="plot_counts_correlation.pdf") 18 | parser.add_argument( 19 | '-T', metavar='tags', type=str, help="Data tags: tag1,tag2.", required=False, default=None) 20 | parser.add_argument( 21 | '-t', metavar='merged_data', type=str, help="Merged data TSV.", required=False, default=None) 22 | parser.add_argument( 23 | '-o', metavar='Correlation_tsv', type=str, help="Correlation TSV.", required=False, default=None) 24 | parser.add_argument( 25 | 'counts_one', metavar='counts_one', type=str, help="Input tab separated file.") 26 | parser.add_argument( 27 | 'counts_two', metavar='counts_two', type=str, help="Input 
tab separated file.") 28 | 29 | 30 | def _create_tagged_column(df, tag): 31 | df[tag] = df["Count"] 32 | df = df.drop("Count", axis=1) 33 | return df 34 | 35 | 36 | if __name__ == '__main__': 37 | args = parser.parse_args() 38 | 39 | data_one = pd.read_csv(args.counts_one, sep="\t") 40 | data_two = pd.read_csv(args.counts_two, sep="\t") 41 | 42 | # Set data tags: 43 | tags = args.T 44 | if tags is not None: 45 | tags = args.T.split(",") 46 | else: 47 | t1 = os.path.basename(args.counts_one).rsplit(".", 1)[0] 48 | t2 = os.path.basename(args.counts_two).rsplit(".", 1)[0] 49 | tags = [t1, t2] 50 | 51 | # Set column names: 52 | data_one = _create_tagged_column(data_one, tags[0]) 53 | data_two = _create_tagged_column(data_two, tags[1]) 54 | 55 | data_merged = pd.merge(data_one, data_two, on=["Reference"], how="outer") 56 | data_merged = data_merged.fillna(0.0) 57 | 58 | plotter = report.Report(args.r) 59 | 60 | g = sns.jointplot(tags[0], tags[1], data=data_merged, stat_func=spearmanr, kind="reg") 61 | plt.tight_layout() 62 | plotter.pages.savefig() 63 | 64 | plotter.close() 65 | 66 | if args.t is not None: 67 | data_merged.to_csv(args.t, sep="\t", index=False) 68 | 69 | if args.o is not None: 70 | rho, pval = spearmanr(data_merged[tags[0]], data_merged[tags[1]]) 71 | res = pd.DataFrame(OrderedDict([("rho", [rho]), ("pval", [pval])])) 72 | res.to_csv(args.o, sep="\t", index=False) 73 | -------------------------------------------------------------------------------- /scripts/plot_qualities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | import numpy as np 7 | 8 | from wub.util import seq as seq_util 9 | from wub.vis import report 10 | 11 | import warnings 12 | with warnings.catch_warnings(): 13 | warnings.simplefilter("ignore") 14 | import seaborn as sns 15 | warnings.resetwarnings() 16 | _ = sns 17 | 18 | # Parse command line arguments: 19 | parser = argparse.ArgumentParser( 20 | description='Plot the mean quality values across non-overlapping windows in the input sequences.') 21 | parser.add_argument( 22 | '-w', metavar='win_size', type=int, help="Window size (50).", default=50) 23 | parser.add_argument( 24 | '-r', metavar='report_pdf', type=str, help="Report pdf (plot_qualities.pdf).", default='plot_qualities.pdf') 25 | parser.add_argument('input_fastx', nargs='?', help='Input (default: stdin).', 26 | type=argparse.FileType('r'), default=sys.stdin) 27 | 28 | 29 | def _smooth_qualitites(quals, winsize): 30 | """ Smooth out qualities by taking average of non-overlapping windows. 
""" 31 | smooth_quals = [] 32 | for i in range(0, len(quals) - winsize, winsize): 33 | smooth_quals.append(np.mean(quals[i:i + winsize])) 34 | smooth_quals = np.array(smooth_quals, dtype=float) 35 | return smooth_quals 36 | 37 | 38 | if __name__ == '__main__': 39 | args = parser.parse_args() 40 | 41 | input_iterator = seq_util.read_seq_records( 42 | args.input_fastx, format="fastq") 43 | 44 | plotter = report.Report(args.r) 45 | 46 | for record in input_iterator: 47 | quals = np.array(record.letter_annotations["phred_quality"]) 48 | smooth_quals = _smooth_qualitites(quals, args.w) 49 | pos = np.arange(len(smooth_quals)) 50 | data_map = {'Mean qualities': (pos, smooth_quals)} 51 | plotter.plot_arrays(data_map, marker='-', title=record.id, xlab="Window", ylab="Mean quality") 52 | 53 | plotter.close() 54 | -------------------------------------------------------------------------------- /scripts/plot_sequence_properties.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | import numpy as np 7 | 8 | from wub.util import seq as seq_util 9 | from wub.vis import report 10 | 11 | import warnings 12 | with warnings.catch_warnings(): 13 | warnings.simplefilter("ignore") 14 | import seaborn as sns 15 | warnings.resetwarnings() 16 | _ = sns 17 | 18 | # Parse command line arguments: 19 | parser = argparse.ArgumentParser( 20 | description='Plot histograms of lengths and quality values.') 21 | parser.add_argument( 22 | '-f', metavar='format', type=str, help="Input format (fastq).", default='fastq') 23 | parser.add_argument( 24 | '-b', metavar='bins', type=int, help="Number of bins on histograms (50).", default=50) 25 | parser.add_argument( 26 | '-r', metavar='report_pdf', type=str, help="Report pdf (plot_sequence_properties.pdf).", default='plot_sequence_properties.pdf') 27 | parser.add_argument( 28 | '-j', help="Produce joint plot of lengths and mean quality values (False).", default=False, action="store_true") 29 | parser.add_argument('input_fastx', nargs='?', help='Input (default: stdin).', 30 | type=argparse.FileType('r'), default=sys.stdin) 31 | 32 | 33 | if __name__ == '__main__': 34 | args = parser.parse_args() 35 | 36 | in_format = args.f 37 | input_iterator = seq_util.read_seq_records( 38 | args.input_fastx, format=in_format) 39 | 40 | # Could be more efficient with dictionaries if we did not have to 41 | # deal with the joint plot. 42 | lengths = [] 43 | mean_qualities = [] 44 | 45 | for record in input_iterator: 46 | lengths.append(len(record)) 47 | if in_format == 'fastq': 48 | mean_quality = seq_util.mean_qscore(record.letter_annotations["phred_quality"]) 49 | mean_qualities.append(mean_quality) 50 | 51 | plotter = report.Report(args.r) 52 | 53 | plotter.plot_histograms( 54 | {'lengths': lengths}, title="Distribution of sequence lengths (mean={0:.3f})".format(np.mean(lengths)), xlab="Length", ylab="Count", legend=False) 55 | 56 | if in_format == 'fastq': 57 | plotter.plot_histograms( 58 | {'qualities': mean_qualities}, title="Distribution of mean base qualities (mean={0:.3f})".format(np.mean(mean_qualities)), xlab="Mean base quality", ylab="Count", legend=False) 59 | if args.j: 60 | plotter.plot_arrays({'scatter': (lengths, mean_qualities)}, title="Sequence length vs. 
mean base quality", 61 | xlab="Sequence length", ylab="Mean base quality", legend=False) 62 | 63 | plotter.close() 64 | -------------------------------------------------------------------------------- /scripts/reads_across_time.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import pandas as pd 6 | 7 | from wub.vis import report 8 | import matplotlib.pyplot as plt 9 | import warnings 10 | with warnings.catch_warnings(): 11 | warnings.simplefilter("ignore") 12 | import seaborn as sns 13 | warnings.resetwarnings() 14 | _ = sns 15 | 16 | # Parse command line arguments: 17 | parser = argparse.ArgumentParser( 18 | description=""" 19 | Plot read and alignment properties across time. 20 | """) 21 | 22 | parser.add_argument( 23 | '-i', metavar='time_tab', type=str, help="Tab separated file generated by fastq_time_tab.py", required=True) 24 | parser.add_argument( 25 | '-a', metavar='aln_tab', type=str, help="Tab separated file generated by bam_alignment_length.py", required=True) 26 | parser.add_argument( 27 | '-w', metavar='res_freq', type=float, help="Resampling frequency in minutes.", required=False, default=5) 28 | parser.add_argument( 29 | '-r', metavar='report_pdf', type=str, help="Report PDF (reads_across_time.pdf).", default="reads_across_time.pdf") 30 | parser.add_argument( 31 | '-t', metavar='out_tsv', type=str, help="Output tsv (reads_across_time.tsv).", default="reads_across_time.tsv") 32 | 33 | 34 | if __name__ == '__main__': 35 | args = parser.parse_args() 36 | 37 | freq = str(args.w) + "T" 38 | 39 | time_tab = pd.read_csv(args.i, sep="\t", parse_dates=True) 40 | 41 | aln_tab = pd.read_csv(args.a, sep="\t") 42 | aln_tab = aln_tab.rename(columns={"read_name": "Read", "aligned_ref_bases": "AlignedRefBases", 43 | "aligned_read_bases": "AlignedReadBases", "reference": "Reference", "mapping_quality": "MappingQuality"}) 44 | aln_tab.drop("read_length", axis=1, inplace=True) 45 | 46 | tt = time_tab.copy() 47 | tt = tt.set_index("StartTime").sort_index() 48 | tt.index = pd.DatetimeIndex(tt.index) 49 | 50 | df = pd.merge(aln_tab, time_tab, how='inner', on=['Read']) 51 | df = df.set_index("StartTime").sort_index() 52 | df.index = pd.DatetimeIndex(df.index) 53 | 54 | df["AlnRatio"] = df.AlignedReadBases / df.ReadLength 55 | 56 | df.to_csv(args.t, sep="\t") 57 | 58 | plotter = report.Report(args.r) 59 | 60 | tt.ReadLength.resample(freq).mean().plot() 61 | plt.ylabel("ReadLength") 62 | plotter.plt.tight_layout() 63 | plotter.pages.savefig() 64 | plotter.plt.clf() 65 | 66 | df.ReadLength.resample(freq).mean().plot() 67 | plt.ylabel("AlignedReadLength") 68 | plotter.plt.tight_layout() 69 | plotter.pages.savefig() 70 | plotter.plt.clf() 71 | 72 | df.ReadLength.resample(freq).count().plot() 73 | plt.ylabel("ReadCount") 74 | plotter.plt.tight_layout() 75 | plotter.pages.savefig() 76 | plotter.plt.clf() 77 | 78 | df.AlignedReadBases.resample(freq).mean().plot() 79 | plt.ylabel("AlignedReadBases") 80 | plotter.plt.tight_layout() 81 | plotter.pages.savefig() 82 | plotter.plt.clf() 83 | 84 | df.AlignedRefBases.resample(freq).mean().plot() 85 | plt.ylabel("AlignedRefBases") 86 | plotter.plt.tight_layout() 87 | plotter.pages.savefig() 88 | plotter.plt.clf() 89 | 90 | df.AlnRatio.resample(freq).mean().plot() 91 | plt.ylabel("AlignedReadBases / ReadLenght") 92 | plotter.plt.tight_layout() 93 | plotter.pages.savefig() 94 | plotter.plt.clf() 95 | 96 | 
df.MappingQuality.resample(freq).mean().plot() 97 | plt.ylabel("MappingQuality") 98 | plotter.plt.tight_layout() 99 | plotter.pages.savefig() 100 | plotter.plt.clf() 101 | 102 | plotter.close() 103 | -------------------------------------------------------------------------------- /scripts/reads_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | from wub.read_stats import contig_stats as cstats 5 | from wub.util import misc 6 | 7 | 8 | def main(): 9 | 10 | savepath = args.savepath 11 | fastx = args.fastx 12 | tag = args.tag 13 | 14 | if savepath is None: 15 | savepath = os.getcwd() 16 | else: 17 | savepath = misc.mkdir(savepath) 18 | 19 | if tag is None: 20 | tag = misc.get_fname(fastx) 21 | 22 | if misc._getextension(fastx) == 'fastq': 23 | fq = True 24 | else: 25 | fq = False 26 | 27 | rawdata = cstats.GC_per_read(cstats.readfast(fastx), fq=fq) 28 | 29 | # print os.path.join(savepath, '{}_summary.stats'.format(tag)) 30 | 31 | if args.raw: 32 | rawdata.to_csv(os.path.join(savepath, '{}_raw.stats'.format(tag))) 33 | 34 | summary = cstats.get_stats(df=rawdata) 35 | summary.to_csv(os.path.join(savepath, '{}_summary.stats'.format(tag))) 36 | # print summary.round(2).to_string() 37 | 38 | if args.report: 39 | from wub.vis import report 40 | Plotter = report.Report(os.path.join(savepath, '{}.pdf'.format(tag))) 41 | 42 | rawdata = rawdata.sort_values('Seqlen', ascending=True) 43 | 44 | rawdata['cumsum'] = rawdata["Seqlen"].cumsum() 45 | rawdata['norm'] = 100.0 * rawdata['cumsum'] / rawdata['cumsum'].max() 46 | 47 | Plotter.plot_line(data=rawdata, x='Seqlen', y='norm', 48 | title='Normalized cumulative plot', xlab='length (bp)', ylab="normalized (%)",) 49 | 50 | # df1.sort_values('Seqlen', ascending=False) 51 | # df1["cumsum1"] = df1['Seqlen'].cumsum() 52 | # Plotter.plot_line(data=rawdata, x='Cumsum1', y=df1.reset_index().index, title='Ordered cumulative sum plot', xlab="contigs ordered largest to smallest", ylab='cumulative sum') 53 | 54 | Plotter.plot_scatter(data=rawdata, x='GC content (%)', y='Seqlen', title='GC content vs length plot', 55 | xlab="GC content (%)", ylab="length (bp)", alpha=0.5, ylim=0, xlim=0) 56 | if 'mean_q' in rawdata: 57 | 58 | Plotter.plot_scatter(data=rawdata, x='mean_q', y='Seqlen', title='Mean Q score vs length', 59 | xlab='Mean Q', ylab='length', alpha=0.5, xlim=rawdata['mean_q'].min() - 0.5, ylim=rawdata['Seqlen'].min() - 0.5) 60 | 61 | Plotter.close() 62 | 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser( 66 | description='Calculates the GC content and N50') 67 | 68 | parser.add_argument('--fastx', '-i', 69 | metavar='FILE', 70 | required=True, 71 | help='input file fastq or fasta') 72 | 73 | parser.add_argument('--raw', '-a', 74 | action='store_true', 75 | required=False, 76 | help='save raw the gc content per read/contig. default[False]') 77 | 78 | parser.add_argument('--savepath', '-s', 79 | metavar='DIR', 80 | required=False, 81 | default=None, 82 | help='output dir. default[cwd]') 83 | 84 | parser.add_argument('--report', '-r', 85 | # metavar="TRUE", 86 | action='store_true', 87 | required=False, 88 | default=None, 89 | help="Report PDF default[False]") 90 | 91 | parser.add_argument('--tag', '-n', 92 | metavar='STR', 93 | required=False, 94 | default=None, 95 | help='output name or tag. 
default[input name]') 96 | 97 | args = parser.parse_args() 98 | # print args 99 | 100 | main() 101 | -------------------------------------------------------------------------------- /scripts/reverse_fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Reverse (but not complement!) sequences and qualities in fastq file.') 12 | parser.add_argument('input_fastq', nargs='?', help='Input fastq (default: stdin).', 13 | type=argparse.FileType('r'), default=sys.stdin) 14 | parser.add_argument('output_fastq', nargs='?', help='Output fastq (default: stdout)', 15 | type=argparse.FileType('w'), default=sys.stdout) 16 | 17 | 18 | def reverse_seq_records(input_iterator): 19 | """Reverse SeqRecord objects. 20 | 21 | :param input_iterator: Iterator of SeqRecord objects. 22 | :returns: Generator of reversed SeqRecord objects. 23 | :rtype: generator 24 | """ 25 | for record in input_iterator: 26 | yield record[::-1] 27 | 28 | 29 | if __name__ == '__main__': 30 | args = parser.parse_args() 31 | 32 | input_iterator = seq_util.read_seq_records( 33 | args.input_fastq, format='fastq') 34 | output_iterator = reverse_seq_records(input_iterator) 35 | seq_util.write_seq_records( 36 | output_iterator, args.output_fastq, format='fastq') 37 | -------------------------------------------------------------------------------- /scripts/sequence_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Filter sequences by length and mean quality value.') 12 | parser.add_argument( 13 | '-i', metavar='in_format', type=str, help="Input format (fastq).", default='fastq') 14 | parser.add_argument( 15 | '-o', metavar='out_format', type=str, help="Output format (fastq).", default='fastq') 16 | parser.add_argument( 17 | '-q', metavar='min_qual', type=float, help="Minimum mean quality value (0.0).", default=0.0) 18 | parser.add_argument( 19 | '-l', metavar='min_length', type=int, help="Minimum length (0).", default=0) 20 | parser.add_argument( 21 | '-c', action='store_true', help="Reverse complement sequences.", default=False) 22 | parser.add_argument( 23 | '-u', metavar='max_length', type=int, help="Maximum length (None).", default=None) 24 | parser.add_argument('input_fastx', nargs='?', help='Input file (default: stdin).', 25 | type=argparse.FileType('r'), default=sys.stdin) 26 | parser.add_argument('output_fastx', nargs='?', help='Output file (default: stdout).', 27 | type=argparse.FileType('w'), default=sys.stdout) 28 | 29 | 30 | def record_filter(input_iter, in_format, min_qual, min_len, max_len, rev_comp): 31 | """ Filter SeqRecord objects by length and mean quality. 32 | 33 | :param input_iter: Iterator of SeqRecord objects. 34 | :param in_format: Input format. 35 | :param min_qual: Minimum mean quality. 36 | :param min_len: Minimum length. 37 | :param max_len: Maximum length. 38 | :param rev_comp: Reverse complement sequences if True. 39 | :returns: SeqRecord object. 
40 | :rtype: generator 41 | """ 42 | for record in input_iter: 43 | # Quality filtering: 44 | if in_format == 'fastq': 45 | mean_quality = seq_util.mean_qscore(record.letter_annotations["phred_quality"]) 46 | if mean_quality < min_qual: 47 | continue 48 | # Length filtering: 49 | if len(record) < min_len: 50 | continue 51 | if max_len is not None and len(record) > max_len: 52 | continue 53 | if rev_comp: 54 | record = record.reverse_complement() 55 | yield record 56 | 57 | 58 | if __name__ == '__main__': 59 | args = parser.parse_args() 60 | 61 | if args.i == 'fasta' and args.o == 'fastq': 62 | sys.stderr.write( 63 | "Cannot produce fastq output from fasta! Use fasta_to_mock_fastq.py instead.\n") 64 | sys.exit(1) 65 | 66 | input_iterator = seq_util.read_seq_records( 67 | args.input_fastx, format=args.i) 68 | 69 | output_iterator = record_filter(input_iterator, args.i, args.q, args.l, args.u, args.c) 70 | 71 | seq_util.write_seq_records(output_iterator, args.output_fastx, format=args.o) 72 | -------------------------------------------------------------------------------- /scripts/sequence_subtract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Filter out sequences present in the first file from the second file.') 12 | parser.add_argument( 13 | '-i', metavar='in_format', type=str, help="Input format (fastq).", default='fastq') 14 | parser.add_argument( 15 | '-o', metavar='out_format', type=str, help="Output format (fastq).", default='fastq') 16 | parser.add_argument('input_fastx_bait', nargs='?', help='First input file (default: stdin).', 17 | type=argparse.FileType('r'), default=sys.stdin) 18 | parser.add_argument('input_fastx_target', nargs='?', help='Second input file.', 19 | type=argparse.FileType('r'), default=sys.stdin) 20 | parser.add_argument('output_fastx', nargs='?', help='Output file (default: stdout).', 21 | type=argparse.FileType('w'), default=sys.stdout) 22 | 23 | 24 | def _record_filter(input_iter_bait, input_iter_target): 25 | """ Filter out SeqRecord objects present in the first iterator. """ 26 | bait_ids = [read.id for read in input_iter_bait] 27 | for record in input_iter_target: 28 | if record.id not in bait_ids: 29 | yield record 30 | 31 | 32 | if __name__ == '__main__': 33 | args = parser.parse_args() 34 | 35 | input_iterator_bait = seq_util.read_seq_records( 36 | args.input_fastx_bait, format=args.i) 37 | 38 | input_iterator_target = seq_util.read_seq_records( 39 | args.input_fastx_target, format=args.i) 40 | 41 | output_iterator = _record_filter(input_iterator_bait, input_iterator_target) 42 | 43 | seq_util.write_seq_records(output_iterator, args.output_fastx, format=args.o) 44 | -------------------------------------------------------------------------------- /scripts/simulate_errors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | import numpy as np 8 | from Bio.Seq import Seq 9 | 10 | from wub.simulate import seq as sim_seq 11 | from wub.util import parse as parse_util 12 | from wub.util import seq as seq_util 13 | 14 | # Parse command line arguments: 15 | parser = argparse.ArgumentParser( 16 | description="""Simulate sequencing errors for each input sequence. 
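    Example invocation (file names are illustrative):

        simulate_errors.py -e 0.1 -w 1,1,4 -z 42 reads.fasta reads_with_errors.fasta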
17 | """) 18 | parser.add_argument('-e', metavar='error_rate', type=float, 19 | help="Total rate of substitutions insertions and deletions (0.1).", default=0.1) 20 | parser.add_argument('-w', metavar='error_weights', type=str, 21 | help="Relative frequency of substitutions,insertions,deletions (1,1,4).", default="1,1,4") 22 | parser.add_argument('-z', metavar='random_seed', type=int, 23 | help="Random seed (None).", default=None) 24 | parser.add_argument('input_fasta', nargs='?', help='Input fasta (default: stdin).', 25 | type=argparse.FileType('r'), default=sys.stdin) 26 | parser.add_argument('output_fasta', nargs='?', help='Output fasta (default: stdout)', 27 | type=argparse.FileType('w'), default=sys.stdout) 28 | 29 | 30 | def simulate_errors(input_iter, error_rate, error_weights): 31 | """Simulate sequencing errors for each SeqRecord object in the input iterator. 32 | 33 | :param input_iter: Iterator of SeqRecord objects. 34 | :para error_rate: Total error rate of substitutions, insertions and deletions. 35 | :param error_weights: Relative frequency of substitutions,insertions,deletions. 36 | :returns: Generator of SeqRecord objects. 37 | :rtype: generator 38 | """ 39 | for record in input_iter: 40 | mutated_seq = sim_seq.simulate_sequencing_errors(record.seq, error_rate, error_weights).seq 41 | record.seq = Seq(mutated_seq) 42 | yield record 43 | 44 | 45 | if __name__ == '__main__': 46 | args = parser.parse_args() 47 | 48 | # Set random seed: 49 | if args.z is not None: 50 | np.random.seed(args.z) 51 | 52 | # Process error weights: 53 | error_weights = np.array(parse_util.separated_list_to_floats(args.w)) 54 | # Normalise error weights to probabilities: 55 | error_weights = parse_util.normalise_array(error_weights) 56 | error_weights = dict( 57 | zip(['substitution', 'insertion', 'deletion'], error_weights)) 58 | 59 | input_iterator = seq_util.read_seq_records(args.input_fasta, format='fasta') 60 | 61 | simulation_iterator = simulate_errors(input_iterator, args.e, error_weights) 62 | 63 | seq_util.write_seq_records( 64 | simulation_iterator, args.output_fasta, format='fasta') 65 | -------------------------------------------------------------------------------- /scripts/simulate_genome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | import numpy as np 8 | 9 | from wub.simulate import genome as sim_genome 10 | from wub.util import parse as parse_util 11 | from wub.util import seq as seq_util 12 | 13 | # Parse command line arguments: 14 | parser = argparse.ArgumentParser( 15 | description="""Simulate genome sequence with the specified number of chromosomes, 16 | length distribution (truncated gamma) and base composition.""") 17 | parser.add_argument( 18 | '-n', metavar='nr_chrom', type=int, help="Number of chromosomes (23).", default=23) 19 | parser.add_argument('-m', metavar='mean_length', type=int, 20 | help="Mean length of chromosomes (5000000).", default=5000000) 21 | parser.add_argument( 22 | '-a', metavar='gamma_shape', type=float, help="Gamma shape parameter (1).", default=1.0) 23 | parser.add_argument( 24 | '-l', metavar='low_trunc', type=int, help="Lower truncation point (None).", default=None) 25 | parser.add_argument( 26 | '-u', metavar='high_trunc', type=int, help="Upper truncation point (None).", default=None) 27 | parser.add_argument('-b', metavar='base_freqs', type=str, 28 | help="Relative base frequencies in A,C,G,T order (1,1,1,1) or 
\"random\".", default="1,1,1,1") 29 | parser.add_argument('-z', metavar='random_seed', type=int, 30 | help="Random seed (None).", default=None) 31 | parser.add_argument('output_fasta', nargs='?', help='Output fasta (default: stdout)', 32 | type=argparse.FileType('w'), default=sys.stdout) 33 | 34 | 35 | if __name__ == '__main__': 36 | args = parser.parse_args() 37 | 38 | # Set random seed: 39 | if args.z is not None: 40 | np.random.seed(args.z) 41 | 42 | if args.b == "random": 43 | base_frequencies = np.random.uniform(size=4) 44 | base_frequencies = base_frequencies / np.sum(base_frequencies) 45 | else: 46 | base_frequencies = np.array(parse_util.separated_list_to_floats(args.b)) 47 | # Normalise relative base frequencies to probabilities: 48 | base_frequencies = parse_util.normalise_array(base_frequencies) 49 | 50 | simulation_iterator = sim_genome.simulate_genome( 51 | args.n, args.m, args.a, args.l, args.u, base_frequencies) 52 | seq_util.write_seq_records(simulation_iterator, args.output_fasta, format='fasta') 53 | -------------------------------------------------------------------------------- /scripts/simulate_sequences.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | import numpy as np 8 | 9 | from wub.simulate import seq as sim_seq 10 | from wub.util import parse as parse_util 11 | from wub.util import seq as seq_util 12 | 13 | # Parse command line arguments: 14 | parser = argparse.ArgumentParser( 15 | description="""Simulate sequences of fixed length and specified base composition.""") 16 | parser.add_argument( 17 | '-n', metavar='nr_seq', type=int, help="Number of sequences (1).", default=1) 18 | parser.add_argument('-m', metavar='length', type=int, 19 | help="Length of simulated sequences (3000).", default=3000) 20 | parser.add_argument('-b', metavar='base_freqs', type=str, 21 | help="Relative base frequencies in A,C,G,T order (1,1,1,1).", default="1,1,1,1") 22 | parser.add_argument('-z', metavar='random_seed', type=int, 23 | help="Random seed (None).", default=None) 24 | parser.add_argument('output_fasta', nargs='?', help='Output fasta (default: stdout)', 25 | type=argparse.FileType('w'), default=sys.stdout) 26 | 27 | 28 | if __name__ == '__main__': 29 | args = parser.parse_args() 30 | 31 | # Set random seed: 32 | if args.z is not None: 33 | np.random.seed(args.z) 34 | 35 | base_frequencies = np.array(parse_util.separated_list_to_floats(args.b)) 36 | # Normalise relative base frequencies to probabilities: 37 | base_frequencies = parse_util.normalise_array(base_frequencies) 38 | 39 | simulation_iterator = (seq_util.new_dna_record(sim_seq.simulate_sequence(args.m, base_frequencies), "seq_{}".format(i)) for i in range(args.n)) 40 | 41 | seq_util.write_seq_records(simulation_iterator, args.output_fasta, format='fasta') 42 | -------------------------------------------------------------------------------- /scripts/split_fastx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import six 5 | import argparse 6 | import sys 7 | from os import path 8 | 9 | from wub.util import seq as seq_util 10 | 11 | # Parse command line arguments: 12 | parser = argparse.ArgumentParser( 13 | description='Split sequence records in file to one record per file or batches of records.') 14 | parser.add_argument( 15 | '-i', metavar='in_format', type=str, help="Input format 
(fastq).", default='fastq') 16 | parser.add_argument( 17 | '-o', metavar='out_format', type=str, help="Output format (fastq).", default='fastq') 18 | parser.add_argument( 19 | '-b', metavar='batch_size', type=int, help="Batch size (None).", default=None) 20 | parser.add_argument('input_fastx', nargs='?', help='Input file (default: stdin).', 21 | type=argparse.FileType('r'), default=sys.stdin) 22 | parser.add_argument('output_dir', nargs='?', help='Output directory (default: .)', default='.') 23 | 24 | 25 | def batch_iterator(iterator, batch_size): 26 | """Returns lists of length batch_size. 27 | Taken from the biopython wiki: http://biopython.org/wiki/Split_large_file 28 | 29 | This is a generator function, and it returns lists of the 30 | entries from the supplied iterator. Each list will have 31 | batch_size entries, although the final list may be shorter. 32 | 33 | :param iterator: Input iterator. 34 | :param batch_size: Batch size. 35 | :returns: Generator of lists. 36 | :rtype: generator 37 | """ 38 | entry = True # Make sure we loop once 39 | while entry: 40 | batch = [] 41 | while len(batch) < batch_size: 42 | try: 43 | entry = six.next(iterator) 44 | except StopIteration: 45 | entry = None 46 | if entry is None: 47 | # End of file 48 | break 49 | batch.append(entry) 50 | if batch: 51 | yield batch 52 | 53 | 54 | if __name__ == '__main__': 55 | args = parser.parse_args() 56 | 57 | input_iterator = seq_util.read_seq_records( 58 | args.input_fastx, format=args.i) 59 | 60 | if args.b is None: 61 | # Splitting one record per file: 62 | for record in input_iterator: 63 | bn = path.basename(args.input_fastx.name) 64 | ext = bn.rsplit('.', 1)[-1] 65 | fh = open(path.join(args.output_dir, "{}.{}".format(record.id, ext)), 'w') 66 | seq_util.write_seq_records([record], fh, format=args.o) 67 | fh.flush() 68 | fh.close() 69 | else: 70 | # Split into batches: 71 | input_iterator = batch_iterator(input_iterator, args.b) 72 | i = 0 73 | for records in input_iterator: 74 | bn = path.basename(args.input_fastx.name) 75 | fh = open(path.join(args.output_dir, "batch_{}_{}".format(i, bn)), 'w') 76 | seq_util.write_seq_records(records, fh, format=args.o) 77 | fh.flush() 78 | fh.close() 79 | i += 1 80 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.5.1 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:wub/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | 20 | [aliases] 21 | test = pytest 22 | 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup, find_packages 5 | from glob import glob 6 | 7 | with open('README.md') as readme_file: 8 | readme = readme_file.read() 9 | 10 | requirements = [ 11 | 'six', 12 | 'pytest', 13 | 'pycmd', 14 | 'biopython', 15 | 'numpy', 16 | 'matplotlib', 17 | 'seaborn', 18 | 'editdistance', 19 | 'pandas>=0.20.2', 20 | 'pysam', 21 | 'tqdm', 22 | 'statsmodels' 23 | ] 24 | 25 | test_requirements = [ 26 | 'pytest', 27 | 'pycmd', 28 | 
'editdistance', 29 | 'numpy', 30 | ] 31 | 32 | setup( 33 | name='Wub', 34 | version='0.5.1', 35 | description="Tools and software components developed by the ONT Applications group.", 36 | long_description=readme, 37 | author="ONT Applications Group", 38 | author_email='Apps@nanoporetech.com', 39 | url='', 40 | packages=find_packages(exclude=["scripts"]), 41 | package_dir={'wub': 42 | 'wub'}, 43 | include_package_data=True, 44 | install_requires=requirements, 45 | zip_safe=False, 46 | keywords='wub', 47 | classifiers=[ 48 | 'Development Status :: 2 - Pre-Alpha', 49 | 'Intended Audience :: Developers', 50 | 'Natural Language :: English', 51 | "Programming Language :: Python :: 2", 52 | 'Programming Language :: Python :: 2.6', 53 | 'Programming Language :: Python :: 2.7', 54 | 'Programming Language :: Python :: 3.4', 55 | ], 56 | tests_require=test_requirements, 57 | scripts=[x for x in glob('scripts/*.py') if x != 'scripts/__init__.py'] 58 | ) 59 | -------------------------------------------------------------------------------- /wub/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'ONT Applications Group' 4 | __email__ = 'Apps@nanoporetech.com' 5 | __version__ = '0.5.1' 6 | -------------------------------------------------------------------------------- /wub/bam/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/bam/__init__.py -------------------------------------------------------------------------------- /wub/bam/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pysam 4 | 5 | 6 | def pysam_open(alignment_file, in_format='BAM'): 7 | """Open SAM/BAM file using pysam. 8 | 9 | :param alignment_file: Input file. 10 | :param in_format: Format (SAM or BAM). 11 | :returns: pysam.AlignmentFile 12 | :rtype: pysam.AlignmentFile 13 | """ 14 | if in_format == 'BAM': 15 | mode = "rb" 16 | elif in_format == 'SAM': 17 | mode = "r" 18 | else: 19 | raise Exception("Invalid format: {}".format(in_format)) 20 | 21 | aln_iter = pysam.AlignmentFile(alignment_file, mode) 22 | return aln_iter 23 | -------------------------------------------------------------------------------- /wub/bam/filter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Filter SAM/BAM records by various criteria.""" 3 | 4 | import itertools 5 | 6 | 7 | def get_alignment_score(segement): 8 | """Get alignment score from pysam segment. 9 | 10 | :param segment: Pysam aligned segment. 11 | :returns: Alignment score. 12 | :rtype: int 13 | """ 14 | 15 | score = 0 16 | try: 17 | score = segement.get_tag('AS') 18 | except: 19 | pass 20 | return score 21 | 22 | 23 | def filter_top_per_query(records_iter): 24 | """Filter pysam records keeping top scoring per query. Assumes 25 | records are sorted by name. 26 | 27 | :param records_iter: Iterator of pysam aligned segments. 28 | :returns: Generator of filtered records. 
29 | :rtype: generator 30 | """ 31 | buff = [] 32 | for rec in itertools.chain(records_iter, [None]): 33 | if len(buff) == 0: 34 | buff.append(rec) 35 | elif rec is None or buff[-1].query_name != rec.query_name: 36 | sorted_buff = sorted(buff, key=get_alignment_score, reverse=True) 37 | buff = [rec] 38 | yield sorted_buff[0] 39 | else: 40 | buff.append(rec) 41 | 42 | 43 | def filter_query_coverage(records_iter, minimum_coverage): 44 | """Filter pysam records keeping the ones with sufficient query coverage. 45 | 46 | :param records_iter: Iterator of pysam aligned segments. 47 | :param minimum_coverage: Minimum fraction of covered query. 48 | :returns: Generator of filtered records. 49 | :rtype: generator 50 | """ 51 | for rec in records_iter: 52 | if rec.is_unmapped: 53 | yield rec 54 | elif (float(rec.query_alignment_length) / rec.infer_query_length()) >= minimum_coverage: 55 | yield rec 56 | 57 | 58 | def filter_ref_coverage(records_iter, minimum_coverage, header): 59 | """Filter pysam records keeping the ones with sufficient reference coverage. 60 | 61 | :param records_iter: Iterator of pysam aligned segments. 62 | :param minimum_coverage: Minimum fraction of covered reference. 63 | :param header: SAM header with reference lengths. 64 | :returns: Generator of filtered records. 65 | :rtype: generator 66 | """ 67 | ref_lengths = dict((h['SN'], int(h['LN'])) for h in header['SQ']) 68 | for rec in records_iter: 69 | if rec.is_unmapped: 70 | yield rec 71 | elif (float(rec.query_alignment_length) / ref_lengths[rec.reference_name]) >= minimum_coverage: 72 | yield rec 73 | -------------------------------------------------------------------------------- /wub/bam/read_counter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Count reads per reference in BAM/SAM file.""" 3 | import sys 4 | 5 | import pysam 6 | import numpy as np 7 | from collections import defaultdict 8 | from wub.util import seq as seq_util 9 | import tqdm 10 | 11 | 12 | def count_reads(alignment_file, in_format='BAM', min_aln_qual=0, verbose=False, reads_gc=False): 13 | """Count reads mapping to references in a BAM file. 14 | 15 | :param alignment_file: BAM file. 16 | :param min_aln_qual: Minimum mapping quality. 17 | :param verbose: Verbose if True. 18 | :param read_gc: Calculate mean GC content of reads for each reference. 19 | :returns: Dictionary with read counts per reference and read GC contents. 
20 | :rtype: tuple of dicts 21 | """ 22 | counts = defaultdict(int) 23 | gc_means = defaultdict(list) 24 | if in_format == 'BAM': 25 | mode = "rb" 26 | elif in_format == 'SAM': 27 | mode = "r" 28 | else: 29 | raise Exception("Invalid format: {}".format(in_format)) 30 | 31 | aln_iter = pysam.AlignmentFile(alignment_file, mode) 32 | 33 | if verbose and in_format == "BAM": 34 | try: 35 | total_reads = aln_iter.mapped + aln_iter.unmapped 36 | except: 37 | total_reads = None 38 | sys.stdout.write( 39 | "Gathering read statistics from file: {}\n".format(alignment_file)) 40 | if in_format == "BAM": 41 | aln_iter = tqdm.tqdm(aln_iter, total=total_reads) 42 | 43 | for segment in aln_iter: 44 | if segment.is_unmapped: 45 | continue 46 | if segment.mapping_quality >= min_aln_qual: 47 | counts[segment.reference_name] += 1 48 | if reads_gc: 49 | gc_means[segment.reference_name].append(seq_util.gc_content(segment.query_alignment_sequence)) 50 | 51 | gc_cont = {} 52 | if reads_gc: 53 | # Calculate mean of mean GC contents: 54 | for trs, gc_ms in gc_means.items(): 55 | gc_cont[trs] = np.mean(gc_ms) 56 | aln_iter.close() 57 | 58 | return dict(counts), gc_cont 59 | 60 | 61 | def count_reads_realtime(alignment_file='-', in_format='SAM', min_aln_qual=0, yield_freq=1, verbose=False): 62 | """Online counting of reads mapping to references in a SAM/BAM stream from stdin. 63 | 64 | :param alignment_file: BAM file (stdin). 65 | :param min_aln_qual: Minimum mapping quality. 66 | :param yield_freq: Yield frequency. 67 | :param verbose: Minimum mapping quality. 68 | :returns: Generator of dictionary with read counts per reference. 69 | :rtype: generator 70 | """ 71 | counts = defaultdict(int) 72 | if in_format == 'BAM': 73 | mode = "rb" 74 | elif in_format == 'SAM': 75 | mode = "r" 76 | else: 77 | raise Exception("Invalid format: {}".format(in_format)) 78 | 79 | aln_iter = pysam.AlignmentFile(alignment_file, mode) 80 | 81 | if verbose: 82 | sys.stdout.write( 83 | "Online counting of read statistics from file: {}\n".format(alignment_file)) 84 | aln_iter = iter(tqdm.tqdm(aln_iter)) 85 | 86 | nr_mapped = 0 87 | while True: 88 | try: 89 | segment = aln_iter.next() 90 | except StopIteration: 91 | # Final yield: 92 | yield counts 93 | return 94 | 95 | if segment.is_unmapped: 96 | continue 97 | if segment.mapping_quality >= min_aln_qual: 98 | counts[segment.reference_name] += 1 99 | nr_mapped += 1 100 | 101 | if nr_mapped % yield_freq == 0: 102 | yield counts 103 | 104 | aln_iter.close() 105 | -------------------------------------------------------------------------------- /wub/bam/sam_writer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import six 4 | from collections import OrderedDict 5 | 6 | 7 | class SamWriter: 8 | 9 | """ Simple class to write SAM files. 
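    Minimal usage sketch (file name, header and record values are illustrative):

        header = {'SQ': [{'SN': 'chr1', 'LN': 1000}]}
        writer = SamWriter('out.sam', header=header)
        rec = writer.new_sam_record('read_1', 0, 'chr1', 1, 60, '10M', '*', 0, 0,
                                    'ACGTACGTAC', 'IIIIIIIIII', 'NM:i:0')
        writer.write(rec)
        writer.close()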
""" 10 | 11 | def __init__(self, out_file, header=None): 12 | """ Initialise SAM writer object """ 13 | self.out_file = out_file 14 | self.header = header 15 | self.out_handler = open(out_file, 'w') 16 | if header is not None: 17 | self._write_header() 18 | 19 | def _write_header(self): 20 | """Write SAM header.""" 21 | for record_type, records in six.iteritems(self.header): 22 | for record in records: 23 | self.out_handler.write("@{}".format(record_type)) 24 | for key, value in six.iteritems(record): 25 | self.out_handler.write("\t{}:{}".format(key, value)) 26 | self.out_handler.write("\n") 27 | 28 | def new_sam_record(self, qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual, tags): 29 | """Create new SAM record structure. 30 | 31 | :param self: object 32 | :param qname: Read name. 33 | :param rname: Reference name. 34 | :param pos: Position in reference. 35 | :param mapq: Mapping quality. 36 | :param cigar: CIGAR string. 37 | :param rnext: Reference of next read. 38 | :param pnext: Position of next read. 39 | :param tlen: Template length. 40 | :param seq: Read sequence. 41 | :param qual: Base qualities. 42 | :param tags: Optional tags. 43 | :returns: SAM record. 44 | :rtype: OrderedDict 45 | """ 46 | record = OrderedDict() 47 | 48 | record['QNAME'] = qname 49 | record['FLAG'] = flag 50 | record['RNAME'] = rname 51 | record['POS'] = pos 52 | record['MAPQ'] = mapq 53 | record['CIGAR'] = cigar 54 | record['RNEXT'] = rnext 55 | record['PNEXT'] = pnext 56 | record['TLEN'] = tlen 57 | record['SEQ'] = seq 58 | record['QUAL'] = qual 59 | record['TAGS'] = tags 60 | 61 | return record 62 | 63 | def write(self, record): 64 | """Write SAM record to file. 65 | 66 | :param self: object 67 | :param record: SAM record. 68 | :returns: None 69 | :rtype: object 70 | """ 71 | self.out_handler.write("{}\n".format("\t".join(map(lambda x: str(x), six.itervalues(record))))) 72 | 73 | def close(self): 74 | """Close SAM file. 75 | 76 | :param self: object 77 | :returns: None 78 | :rtype: object 79 | """ 80 | self.out_handler.flush() 81 | self.out_handler.close() 82 | -------------------------------------------------------------------------------- /wub/mappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/mappers/__init__.py -------------------------------------------------------------------------------- /wub/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/parsers/__init__.py -------------------------------------------------------------------------------- /wub/parsers/blastn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ Parser functions for blastn outfmt 6. """ 3 | 4 | 5 | def _parse_coord_line(line): 6 | """ Parse a line from a blast outfmt 6 file. 
""" 7 | fields = line.split() 8 | aln_record = { 9 | 'query': fields[0], 10 | 'ref': fields[1], 11 | 'identity': float(fields[2]), 12 | 'aln_length': int(fields[3]), 13 | 'mismatch': int(fields[4]), 14 | 'gapopen': int(fields[5]), 15 | 'query_start': int(fields[6]), 16 | 'query_end': int(fields[7]), 17 | 'ref_start': int(fields[8]), 18 | 'ref_end': int(fields[9]), 19 | 'evalue': float(fields[10]), 20 | 'bitscore': float(fields[11]), 21 | 'strand': '+' 22 | } 23 | 24 | if aln_record['ref_start'] > aln_record['ref_end']: 25 | aln_record['strand'] = '-' 26 | aln_record['ref_start'], aln_record['ref_end'] = aln_record[ 27 | 'ref_end'], aln_record['ref_start'] 28 | return aln_record 29 | 30 | 31 | def parse_coords(input_object): 32 | """ Parse coordinates file produced by blastn outfmt 6. 33 | 34 | :param input_object: Input path or file hanlder. 35 | :returns: List of dictionaries with parsed records. 36 | :rtype: list 37 | """ 38 | if type(input_object) == str: 39 | input_object = open(input_object, 'r') 40 | records = [] 41 | for line in input_object: 42 | line = line.strip() 43 | records.append(_parse_coord_line(line)) 44 | return records 45 | -------------------------------------------------------------------------------- /wub/parsers/mummer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ Parser functions for mummer. """ 3 | 4 | 5 | def _parse_coord_line(line): 6 | """ Parse a line from a mummer coordinate file. """ 7 | fields = line.replace("|", "").split() 8 | aln_record = { 9 | 'ref_start': int(fields[0]), 10 | 'ref_end': int(fields[1]), 11 | 'query_start': int(fields[2]), 12 | 'query_end': int(fields[3]), 13 | 'ref_len': int(fields[4]), 14 | 'query_len': int(fields[5]), 15 | 'identity': float(fields[6]), 16 | 'ref': fields[7], 17 | 'query': fields[8], 18 | } 19 | return aln_record 20 | 21 | 22 | def parse_coords(input_object): 23 | """ Parse coordinates file produced by mummer. 24 | 25 | :param input_object: Input path or file hanlder. 26 | :returns: List of dictionaries with parsed records. 27 | :rtype: list 28 | """ 29 | if type(input_object) == str: 30 | input_object = open(input_object, 'r') 31 | records = [] 32 | for line in input_object: 33 | line = line.strip() 34 | if line.count('/') > 0: 35 | continue 36 | if line.count('NUCMER') > 0: 37 | continue 38 | if line.count('[') > 0: 39 | continue 40 | if line.count('=') > 0: 41 | continue 42 | if len(line) == 0: 43 | continue 44 | records.append(_parse_coord_line(line)) 45 | return records 46 | -------------------------------------------------------------------------------- /wub/read_stats/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/read_stats/__init__.py -------------------------------------------------------------------------------- /wub/read_stats/contig_stats.py: -------------------------------------------------------------------------------- 1 | from Bio import SeqIO 2 | import pandas as pd 3 | from wub.util.misc import _getextension 4 | from wub.util.seq import mean_qscore 5 | 6 | 7 | def readfast(fast): 8 | """ reads a fasta or fastq file. 
9 | 10 | :param fast: fastq or fasta 11 | :return: list of records with attr 12 | :rtype: generator object 13 | 14 | """ 15 | 16 | extension = _getextension(fast) 17 | for rec in SeqIO.parse(open(fast), extension): 18 | 19 | yield rec 20 | 21 | 22 | def _cumsum(df, col): 23 | ''' 24 | Calculates the cumulative sum of column 25 | 26 | :param df: dataframe with sequence length 27 | :param col: identify the sequence length 28 | :return: dataframe with cumulative sum of length 29 | :rtype: dataframe 30 | ''' 31 | df = df.sort_values(by=col, ascending=False).reset_index(drop=True) 32 | df['cumsum'] = df[col].cumsum() 33 | return df 34 | 35 | 36 | def N50(df, col, percent=50): 37 | """ Calculate the N50 by default however, by changing percent to 75, N75 can be calculated. 38 | 39 | :param df: dataframe with seqlen column 40 | :param col: column with sequence length 41 | :param percent: percentage to be calculated 42 | :return: N50 Value 43 | :rtype: int 44 | 45 | """ 46 | df1 = _cumsum(df, col) 47 | df1['cumsum'] = df1[col].cumsum() 48 | n50 = df1['cumsum'].max() * percent / 100 49 | return df1.where(df1['cumsum'] >= n50)[col].dropna().head(1).tolist()[0] 50 | 51 | 52 | def L50(df, col, percent=50): 53 | """ Calculate the L50 by default however, by changing percent to 75, N75 can be calculated 54 | 55 | :param df: dataframe with seqlen column 56 | :param col: column with sequence length 57 | :param percent: percentage to be calculated 58 | :return: N50 Value 59 | :rtype: int 60 | 61 | """ 62 | 63 | df1 = _cumsum(df, col).copy() 64 | return df1[df1 >= N50(df, col, percent)][col].count() 65 | 66 | 67 | def GC_per_read(seq_rec, fq=False): 68 | """ Calculates the number of bases per sequence, GC content and mean Q score if fastq is given 69 | 70 | :param seq_rec: sequence records with attr from biopython 71 | :param fq: boolean 72 | :return: dataframe 73 | :rtype: dataframe 74 | """ 75 | 76 | d = [] 77 | 78 | bases = ["A", "T", "C", "G", 'N'] 79 | # total_lengths = 0 80 | for rec in seq_rec: 81 | tmp = {"SeqID": rec.id, "Seqlen": len(rec.seq), "A": 0, "T": 0, "G": 0, "C": 0, "N": 0} 82 | for base in bases: 83 | tmp[base] += rec.seq.count(base) 84 | 85 | if fq: 86 | tmp['mean_q'] = round(mean_qscore(rec.letter_annotations[ 87 | "phred_quality"], qround=False), 2) 88 | 89 | d.append(tmp) 90 | 91 | raw = pd.DataFrame(d).set_index('SeqID') 92 | raw['GC content (%)'] = raw.apply(lambda x: float( 93 | (x['G']) + x['C']) / x['Seqlen'] * 100.0, axis=1) 94 | 95 | for base in bases: 96 | raw[base + ' (%)'] = (raw[base] / raw["Seqlen"]) * 100.0 97 | raw["other base"] = raw['Seqlen'] - raw[bases].sum(axis=1) 98 | return raw 99 | 100 | 101 | def get_stats(df): 102 | """ Calcualtes the summary stats 103 | 104 | :param df: dataframe from GC_per_read 105 | :return: summary Series 106 | :rtype: Series 107 | 108 | """ 109 | stats = pd.Series({}) 110 | df = df.copy() 111 | Mbase = 1000000.0 112 | 113 | bases = ["A", "T", "C", "G", 'N'] 114 | 115 | total_len = int(df["Seqlen"].sum()) 116 | total_bases = df[bases].sum().sum() 117 | 118 | stats['N75'] = N50(df, 'Seqlen', 75) 119 | stats['N50'] = N50(df, 'Seqlen', 50) 120 | stats['N25'] = N50(df, 'Seqlen', 25) 121 | 122 | stats['L75'] = L50(df, "Seqlen", 75) 123 | stats['L50'] = L50(df, "Seqlen", 50) 124 | stats['L25'] = L50(df, "Seqlen", 25) 125 | 126 | stats['Max contig'] = df['Seqlen'].max() 127 | stats['Min contig'] = df['Seqlen'].min() 128 | stats['Avg length'] = df['Seqlen'].mean() 129 | stats['Length SD'] = df['Seqlen'].std() 130 | stats['Total length (Mb)'] = 
total_len / Mbase 131 | 132 | stats['Total bases (Mb)'] = total_len / Mbase 133 | stats['Other bases (Mb)'] = (total_len - total_bases) / Mbase 134 | stats['No. contigs'] = df['Seqlen'].count() 135 | 136 | stats["Greater then 10 Kb"] = df[df['Seqlen'] >= 10000.0].Seqlen.count() 137 | stats["Greater then 100 Kb"] = df[df['Seqlen'] >= 100000.0].Seqlen.count() 138 | stats["Greater then 500 Kb"] = df[df['Seqlen'] >= 500000.0].Seqlen.count() 139 | stats["Greater then 1 Mb"] = df[df['Seqlen'] >= 1000000.0].Seqlen.count() 140 | 141 | stats['Yield > 10kb (Mb)'] = df[df['Seqlen'] >= 10000.0]['Seqlen'].sum() / Mbase 142 | stats['Yield > 50kb (Mb)'] = df[df['Seqlen'] >= 50000.0]['Seqlen'].sum() / Mbase 143 | 144 | if 'mean_q' in df.columns: 145 | stats['Max Qscore'] = df['mean_q'].max() 146 | stats['Min Qscore'] = df['mean_q'].min() 147 | stats['Avg Qscore'] = df['mean_q'].mean() 148 | stats['Qscore SD'] = df['mean_q'].std() 149 | stats['Yield >Q6 (Mb)'] = df[df['mean_q'] >= 6.0]['Seqlen'].sum() / Mbase 150 | stats['Yield >Q9 (Mb)'] = df[df['mean_q'] >= 9.0]['Seqlen'].sum() / Mbase 151 | 152 | stats["GC content"] = float(df[['G', "C"]].sum().sum()) / total_len * 100.0 153 | for base in bases: 154 | 155 | stats[base + ' (%)'] = float(df[base].sum()) / total_len * 100.0 156 | 157 | return stats.round(2) 158 | -------------------------------------------------------------------------------- /wub/simulate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/simulate/__init__.py -------------------------------------------------------------------------------- /wub/simulate/dist.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | 5 | """Sample from various distributions.""" 6 | 7 | 8 | def sample_truncated_gamma(mean, shape, low=None, high=None): 9 | """A naive rejection approach to sample from truncated gamma distribution. 10 | Note that truncation points ae included in the sample. 11 | 12 | :param mean: Mean of the distribution. 13 | :param shape: Shape parameter. 14 | :param low: Lower truncation point. 15 | :param high: Upper truncation point. 16 | :returns: Random sample from the specified distribution. 17 | :rtype: float 18 | 19 | """ 20 | 21 | scale = float(mean) / shape 22 | while True: 23 | sample = np.random.gamma(scale=scale, shape=shape, size=1) 24 | if low is not None and sample < low: 25 | continue 26 | if high is not None and sample > high: 27 | continue 28 | return float(sample) 29 | -------------------------------------------------------------------------------- /wub/simulate/genome.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import six 4 | import sys 5 | import numpy as np 6 | from collections import OrderedDict, namedtuple 7 | 8 | from wub.util import seq as seq_util 9 | from wub.simulate import seq as sim_seq 10 | from wub.simulate import dist 11 | 12 | Fragment = namedtuple('Fragment', 'chrom uid start end seq') 13 | 14 | 15 | def simulate_genome(number_chromosomes, mean_length, gamma_shape, low_truncation, high_truncation, base_frequencies): 16 | """Generator function for simulating chromosomes in a genome. 17 | Chromosome lengths are sampled from a truncated gamma distribution. 18 | 19 | :param number_chromosomes: Number of simulated chromosomes. 
20 | :param mean_length: Mean length of simulated chromosomes. 21 | :param gamma_shape: Shape parameter of the chromosome length distribution. 22 | :param low_truncation: Minimum chromosome length. 23 | :param high_truncation: Maximum chromosome length. 24 | :param base_frequencies: Array of base frequencies in the ACGT order. 25 | :returns: A generator of SeqRecord objects. 26 | :rtype: generator 27 | 28 | """ 29 | chrom_info = OrderedDict( 30 | ('chr' + str(i), 31 | int(dist.sample_truncated_gamma(mean_length, gamma_shape, low_truncation, high_truncation))) 32 | for i in range(number_chromosomes)) 33 | sim_iter = (seq_util.new_dna_record(sim_seq.simulate_sequence(length, base_frequencies), name) 34 | for name, length in six.iteritems(chrom_info)) 35 | return sim_iter 36 | 37 | 38 | def sample_chromosome(chromosomes): 39 | """Sample a random chromosome. 40 | 41 | :param chromosomes: A collection of SeqRecord object. 42 | :returns: A randomly sampled element from the input collection. 43 | :rtype: SeqRecord 44 | """ 45 | indexes = range(len(chromosomes)) 46 | pick = np.random.choice(indexes) 47 | return chromosomes[pick] 48 | 49 | 50 | def simulate_fragment(chromosome, mean_length, gamma_shape, low_truncation, high_truncation, fragment_number): 51 | """Simulate a fragment from a chromosome. 52 | 53 | :param chromosome: Chromosome to simulate fragment from, SeqRecord object. 54 | :param mean_length: Mean length of simulated fragment. 55 | :param gamma_shape: Shape parameter of length distribution. 56 | :param low_truncation: Minimum read length. 57 | :param high_truncation: Maximum read length. 58 | :param fragment_number: The unique identifier of fragment in simulation (number of fragment). 59 | :returns: A named tuple with chromosome id, fragment number, start, end and sequence. 60 | :rtype: namedtuple 61 | """ 62 | fragment_length = int(dist.sample_truncated_gamma( 63 | mean_length, gamma_shape, low_truncation, high_truncation)) 64 | upper_boundary = len(chromosome) - fragment_length 65 | # Special case when upper boundary is less than the read length. Maybe 66 | # should handle this by rejection? 67 | if upper_boundary < fragment_length: 68 | start = 0 69 | end = len(chromosome) 70 | else: 71 | start = np.random.randint(0, upper_boundary) 72 | end = start + fragment_length 73 | fragment_sequence = chromosome.seq[start:end] 74 | return Fragment(chromosome.id, fragment_number, start, end, fragment_sequence) 75 | 76 | 77 | def simulate_fragments(chromosomes, mean_length, gamma_shape, low_truncation, high_truncation, number_fragments): 78 | """Simulate a fragments from a set of chromosomes. Chromosomes are picked randomly for each fragment. 79 | 80 | :param chromosomes: Chromosomes to simulate fragment from, a list of SeqRecord objects. 81 | :param mean_length: Mean length of simulated fragments. 82 | :param gamma_shape: Shape parameter of length distribution. 83 | :param low_truncation: Minimum read length. 84 | :param high_truncation: Maximum read length. 85 | :param number_fragments: Number of fragments to simulate. 86 | :returns: An iterator named tuples with chromosome id, fragment number, start, end and sequence. 
87 | :rtype: generator 88 | """ 89 | fragment_uid = 0 90 | while True: 91 | if fragment_uid >= number_fragments: 92 | break 93 | chromosome = sample_chromosome(chromosomes) 94 | fragment = simulate_fragment( 95 | chromosome, mean_length, gamma_shape, low_truncation, high_truncation, fragment_uid) 96 | if (fragment.end - fragment.start) > 0: 97 | fragment_uid += 1 98 | yield fragment 99 | else: 100 | sys.stderr.write( 101 | "Skipped zero length fragment! Consider increase minimum read length!\n") 102 | -------------------------------------------------------------------------------- /wub/simulate/seq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import functools 5 | from collections import namedtuple 6 | 7 | from wub.util import seq as seq_util 8 | 9 | uniform_probs = [0.25, 0.25, 0.25, 0.25] 10 | 11 | strand_directions = ['+', '-'] 12 | 13 | MutatedSeq = namedtuple( 14 | 'MutatedSeq', 'seq real_qual real_subst real_del real_ins cigar') 15 | 16 | cigar_operations = {'match': 'M', 'substitution': 'M', 'insertion': 'I', 'deletion': 'D'} 17 | 18 | 19 | def sample_direction(forward_prob): 20 | return np.random.choice(strand_directions, p=[forward_prob, 1 - forward_prob]) 21 | 22 | 23 | def random_base(probs=uniform_probs): 24 | """Generate a random DNA base. 25 | 26 | :param probs: Probabilities of sampling a base, in the ACGT order. 27 | :returns: A sampled base. 28 | :rtype: str 29 | """ 30 | return np.random.choice(seq_util.bases, p=probs) 31 | 32 | 33 | def random_base_except(excluded, probs=uniform_probs): 34 | """Generate a random base according to the specified probabilities with the exclusion of the specified base. 35 | 36 | :param excluded: Exclude this base from sampling. 37 | :param probs: Base sampling probabilities in the ACGT order. 38 | :returns: A sampled base. 39 | :rtype: str 40 | """ 41 | if len(probs) != len(seq_util.bases): 42 | raise ValueError('Probability vector has wrong length!') 43 | # Filter out excluded base: 44 | bp_dict = dict((x, y) 45 | for x, y in zip(seq_util.bases, probs) if x != excluded) 46 | filtered_bases = list(bp_dict.keys()) 47 | norm_probs = np.array(list(bp_dict.values()), dtype=float) 48 | # Re-normalise probabilities: 49 | norm_probs = norm_probs / np.sum(norm_probs) 50 | return np.random.choice(filtered_bases, p=norm_probs) 51 | 52 | 53 | def simulate_sequence(length, probs=uniform_probs): 54 | """Simulate sequence of specified length and base composition. 55 | 56 | :param length: Length of simulated sequence. 57 | :param probs: Base composition vector in the ACGT order. 58 | :returns: Simulated sequence. 59 | :rtype: str 60 | """ 61 | return ''.join(np.random.choice(seq_util.bases, size=length, p=probs)) 62 | 63 | 64 | def sample_error_type(error_weights): 65 | """Sample error type from error weights dictionary. 66 | 67 | :param error_weights: A dcitionary with (type, probability) pairs. 68 | :returns: Error type 69 | :rtype: str 70 | """ 71 | return np.random.choice(list(error_weights.keys()), p=list(error_weights.values())) 72 | 73 | 74 | def cigar_list_to_string(cigar_list): 75 | """Sample error type from error weights dictionary. 76 | 77 | :param error_weights: A dcitionary with (type, probability) pairs. 
78 | :returns: CIGAR string. 79 | :rtype: str 80 | """ 81 | 82 | tmp = map(lambda x: str(x[0]) + str(x[1]), cigar_list) 83 | return ''.join(tmp) 84 | 85 | 86 | def compress_raw_cigar_list(raw_cigar): 87 | """Compress a raw CIGAR list by merging consecutive operations of the same type. 88 | 89 | :param raw_cigar: A list of (length, operation) tuples. 90 | :returns: Compressed list of (length, operation) tuples. 91 | :rtype: list 92 | """ 93 | 94 | raw_cigar[0] = [raw_cigar[0]] 95 | 96 | def cigar_op_compose(a, b): 97 | x = a.pop() 98 | if x[1] == b[1]: 99 | a.append((x[0] + b[0], x[1])) 100 | else: 101 | a.extend([x, b]) 102 | return a 103 | 104 | cigar = functools.reduce(cigar_op_compose, raw_cigar) 105 | return cigar 106 | 107 | 108 | def simulate_sequencing_errors(sequence, error_rate, error_weights): 109 | """Simulate substitutions, deletions and insertions. 110 | 111 | :param sequence: Input sequence. 112 | :param error_rate: Total error rate. 113 | :param error_weights: A dictionary with error types as keys and probabilities as values. 114 | The possible error types are: substitution, deletion, insertion. 115 | :returns: A named tuple with elements: mutated sequence, realised quality, number of realised substitutions, 116 | number of realised deletions, number of realised insertions, cigar string. 117 | :rtype: namedtuple 118 | """ 119 | if len(sequence) == 0: 120 | raise Exception('Cannot simulate sequencing errors on empty sequence!') 121 | 122 | new_bases = [] 123 | 124 | realised_substitutions = 0 125 | realised_deletions = 0 126 | realised_insertions = 0 127 | raw_cigar_list = [] 128 | 129 | for position, base in enumerate(sequence): 130 | if np.random.uniform() < error_rate: 131 | error_type = sample_error_type(error_weights) 132 | 133 | if error_type == 'substitution': 134 | new_base = random_base_except(base) 135 | realised_substitutions += 1 136 | raw_cigar_list.append((1, cigar_operations[error_type])) 137 | 138 | elif error_type == 'deletion': 139 | new_base = '' 140 | realised_deletions += 1 141 | raw_cigar_list.append((1, cigar_operations[error_type])) 142 | 143 | elif error_type == 'insertion': 144 | new_base = base + random_base() 145 | realised_insertions += 1 146 | raw_cigar_list.append((1, cigar_operations['match'])) 147 | raw_cigar_list.append((1, cigar_operations[error_type])) 148 | 149 | else: 150 | raise Exception("Unhandled error type: {}".format(error_type)) 151 | else: 152 | raw_cigar_list.append((1, cigar_operations['match'])) 153 | new_base = base 154 | new_bases.append(new_base) 155 | 156 | new_sequence = ''.join(new_bases) 157 | cigar = cigar_list_to_string(compress_raw_cigar_list(raw_cigar_list)) 158 | 159 | realised_events = realised_substitutions + \ 160 | realised_deletions + realised_insertions 161 | realised_quality = seq_util.prob_to_phred( 162 | round(float(realised_events) / float(len(sequence)), 3)) 163 | mutated_record = MutatedSeq( 164 | new_sequence, realised_quality, realised_substitutions, realised_deletions, realised_insertions, cigar) 165 | return mutated_record 166 | 167 | 168 | def add_errors(seq, nr_errors, error_type): 169 | """Introduce a specified number of errors in the target sequence at random positions. 170 | 171 | :param seq: Input DNA sequence. 172 | :param nr_errors: Number of errors to introduce. :param error_type: Type of error to introduce ('substitution', 'deletion' or 'insertion'). 173 | :returns: Mutated sequence.
174 | :rtype: str 175 | """ 176 | seq = list(seq) 177 | positions = np.random.choice(np.arange(len(seq)), size=nr_errors, replace=False) 178 | if error_type == 'substitution': 179 | for pos in positions: 180 | seq[pos] = random_base_except(seq[pos]) 181 | elif error_type == 'deletion': 182 | for pos in positions: 183 | seq[pos] = '' 184 | elif error_type == 'insertion': 185 | for pos in positions: 186 | seq[pos] = seq[pos] + random_base() 187 | else: 188 | raise Exception('Invalid error type') 189 | return ''.join(seq) 190 | -------------------------------------------------------------------------------- /wub/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /wub/tests/data/test_bam_stats/stat_ref.fas: -------------------------------------------------------------------------------- 1 | >seq_0 2 | GTACAGCGGACAGTATGAAGGAAACTGACAACGCGAGTCACGTAATGGAGATGGATCCCAGACTGTTGCCCGGACCGATTCAAGCACA 3 | -------------------------------------------------------------------------------- /wub/tests/data/test_bam_stats/stat_test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/tests/data/test_bam_stats/stat_test.bam -------------------------------------------------------------------------------- /wub/tests/data/test_bam_stats/stat_test.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/tests/data/test_bam_stats/stat_test.bam.bai -------------------------------------------------------------------------------- /wub/tests/data/test_blastn_parse/blastn_test.coords: -------------------------------------------------------------------------------- 1 | seq_0 seq_0 100.00 100 0 0 1 100 201 300 3e-50 181 2 | seq_0 seq_0 100.00 100 0 0 1 100 600 501 3e-50 181 3 | -------------------------------------------------------------------------------- /wub/tests/data/test_nucmer_parse/nucmer_test.coords: -------------------------------------------------------------------------------- 1 | /nfs/vnx-home/bsipos/gt/r2/scripts/read_err.fas /nfs/vnx-home/bsipos/gt/r2/scripts/adapter.fas 2 | NUCMER 3 | 4 | [S1] [E1] | [S2] [E2] | [LEN 1] [LEN 2] | [% IDY] | [TAGS] 5 | ===================================================================================== 6 | 454 500 | 1 49 | 47 49 | 90.00 | seq_0 adapter 7 | 956 1003 | 3 50 | 48 48 | 92.00 | seq_0 adapter 8 | -------------------------------------------------------------------------------- /wub/tests/test_bam_compare.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import tempfile 4 | import os 5 | 6 | from wub.bam import compare 7 | 8 | 9 | class TestBamCompare(unittest.TestCase): 10 | 11 | """Test BAM comparison test.""" 12 | 13 | def _generate_test_data(self): 14 | """Generate test data for dnadiff test.""" 15 | fh_sam_one = tempfile.NamedTemporaryFile(suffix=".sam", delete=False, mode='w') 16 | self.sam_one = fh_sam_one.name 17 | 18 | data = """@SQ SN:chr0 LN:827 19 | @SQ SN:chr1 LN:6379 20 | @PG ID:bwa PN:bwa VN:0.7.15-r1142-dirty CL:bwa mem genome.fas reads.fq 21 | r0_chr1_4118_4168_+/q17/s0/d0/i1 0 chr1 4119 60 8M1I42M * 0 0 CATTTGGTACCATTGTGATCCGCTCTTAGAAACTTTTGGCACTTTATCGCG 
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:50 AS:i:43 XS:i:0 22 | r1_chr1_72_122_+/q12/s0/d2/i1 4 * 0 0 * * 0 0 AGCGCAGTGGTCGACTTAGCTTATTCACGAGAGCCTTCCAACTGGCCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 23 | r2_chr0_279_329_-/q17/s0/d1/i0 16 chr0 280 60 16M1D33M * 0 0 AGAACTTGCAAGCGCGGCTCCAGCCTTTCAGGACGAGACCCTCCAAGAC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:16^C33 AS:i:42 XS:i:0 24 | r3_chr1_60_110_+/q14/s1/d1/i0 0 chr1 61 51 36M1D13M * 0 0 GGTGTTTTATATAGCGCAGTGTCGACTTAGCTTATTGCGACGAGCCTTC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:2 MD:Z:36^C0A12 AS:i:37 XS:i:0 25 | r4_chr1_1268_1318_+/q12/s1/d2/i0 0 chr1 1269 28 19M1D23M1D6M * 0 0 GTATTCCATCGAGCTGGATCAGTTTAGGAGTGTGCCTAGGTATATCCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:19^G5G17^C6 AS:i:30 XS:i:0 26 | r5_chr0_576_626_-/q12/s1/d2/i0 4 * 0 0 * * 0 0 GCAAATTTTACAGATGATAAAACACCGAATATTCAGACCGTGTAAATA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 27 | r6_chr0_509_559_-/q12/s0/d3/i0 16 chr0 510 60 20M1D27M * 0 0 TGTGGTAGGAGCGGAGCGGGCCCACACCCCCATCCCCCGCGAAATAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:20^A27 AS:i:40 XS:i:0 28 | r7_chr1_2417_2467_-/q12/s1/d1/i1 16 chr1 2418 41 6M1D37M1I6M * 0 0 AGCCGATCATCCCGTCCCTGTTCACTCCTACGTCTTGGCTTGGAAAGTGT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:6^G27G15 AS:i:34 XS:i:0 29 | r8_chr0_661_711_-/q11/s0/d3/i1 4 * 0 0 * * 0 0 GTCTGAGGCGCCATATTAGGCGGGCAAAATGGACTATGACTGTGGCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 30 | r9_chr0_523_573_-/q14/s0/d1/i1 16 chr0 524 59 3M1I25M1D21M * 0 0 AGCGGGGACCCACACCCCCATCCCCCGCGAATAATTCAACGTTCGCATTA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:2 MD:Z:28^A21 AS:i:39 XS:i:0 31 | r10_chr1_2417_2467_-/q12/s1/d1/i1 16 chr1 2418 41 6M1D37M1I6M * 0 0 AGCCGATCATCCCGTCCCTGTTCACTCCTACGTCTTGGCTTGGAAAGTGT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:6^G27G15 AS:i:34 XS:i:0 32 | """ 33 | 34 | fh_sam_one.write(data) 35 | fh_sam_one.flush() 36 | fh_sam_one.close() 37 | 38 | data = """@SQ SN:chr0 LN:827 39 | @SQ SN:chr1 LN:6379 40 | @PG ID:bwa PN:bwa VN:0.7.15-r1142-dirty CL:bwa mem genome.fas reads.fq 41 | r0_chr1_4118_4168_+/q17/s0/d0/i1 0 chr1 4119 60 8M1I42M * 0 0 CATTTGGTACCATTGTGATCCGCTCTTAGAAACTTTTGGCACTTTATCGCG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:50 AS:i:43 XS:i:0 42 | r1_chr1_72_122_+/q12/s0/d2/i1 4 * 0 0 * * 0 0 AGCGCAGTGGTCGACTTAGCTTATTCACGAGAGCCTTCCAACTGGCCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 43 | r2_chr0_279_329_-/q17/s0/d1/i0 16 chr0 280 60 16M1D33M * 0 0 AGAACTTGCAAGCGCGGCTCCAGCCTTTCAGGACGAGACCCTCCAAGAC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:16^C33 AS:i:42 XS:i:0 44 | r3_chr1_60_110_+/q14/s1/d1/i0 0 chr1 61 51 36M1D13M * 0 0 GGTGTTTTATATAGCGCAGTGTCGACTTAGCTTATTGCGACGAGCCTTC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:2 MD:Z:36^C0A12 AS:i:37 XS:i:0 45 | r4_chr1_1268_1318_+/q12/s1/d2/i0 0 chr1 1269 28 19M1D23M1D6M * 0 0 GTATTCCATCGAGCTGGATCAGTTTAGGAGTGTGCCTAGGTATATCCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:19^G5G17^C6 AS:i:30 XS:i:0 46 | r5_chr0_576_626_-/q12/s1/d2/i0 4 * 0 0 * * 0 0 GCAAATTTTACAGATGATAAAACACCGAATATTCAGACCGTGTAAATA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 47 | r6_chr0_509_559_-/q12/s0/d3/i0 16 chr0 510 60 20M1D27M * 0 0 TGTGGTAGGAGCGGAGCGGGCCCACACCCCCATCCCCCGCGAAATAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:20^A27 
AS:i:40 XS:i:0 48 | r7_chr1_2417_2467_-/q12/s1/d1/i1 16 chr1 2418 41 6M1D37M1I6M * 0 0 AGCCGATCATCCCGTCCCTGTTCACTCCTACGTCTTGGCTTGGAAAGTGT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:6^G27G15 AS:i:34 XS:i:0 49 | r8_chr0_661_711_-/q11/s0/d3/i1 4 * 0 0 * * 0 0 GTCTGAGGCGCCATATTAGGCGGGCAAAATGGACTATGACTGTGGCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 50 | r9_chr0_523_573_-/q14/s0/d1/i1 16 chr0 726 59 4M25M1D21M * 0 0 AGCGGGGACCCACACCCCCATCCCCCGCGAATAATTCAACGTTCGCATTA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:2 MD:Z:28^A21 AS:i:39 XS:i:0 51 | r10_chr1_2417_2467_-/q12/s1/d1/i1 16 chr1 2420 41 2H4M1D37M1I6M * 0 0 CCGATCATCCCGTCCCTGTTCACTCCTACGTCTTGGCTTGGAAAGTGT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:6^G27G15 AS:i:34 XS:i:0 52 | """ 53 | fh_sam_two = tempfile.NamedTemporaryFile(suffix=".sam", delete=False, mode='w') 54 | self.sam_two = fh_sam_two.name 55 | 56 | fh_sam_two.write(data) 57 | fh_sam_two.flush() 58 | fh_sam_two.close() 59 | 60 | def _cleanup_test_data(self): 61 | """Cleanup test dataset.""" 62 | os.unlink(self.sam_one) 63 | os.unlink(self.sam_two) 64 | 65 | def test_bam_read_counter(self): 66 | """Test read_counter wrapper.""" 67 | self._generate_test_data() 68 | res = compare.bam_compare(self.sam_one, self.sam_two, in_format='SAM') 69 | self.assertAlmostEqual(res['AlignedSimilarity'], 0.8680, places=3) 70 | self._cleanup_test_data() 71 | -------------------------------------------------------------------------------- /wub/tests/test_bam_read_counter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import tempfile 4 | import os 5 | 6 | from wub.bam import read_counter 7 | 8 | error_rate = 0.1 9 | ref_length = 5000 10 | 11 | 12 | class TestBamReadCounter(unittest.TestCase): 13 | 14 | """Test BAM read counter wrapper.""" 15 | 16 | def _generate_test_data(self): 17 | """Generate test data for dnadiff test.""" 18 | fh_sam = tempfile.NamedTemporaryFile(suffix=".sam", delete=False, mode='w') 19 | self.sam = fh_sam.name 20 | 21 | data = """@SQ SN:chr0 LN:827 22 | @SQ SN:chr1 LN:6379 23 | @PG ID:bwa PN:bwa VN:0.7.15-r1142-dirty CL:bwa mem genome.fas reads.fq 24 | r0_chr1_4118_4168_+/q17/s0/d0/i1 0 chr1 4119 60 8M1I42M * 0 0 CATTTGGTACCATTGTGATCCGCTCTTAGAAACTTTTGGCACTTTATCGCG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:50 AS:i:43 XS:i:0 25 | r1_chr1_72_122_+/q12/s0/d2/i1 4 * 0 0 * * 0 0 AGCGCAGTGGTCGACTTAGCTTATTCACGAGAGCCTTCCAACTGGCCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 26 | r2_chr0_279_329_-/q17/s0/d1/i0 16 chr0 280 60 16M1D33M * 0 0 AGAACTTGCAAGCGCGGCTCCAGCCTTTCAGGACGAGACCCTCCAAGAC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:16^C33 AS:i:42 XS:i:0 27 | r3_chr1_60_110_+/q14/s1/d1/i0 0 chr1 61 51 36M1D13M * 0 0 GGTGTTTTATATAGCGCAGTGTCGACTTAGCTTATTGCGACGAGCCTTC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:2 MD:Z:36^C0A12 AS:i:37 XS:i:0 28 | r4_chr1_1268_1318_+/q12/s1/d2/i0 0 chr1 1269 28 19M1D23M1D6M * 0 0 GTATTCCATCGAGCTGGATCAGTTTAGGAGTGTGCCTAGGTATATCCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:19^G5G17^C6 AS:i:30 XS:i:0 29 | r5_chr0_576_626_-/q12/s1/d2/i0 4 * 0 0 * * 0 0 GCAAATTTTACAGATGATAAAACACCGAATATTCAGACCGTGTAAATA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 30 | r6_chr0_509_559_-/q12/s0/d3/i0 16 chr0 510 60 20M1D27M * 0 0 TGTGGTAGGAGCGGAGCGGGCCCACACCCCCATCCCCCGCGAAATAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:20^A27 
AS:i:40 XS:i:0 31 | r7_chr1_2417_2467_-/q12/s1/d1/i1 16 chr1 2418 41 6M1D37M1I6M * 0 0 AGCCGATCATCCCGTCCCTGTTCACTCCTACGTCTTGGCTTGGAAAGTGT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:6^G27G15 AS:i:34 XS:i:0 32 | r8_chr0_661_711_-/q11/s0/d3/i1 4 * 0 0 * * 0 0 GTCTGAGGCGCCATATTAGGCGGGCAAAATGGACTATGACTGTGGCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 33 | r9_chr0_523_573_-/q14/s0/d1/i1 16 chr0 524 59 3M1I25M1D21M * 0 0 AGCGGGGACCCACACCCCCATCCCCCGCGAATAATTCAACGTTCGCATTA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:2 MD:Z:28^A21 AS:i:39 XS:i:0 34 | """ 35 | 36 | fh_sam.write(data) 37 | fh_sam.flush() 38 | fh_sam.close() 39 | 40 | def _cleanup_test_data(self): 41 | """Cleanup test dataset.""" 42 | os.unlink(self.sam) 43 | 44 | def test_bam_read_counter(self): 45 | """Test read_counter wrapper.""" 46 | self._generate_test_data() 47 | res = read_counter.count_reads(self.sam, in_format='SAM') 48 | self.assertEqual(res[0]['chr0'], 3) 49 | self.assertEqual(res[0]['chr1'], 4) 50 | self._cleanup_test_data() 51 | -------------------------------------------------------------------------------- /wub/tests/test_bam_stats.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import six 4 | from os import path 5 | from collections import OrderedDict 6 | from Bio import SeqIO 7 | from wub.bam import stats 8 | from wub.util import seq as seq_util 9 | 10 | 11 | class TestBamStats(unittest.TestCase): 12 | 13 | """Test BAM statistics functions.""" 14 | 15 | def test_error_and_read_stats(self): 16 | """Test the gathering of error and read statistics.""" 17 | top = path.dirname(__file__) 18 | ref_fasta = path.join(top, "data/test_bam_stats/stat_ref.fas") 19 | bam = path.join(top, "data/test_bam_stats/stat_test.bam") 20 | refs = seq_util.read_seq_records_dict(ref_fasta) 21 | res = stats.error_and_read_stats( 22 | bam, refs, context_sizes=(1, 1), region=None, min_aqual=0, verbose=False) 23 | 24 | # Test evenets: 25 | self.assertEqual(res['events']['AGA'], {'*': 1, 'G': 2}) 26 | self.assertEqual(res['events']['CGA'], {'-': 1, 'G': 2}) 27 | self.assertEqual(res['events']['ACA'], {'C': 2, 'T': 1}) 28 | 29 | # Test indel properties: 30 | self.assertEqual(res['indel_dists']['insertion_lengths'], {8: 1}) 31 | self.assertEqual(res['indel_dists']['insertion_composition'], {'G': 8}) 32 | self.assertEqual(res['indel_dists']['deletion_lengths'], {9: 1}) 33 | 34 | # Test read statistics: 35 | self.assertEqual(res['read_stats'], {'alignment_lengths': [87], 'mapping_quals': [47], 'unaligned_lengths': [], 'unaligned_quals': [], 'mqfail_alignment_lengths': [], 'mapped': 1, 'unmapped': 0, 'mqfail_aligned_quals': [], 'aligned_quals': [40], 'aligned_lengths': [87]}) 36 | 37 | def test_read_stats(self): 38 | """Test the gathering read statistics.""" 39 | top = path.dirname(__file__) 40 | bam = path.join(top, "data/test_bam_stats/stat_test.bam") 41 | res = stats.read_stats(bam, region=None, min_aqual=0, verbose=False) 42 | 43 | self.maxDiff = None 44 | target = {'aligned_lengths': [87], 'read_stats': OrderedDict([('name', ['r0_seq_1_0_87_+/q93/s0/d0/i0']), ('ref', ['seq_0']), ('coverage', [1.0]), ('direction', ['+']), ('aln_length', [96]), ('insertion', [8]), ('deletion', [9]), ('mismatch', [1]), ('match', [78]), ('identity', [0.9873417721518988]), ('accuracy', [0.8125]), ('clipps', [0])]), 'aligned_quals': [ 45 | 40], 'base_stats': {'deletion': 9, 'mismatch': 1, 'identity': 0.9873417721518988, 'insertion': 8, 'clipps': 0, 
'aln_length': 96, 'match': 78, 'accuracy': 0.8125}, 'mapping_quals': [47], 'mqfail_alignment_lengths': [], 'alignment_lengths': [87], 'mqfail_aligned_quals': [], 'unaligned_lengths': [], 'unaligned_quals': [], 'mapped': 1, 'unmapped': 0} 46 | self.assertEqual(res, target) 47 | 48 | def test_pileup_stats(self): 49 | """Test the gathering read statistics.""" 50 | top = path.dirname(__file__) 51 | bam = path.join(top, "data/test_bam_stats/stat_test.bam") 52 | res = stats.pileup_stats(bam, region=None, verbose=False) 53 | 54 | self.assertEqual(res, {'coverage': {'seq_0': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1, 28: 1, 29: 1, 30: 1, 31: 1, 32: 1, 33: 1, 34: 1, 35: 1, 36: 1, 37: 1, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1, 45: 1, 46: 1, 47: 1, 48: 1, 49: 1, 50: 1, 51: 1, 52: 1, 53: 1, 54: 1, 55: 1, 56: 1, 57: 1, 58: 1, 59: 1, 60: 1, 61: 1, 62: 1, 63: 1, 64: 1, 65: 1, 66: 1, 67: 1, 68: 1, 69: 1, 70: 1, 71: 1, 72: 1, 73: 1, 74: 1, 75: 1, 76: 1, 77: 1, 78: 1, 79: 1, 80: 1, 81: 1, 82: 1, 83: 1, 84: 1, 85: 1, 86: 1, 87: 1}}, 'qualities': {'seq_0': {0: [40], 1: [40], 2: [40], 3: [40], 4: [40], 5: [ 55 | 40], 6: [40], 7: [40], 8: [40], 9: [40], 10: [40], 11: [40], 12: [40], 13: [40], 14: [40], 15: [40], 16: [40], 17: [40], 18: [40], 19: [40], 20: [40], 21: [40], 22: [40], 23: [40], 24: [40], 34: [40], 35: [40], 36: [40], 37: [40], 38: [40], 39: [40], 40: [40], 41: [40], 42: [40], 43: [40], 44: [40], 45: [40], 46: [40], 47: [40], 48: [40], 49: [40], 50: [40], 51: [40], 52: [40], 53: [40], 54: [40], 55: [40], 56: [40], 57: [40], 58: [40], 59: [40], 60: [40], 61: [40], 62: [40], 63: [40], 64: [40], 65: [40], 66: [40], 67: [40], 68: [40], 69: [40], 70: [40], 71: [40], 72: [40], 73: [40], 74: [40], 75: [40], 76: [40], 77: [40], 78: [40], 79: [40], 80: [40], 81: [40], 82: [40], 83: [40], 84: [40], 85: [40], 86: [40], 87: [40]}}}) 56 | 57 | def test_fragment_stats(self): 58 | """Test the gathering of fragment statistics.""" 59 | top = path.dirname(__file__) 60 | bam = path.join(top, "data/test_bam_stats/stat_test.bam") 61 | ref = path.join(top, "data/test_bam_stats/stat_ref.fas") 62 | references = SeqIO.index(ref, format='fasta') 63 | chrom_lengths = {name: len(so) for name, so in six.iteritems(references)} 64 | res = stats.frag_coverage(bam, chrom_lengths, region=None, min_aqual=0, verbose=False) 65 | 66 | self.maxDiff = None 67 | target = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 70 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 71 | self.assertEqual(list(res['frags_fwd']['seq_0']), target) 72 | -------------------------------------------------------------------------------- /wub/tests/test_blastn_coord_parse.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from os import path 4 | from wub.parsers import blastn 5 | 6 | 7 | class TestBlastnCoordParse(unittest.TestCase): 8 | 9 | def test_nucmer_coord_parse(self): 10 | """Test blastn outfmt 6 cooridnate parsing.""" 11 | top = path.dirname(__file__) 12 | coord_file = path.join(top, "data/test_blastn_parse/blastn_test.coords") 13 | records = blastn.parse_coords(coord_file) 14 | self.assertEqual(records, [{'gapopen': 0, 'query_end': 100, 'mismatch': 0, 'ref_end': 
300, 'query': 'seq_0', 'identity': 100.0, 'bitscore': 181.0, 'query_start': 1, 'ref_start': 201, 'strand': '+', 'aln_length': 100, 'ref': 'seq_0', 'evalue': 3e-50}, {'gapopen': 0, 'query_end': 100, 'mismatch': 0, 'ref_end': 600, 'query': 'seq_0', 'identity': 100.0, 'bitscore': 181.0, 'query_start': 1, 'ref_start': 501, 'strand': '-', 'aln_length': 100, 'ref': 'seq_0', 'evalue': 3e-50}]) 15 | -------------------------------------------------------------------------------- /wub/tests/test_contig_stats.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from wub.read_stats import contig_stats 4 | import pandas as pd 5 | 6 | 7 | class TestContigStats(unittest.TestCase): 8 | """Test N50 utility function.""" 9 | 10 | def test_N50(self): 11 | """Test calculation of N50.""" 12 | sequence_lengths = pd.DataFrame({'dummy': [2, 3, 4, 5, 6, 7, 8, 9, 10]}) 13 | self.assertEqual(contig_stats.N50(sequence_lengths, 'dummy'), 8) 14 | -------------------------------------------------------------------------------- /wub/tests/test_example.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class ExampleTest(unittest.TestCase): 5 | 6 | def setUp(self): 7 | pass 8 | 9 | def tearDown(self): 10 | pass 11 | 12 | def test_success(self): 13 | self.assertTrue(True) 14 | -------------------------------------------------------------------------------- /wub/tests/test_mappers_lastal.py: -------------------------------------------------------------------------------- 1 | import six 2 | import unittest 3 | import tempfile 4 | import os 5 | 6 | from wub.mappers import lastal 7 | from wub.util import seq as seq_util 8 | from wub.simulate import seq as sim_seq 9 | from wub.util import cmd as cmd_util 10 | 11 | error_rate = 0.1 12 | ref_length = 1000 13 | 14 | 15 | class TestMappersLastal(unittest.TestCase): 16 | 17 | def setUp(self): 18 | fh_ref = tempfile.NamedTemporaryFile(suffix=".fas", delete=False, mode='w') 19 | self.ref_fasta = fh_ref.name 20 | fh_target = tempfile.NamedTemporaryFile(suffix=".fas", delete=False, mode='w') 21 | self.target_fasta = fh_target.name 22 | 23 | self.ref = sim_seq.simulate_sequence(ref_length) 24 | nr_errors = int(len(self.ref) * error_rate) 25 | self.target = sim_seq.add_errors(self.ref, nr_errors, 'substitution') 26 | 27 | left_flanking = sim_seq.simulate_sequence(50) 28 | right_flanking = sim_seq.simulate_sequence(50) 29 | 30 | self.ref = left_flanking + self.ref + right_flanking 31 | self.target = left_flanking + self.target + right_flanking 32 | 33 | fh_ref.write(">ref\n{}\n".format(self.ref)) 34 | fh_ref.flush() 35 | fh_ref.close() 36 | 37 | fh_target.write(">target\n{}\n".format(self.target)) 38 | fh_target.flush() 39 | fh_target.close() 40 | 41 | def tearDown(self): 42 | os.unlink(self.ref_fasta) 43 | os.unlink(self.target_fasta) 44 | 45 | def test_parse_lastal_identical(self): 46 | raw = """\ 47 | # batch 0 48 | a score=23 EG2=3.8e+06 E=5.2e-13 49 | s Simulomonas 0 23 + 23 ATGCGGGGGATAGGACCATATCT 50 | s tig00000000 0 23 + 23 ATGCGGGGGATAGGACCATATCT 51 | """ 52 | parsed = six.next(lastal.parse_lastal(raw)) 53 | acc = seq_util.alignment_stats(parsed.r_aln, parsed.q_aln).accuracy 54 | self.assertEqual(acc, 1.0) 55 | self.assertEqual(parsed.score, 23) 56 | 57 | def test_parse_lastal_difference(self): 58 | raw = """\ 59 | # batch 0 60 | a score=23 EG2=3.8e+06 E=5.2e-13 61 | s Simulomonas 0 23 + 23 TTGCGGGGGATAGGACCATATCT 62 | s tig00000000 0 23 + 23 
ATGCGGGGGATAGGACCATATCT 63 | """ 64 | parsed = six.next(lastal.parse_lastal(raw)) 65 | acc = seq_util.alignment_stats(parsed.r_aln, parsed.q_aln).accuracy 66 | self.assertAlmostEqual(acc, 0.9565, places=3) 67 | 68 | def test_parse_lastal_zero(self): 69 | raw = """\ 70 | # batch 0 71 | a score=23 EG2=3.8e+06 E=5.2e-13 72 | s Simulomonas 0 23 + 23 CCCTCCCCCCCCCCCTTCCCCAC 73 | s tig00000000 0 23 + 23 ATGCGGGGGATAGGACCATATCT 74 | """ 75 | parsed = six.next(lastal.parse_lastal(raw)) 76 | acc = seq_util.alignment_stats(parsed.r_aln, parsed.q_aln).accuracy 77 | self.assertAlmostEqual(acc, 0.0, places=3) 78 | 79 | @unittest.skipIf(not cmd_util.find_executable('lastal'), 80 | "Lastal binary not found, skipping integration tests.") 81 | def test_lastal_compare_genomes(self): 82 | tmp = lastal.compare_genomes_lastal( 83 | self.ref_fasta, self.target_fasta) 84 | substs = tmp['substitutions'][0] 85 | self.assertEqual(int(ref_length * error_rate), substs) 86 | -------------------------------------------------------------------------------- /wub/tests/test_nucmer_coord_parse.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from os import path 4 | from wub.parsers import mummer 5 | 6 | 7 | class TestNucmerCoordParse(unittest.TestCase): 8 | 9 | """Test numcmer cooridnate parsing.""" 10 | 11 | def test_nucmer_coord_parse(self): 12 | """Test parsing of nucmer coordinate files.""" 13 | top = path.dirname(__file__) 14 | coord_file = path.join(top, "data/test_nucmer_parse/nucmer_test.coords") 15 | records = mummer.parse_coords(coord_file) 16 | self.assertEqual(records, [{'query_len': 49, 'query_end': 49, 'ref_end': 500, 'ref_len': 47, 'query': 'adapter', 'query_start': 1, 'ref_start': 454, 'ref': 'seq_0', 'identity': 90.0}, {'query_len': 48, 'query_end': 50, 'ref_end': 1003, 'ref_len': 48, 'query': 'adapter', 'query_start': 3, 'ref_start': 956, 'ref': 'seq_0', 'identity': 92.0}]) 17 | -------------------------------------------------------------------------------- /wub/tests/test_simulate_genome.py: -------------------------------------------------------------------------------- 1 | import six 2 | import unittest 3 | 4 | from wub.simulate import genome as sim_genome 5 | 6 | 7 | class TestSimulateGenome(unittest.TestCase): 8 | 9 | """Test genome simulation utilities.""" 10 | 11 | def test_simulate_genome(self): 12 | """Test genome simulator.""" 13 | record = six.next(sim_genome.simulate_genome(number_chromosomes=1, mean_length=1000, 14 | gamma_shape=50, low_truncation=1000, high_truncation=1001, base_frequencies=[0.25] * 4)) 15 | self.assertEqual(len(record), 1000) 16 | 17 | def test_simulate_fragment(self): 18 | """Test fragment simulator.""" 19 | chrom = six.next(sim_genome.simulate_genome(number_chromosomes=1, mean_length=1000, 20 | gamma_shape=50, low_truncation=1000, high_truncation=1001, base_frequencies=[0.25] * 4)) 21 | frag = sim_genome. simulate_fragment( 22 | chrom, mean_length=50, gamma_shape=50, low_truncation=50, high_truncation=51, fragment_number=0) 23 | self.assertEqual(frag.end - frag.start, 50) 24 | 25 | def test_simulate_fragment_edge(self): 26 | """Test fragment simulator (edge case).""" 27 | chrom = six.next(sim_genome.simulate_genome(number_chromosomes=1, mean_length=1000, 28 | gamma_shape=50, low_truncation=1000, high_truncation=1001, base_frequencies=[0.25] * 4)) 29 | frag = sim_genome. 
simulate_fragment( 30 | chrom, mean_length=2000, gamma_shape=50, low_truncation=2000, high_truncation=2001, fragment_number=0) 31 | self.assertEqual(frag.end - frag.start, 1000) 32 | -------------------------------------------------------------------------------- /wub/tests/test_simulate_seq.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import editdistance 4 | import numpy as np 5 | from wub.simulate import seq as sim_seq 6 | from wub.util import seq as seq_util 7 | 8 | 9 | class TestSimulateSeq(unittest.TestCase): 10 | 11 | """Test sequence simulation utilities.""" 12 | 13 | def test_simulate_sequencing_errors(self): 14 | """Test function simulating sequencing errors.""" 15 | error_rate = 0.1 16 | error_weights = {'substitution': 1.0 / 6, 17 | 'insertion': 1.0 / 6, 18 | 'deletion': 4.0 / 6} 19 | sequence = sim_seq.simulate_sequence(5000) 20 | mutated_record = sim_seq.simulate_sequencing_errors( 21 | sequence, error_rate, error_weights) 22 | distance = editdistance.eval(sequence, mutated_record.seq) 23 | expected_errors = len(sequence) * error_rate 24 | errors_sd = np.sqrt(len(sequence) * error_rate * (1 - error_rate)) 25 | # Should pass 0.9973 proportion of cases: 26 | self.assertTrue(expected_errors - errors_sd * 3 < distance < expected_errors + 27 | errors_sd * 3, msg="expected: {} realised:{}".format(expected_errors, distance)) 28 | 29 | def test_add_errors(self): 30 | """Test function adding sequencing errors.""" 31 | seq = "ATGCATGCATGC" 32 | mut_seq = sim_seq.add_errors(seq, 6, 'substitution') 33 | self.assertSequenceEqual(seq_util.alignment_stats(seq, mut_seq), (12, 6, 0, 0, 0.5)) 34 | 35 | def test_compress_raw_cigar_list(self): 36 | """Test compression of raw cigar lists.""" 37 | cigar_list = [ 38 | (1, 'M'), (1, 'M'), (1, 'M'), (1, 'D'), (1, 'D'), (1, 'M'), (1, 'I'), (1, 'M')] 39 | compressed = sim_seq.compress_raw_cigar_list(cigar_list) 40 | expected = [(3, 'M'), (2, 'D'), (1, 'M'), (1, 'I'), (1, 'M')] 41 | self.assertSequenceEqual(compressed, expected) 42 | 43 | def test_cigar_list_to_string(self): 44 | """Test formatting of cigar strings.""" 45 | cigar_list = [(3, 'M'), (2, 'D'), (1, 'M'), (1, 'I'), (1, 'M')] 46 | cigar_string = sim_seq.cigar_list_to_string(cigar_list) 47 | expected = "3M2D1M1I1M" 48 | self.assertEqual(cigar_string, expected) 49 | -------------------------------------------------------------------------------- /wub/tests/test_util_parse.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from wub.util import parse 4 | import numpy as np 5 | 6 | 7 | class TestUtilParse(unittest.TestCase): 8 | 9 | """Test parsing utilities.""" 10 | 11 | def test_separated_list_to_floats(self): 12 | """Test parsing of separated lists.""" 13 | string = "0.1,0.2,0.3" 14 | parsed = (0.1, 0.2, 0.3) 15 | self.assertSequenceEqual(parse.separated_list_to_floats(string), parsed) 16 | 17 | def test_args_string_to_dict(self): 18 | """Test parsing of dictionaries encoded in separated strings.""" 19 | string = "a:0.1,b:0.2,c:0.3" 20 | parsed = (("a", "0.1"), ("b", "0.2"), ("c", "0.3")) 21 | self.assertSequenceEqual(tuple(parse.args_string_to_dict(string).items()), parsed) 22 | 23 | def test_args_string_to_dict_empty(self): 24 | """Test parsing of dictionaries encoded in separated strings (empty input).""" 25 | string = "" 26 | self.assertEqual(len(tuple(parse.args_string_to_dict(string).items())), 0) 27 | 28 | def test_normalise_array(self): 29 | """Test array 
normalization.""" 30 | a = np.array([2, 2, 2, 2]) 31 | a_norm = np.array([0.25, 0.25, 0.25, 0.25], dtype=float) 32 | self.assertTrue(all(parse.normalise_array(a) == a_norm)) 33 | -------------------------------------------------------------------------------- /wub/tests/test_util_seq.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from wub.util import seq 4 | from Bio.SeqRecord import SeqRecord 5 | 6 | 7 | class TestUtilSeq(unittest.TestCase): 8 | 9 | """Test sequence utilities.""" 10 | 11 | def test_new_dna_record(self): 12 | """Test the construction of new DNA SeqRecord.""" 13 | sequence = seq.new_dna_record("ATGC", "test") 14 | self.assertEqual(type(sequence), SeqRecord) 15 | 16 | def test_mock_qualities(self): 17 | """Test quality mocking function.""" 18 | sequence = seq.new_dna_record("ATGC", "test") 19 | mock_qual = 40 20 | qual_seq = seq.mock_qualities(sequence, mock_qual) 21 | self.assertSequenceEqual( 22 | qual_seq.letter_annotations["phred_quality"], [mock_qual] * len(qual_seq)) 23 | 24 | def test_reverse_complement(self): 25 | """Test reverse complementing.""" 26 | sequence = "ATGCNXatgcnx-" 27 | revcomp = "-xngcatXNGCAT" 28 | self.assertEqual(seq.reverse_complement(sequence), revcomp) 29 | 30 | def test_prob_to_phred(self): 31 | """Test error probability to phred score conversion.""" 32 | self.assertEqual(seq.prob_to_phred(0.5), 3) 33 | 34 | def test_prob_to_phred_max(self): 35 | """Test error probability to phred score conversion (very small error).""" 36 | self.assertEqual(seq.prob_to_phred(1 * 10 ** -10), 93) 37 | 38 | def test_phred_to_prob(self): 39 | """Test error probability to phred score conversion.""" 40 | self.assertAlmostEqual(seq.phred_to_prob(3), 0.5, places=2) 41 | 42 | def test_mean_qscore_large(self): 43 | """Test mean q score calculation (large identical input).""" 44 | scores = [30] * 5000 45 | self.assertEqual(seq.mean_qscore(scores), 30) 46 | 47 | def test_mean_qscore(self): 48 | """Test mean q score calculation.""" 49 | scores = [14, 10] 50 | self.assertEqual(seq.mean_qscore(scores), 12) 51 | 52 | def test_alignment_stats(self): 53 | """Test calculation of alignment statistics.""" 54 | seq1 = "ATGCTG-AAAAA" 55 | seq2 = "TTG-TGCAAAAA" 56 | self.assertEqual( 57 | tuple(seq.alignment_stats(seq1, seq2)), (12, 1, 1, 1, 0.75)) 58 | -------------------------------------------------------------------------------- /wub/tests/test_wrappers_dnadiff.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import tempfile 4 | import os 5 | 6 | from wub.wrappers import dnadiff 7 | from wub.util import cmd as cmd_util 8 | from wub.simulate import seq as sim_seq 9 | 10 | error_rate = 0.1 11 | ref_length = 5000 12 | 13 | 14 | class TestWrappersDnadiff(unittest.TestCase): 15 | 16 | """Test dnadiff wrapper.""" 17 | 18 | def _generate_test_data(self): 19 | """Generate test data for dnadiff test.""" 20 | fh_ref = tempfile.NamedTemporaryFile(suffix=".fas", delete=False, mode='w') 21 | self.ref_fasta = fh_ref.name 22 | fh_target = tempfile.NamedTemporaryFile(suffix=".fas", delete=False, mode='w') 23 | self.target_fasta = fh_target.name 24 | 25 | self.ref = sim_seq.simulate_sequence(ref_length) 26 | nr_errors = int(len(self.ref) * error_rate) 27 | self.target = sim_seq.add_errors(self.ref, nr_errors, 'substitution') 28 | 29 | fh_ref.write(">ref\n{}\n".format(self.ref)) 30 | fh_ref.flush() 31 | fh_ref.close() 32 | 33 | 
fh_target.write(">target\n{}\n".format(self.target)) 34 | fh_target.flush() 35 | fh_target.close() 36 | 37 | def _cleanup_test_data(self): 38 | """Cleanup test dataset.""" 39 | os.unlink(self.ref_fasta) 40 | os.unlink(self.target_fasta) 41 | 42 | @unittest.skipIf(not cmd_util.find_executable('dnadiff'), 43 | "Dnadiff binary not found, skipping integration tests.") 44 | def test_dnadiff(self): 45 | """Test dnadiff wrapper.""" 46 | self._generate_test_data() 47 | res, _, _ = dnadiff.dnadiff(self.ref_fasta, self.target_fasta) 48 | self.assertAlmostEqual( 49 | res['Alignments']['1-to-1']['AvgIdentity'].ref, 90.0, places=0) 50 | self._cleanup_test_data() 51 | -------------------------------------------------------------------------------- /wub/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/util/__init__.py -------------------------------------------------------------------------------- /wub/util/cmd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Utilities related to running external commands.""" 3 | 4 | from distutils.spawn import find_executable as exefind_distutils 5 | import sys 6 | 7 | 8 | def find_executable(command): 9 | """Find the executable in the path corresponding to a command. 10 | 11 | :param command: Command. 12 | :returns: Path to the executable, or None if it is not found. 13 | :rtype: str 14 | """ 15 | # In the future we might want to eliminate the dependency on 16 | # distutils. 17 | return exefind_distutils(command) 18 | 19 | 20 | def ensure_executable(command): 21 | """Find the executable in the path corresponding to a command and abort if not found. 22 | 23 | :param command: Command. 24 | :returns: None 25 | :rtype: object 26 | """ 27 | # In the future we might want to eliminate the dependency on 28 | # distutils. 29 | if not find_executable(command): 30 | sys.stderr.write( 31 | "Required command \"{}\" not found in path! Aborting!\n".format(command)) 32 | sys.exit(127) 33 | -------------------------------------------------------------------------------- /wub/util/misc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Yet uncategorised utility functions.""" 3 | 4 | import pickle 5 | import os.path 6 | 7 | 8 | def get_fname(fname): 9 | """ Get the file name without the extension. 10 | 11 | :param fname: File name. 12 | :return: File name without extension. 13 | :rtype: str 14 | 15 | """ 16 | return os.path.splitext(os.path.basename(fname))[0] 17 | 18 | 19 | def get_extension(fname): 20 | """ Get the file extension. 21 | 22 | :param fname: File name. 23 | :return: File extension. 24 | :rtype: str format '.*' 25 | 26 | """ 27 | return os.path.splitext(os.path.basename(fname))[1] 28 | 29 | 30 | def _getextension(fast): 31 | """ Find and check the file extension. If the extension is not recognised, an exception is raised.
32 | 33 | :param fast: FASTQ or FASTA file. 34 | :return: "fastq" or "fasta" 35 | :rtype: str 36 | 37 | """ 38 | 39 | extension = get_extension(fast) 40 | if extension in ('.fa', '.fasta'): 41 | extension = "fasta" 42 | elif extension in ('.fq', '.fastq'): 43 | extension = "fastq" 44 | else: 45 | raise Exception('Incorrect file format') 46 | exit() 47 | # print >> sys.stderr, "Incorrect file format" 48 | return extension 49 | 50 | 51 | def mkdir(path): 52 | """ Create the directory if it does not exist. 53 | 54 | :param path: Directory path. 55 | :return: The path. 56 | :rtype: str 57 | 58 | """ 59 | if not os.path.exists(path): 60 | os.makedirs(path) 61 | return path 62 | 63 | 64 | def pickle_load(fname): 65 | """ Load object from pickle. 66 | 67 | :param fname: Input pickle file name. 68 | :returns: Object loaded from pickle file. 69 | :rtype: object 70 | 71 | """ 72 | fh = open(fname, 'rb') 73 | data = pickle.load(fh) 74 | fh.close() 75 | return data 76 | 77 | 78 | def pickle_dump(obj, fname): 79 | """Pickle object to file. 80 | 81 | :param obj: Object to be pickled. 82 | :param fname: Output file name. 83 | :returns: The name of the output file. 84 | :rtype: str 85 | 86 | """ 87 | fh = open(fname, 'wb') 88 | pickle.dump(obj, fh) 89 | fh.flush() 90 | fh.close() 91 | return fname 92 | -------------------------------------------------------------------------------- /wub/util/parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import six 5 | from collections import OrderedDict 6 | 7 | """Utilities to parse strings into various data structures.""" 8 | 9 | 10 | def separated_list_to_floats(separated_list, separator=","): 11 | """ Convert a separated list into a list of floats. 12 | 13 | :param separated_list: A separated list as a string. 14 | :param separator: List separator. 15 | :returns: List of floats. 16 | :rtype: list 17 | """ 18 | return [float(element) for element in separated_list.split(separator)] 19 | 20 | 21 | def args_string_to_dict(args_string, elements_separator=",", keyvalue_separator=":"): 22 | """ Convert a two-level separated list into a dictionary. 23 | 24 | :param args_string: Two-level separated string. 25 | :param elements_separator: Separator between elements. 26 | :param keyvalue_separator: Separator between key/value pairs. 27 | :returns: dict 28 | :rtype: dict 29 | """ 30 | if len(args_string) == 0: 31 | return {} 32 | pairs = [pair.strip() for pair in args_string.split(elements_separator)] 33 | elements = OrderedDict(pair.split(keyvalue_separator) for pair in pairs) 34 | parsed = OrderedDict((k.strip(), v.strip()) for k, v in six.iteritems(elements)) 35 | return parsed 36 | 37 | 38 | def interval_string_to_tuples(interval_string, elements_separator="|", interval_separator=","): 39 | """ Convert a two-level separated string into a tuple of integer intervals. 40 | 41 | :param interval_string: Two-level separated string. 42 | :param elements_separator: Separator between elements. 43 | :param interval_separator: Separator between interval boundaries. 44 | :returns: Tuple of (start, end) integer pairs. 45 | :rtype: tuple 46 | """ 47 | if len(interval_string) == 0: 48 | return tuple() 49 | pairs = [pair.strip() for pair in interval_string.split(elements_separator)] 50 | elements = OrderedDict(pair.split(interval_separator) for pair in pairs) 51 | parsed = tuple((int(k.strip()), int(v.strip())) for k, v in six.iteritems(elements)) 52 | return parsed 53 | 54 | 55 | def normalise_array(array): 56 | """ Normalise numpy array so the elements sum to 1.0.
57 | 58 | :param array: Input array. 59 | :returns: Normalised array. 60 | :rtype: numpy.array 61 | """ 62 | temporary_array = array.astype(float) 63 | return temporary_array / np.sum(temporary_array) 64 | -------------------------------------------------------------------------------- /wub/vis/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'ONT Applications Group' 4 | __email__ = 'Apps@nanoporetech.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /wub/wrappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/wrappers/__init__.py -------------------------------------------------------------------------------- /wub/wrappers/dnadiff.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ Wrapper for mummer's dnadiff """ 3 | 4 | import six 5 | import os 6 | import re 7 | from collections import defaultdict 8 | from collections import namedtuple 9 | import subprocess 10 | import tempfile 11 | from subprocess import STDOUT 12 | 13 | dnadiff_extensions = ( 14 | '.1coords', '.1delta', '.delta', '.mcoords', '.mdelta', '.qdiff', '.rdiff', '.report', '.snps', '.unref', '.unqry') 15 | 16 | Property = namedtuple('Property', 'ref query') 17 | PropertyWithPerc = namedtuple('PropertyWithPerc', 'ref ref_perc query query_perc') 18 | 19 | 20 | def dnadiff(reference, query, working_directory=None, cleanup=True): 21 | """Run dnadiff on reference and query fasta and parse results. 22 | 23 | :param reference: Reference fasta. 24 | :param query: Query fasta. 25 | :param working_directory: Write output in this directory if specified. 26 | :param cleanup: Delete dnadiff output after parsing if True. 27 | :returns: Parsed results, raw report and log. 28 | :rtype: 3-tuple 29 | """ 30 | reference = os.path.abspath(reference) 31 | query = os.path.abspath(query) 32 | work_dir = working_directory 33 | 34 | if not os.path.exists(reference): 35 | raise Exception("Reference fasta {} does not exists!".format(reference)) 36 | if not os.path.exists(query): 37 | raise Exception("Target fasta {} does not exists!".format(query)) 38 | if work_dir is not None and not os.path.exists(work_dir): 39 | raise Exception("Working directory {} does not exists!".format(work_dir)) 40 | 41 | if work_dir is None: 42 | work_dir = tempfile.mkdtemp(prefix='dnadiff_') 43 | 44 | old_dir = os.getcwd() 45 | os.chdir(work_dir) 46 | 47 | command = ['dnadiff', reference, query] 48 | try: 49 | log = subprocess.check_output(command, stderr=STDOUT) 50 | finally: 51 | os.chdir(old_dir) 52 | 53 | report_file = os.path.join(work_dir, 'out.report') 54 | output = open(report_file, 'r').read() 55 | 56 | results = parse_dnadiff_report(report_file) 57 | 58 | if cleanup: 59 | cleanup_dnadiff_report(work_dir) 60 | if working_directory is None: 61 | os.rmdir(work_dir) 62 | 63 | return results, output, log 64 | 65 | 66 | def cleanup_dnadiff_report(directory, prefix='out'): 67 | """Cleanup dnadiff output files in the specified directory. 68 | 69 | :param directory: Output directory. 70 | :param prefix: Output prefix. 
71 | :returns: None 72 | :rtype: object 73 | """ 74 | for ext in dnadiff_extensions: 75 | name = prefix + ext 76 | path = os.path.join(directory, name) 77 | if os.path.exists(path): 78 | os.unlink(path) 79 | 80 | 81 | def _parse_dnadiff_into_sections(report_file): 82 | """Parse dnadiff output lines into sections.""" 83 | report_fh = open(report_file, 'r') 84 | section = "NO_SECTION" 85 | sections = defaultdict(list) 86 | for line in report_fh: 87 | line = line.strip() 88 | if len(line) == 0: 89 | continue 90 | if line.startswith('/') or line.startswith('NUCMER') or line.startswith('[REF]'): 91 | continue 92 | if line.startswith('['): 93 | section = line 94 | section = section.replace('[', '') 95 | section = section.replace(']', '') 96 | else: 97 | sections[section].append(line) 98 | return sections 99 | 100 | 101 | def _parse_percent_field(field): 102 | """Parse dnadiff field with percent value.""" 103 | tmp = field.split('(') 104 | perc = tmp[1].replace(')', '') 105 | perc = perc.replace('%', '') 106 | return float(tmp[0]), float(perc) 107 | 108 | 109 | def _parse_simple_section(lines): 110 | """Parse a simple dnadiff report section.""" 111 | results = {} 112 | for line in lines: 113 | tmp = re.split("\s+", line) 114 | if '%' not in tmp[1] and '%' not in tmp[2]: 115 | results[tmp[0]] = Property(float(tmp[1]), float(tmp[2])) 116 | else: 117 | ref_prop, ref_prop_perc = _parse_percent_field(tmp[1]) 118 | query_prop, query_prop_perc = _parse_percent_field(tmp[2]) 119 | results[tmp[0]] = PropertyWithPerc(ref_prop, ref_prop_perc, query_prop, query_prop_perc) 120 | return results 121 | 122 | 123 | def _parse_complex_section(lines): 124 | """Parse a complex dnadiff report section.""" 125 | section = "NO_SECTION" 126 | sections = defaultdict(list) 127 | results = defaultdict(dict) 128 | # Parse alignment section into subsections: 129 | for line in lines: 130 | if len(line) == 0: 131 | continue 132 | # FIXME: Very specific to current dnadiff output: 133 | if line.startswith('1-to-1') or line.startswith('M-to-M') or re.match("Total(S|G|I)", line): 134 | tmp = re.split("\s+", line) 135 | section = tmp[0] 136 | results[section]['Number'] = Property(float(tmp[1]), float(tmp[2])) 137 | else: 138 | sections[section].append(line) 139 | 140 | # Parse subsections and update results dictionary: 141 | for section, lines in six.iteritems(sections): 142 | parsed = _parse_simple_section(lines) 143 | for name, prop in six.iteritems(parsed): 144 | results[section][name] = prop 145 | return results 146 | 147 | 148 | def parse_dnadiff_report(report_file): 149 | """Parse dnadiff report file. 150 | 151 | :param report_file: dnadiff report output. 152 | :returns: Data structure with parsed results. 153 | :rtype: dict 154 | """ 155 | sections = _parse_dnadiff_into_sections(report_file) 156 | 157 | results_sequences = _parse_simple_section(sections['Sequences']) 158 | results_bases = _parse_simple_section(sections['Bases']) 159 | results_features = _parse_simple_section(sections['Feature Estimates']) 160 | results_alignments = _parse_complex_section(sections['Alignments']) 161 | results_snps = _parse_complex_section(sections['SNPs']) 162 | 163 | results = { 164 | 'Sequences': results_sequences, 165 | 'Bases': results_bases, 166 | 'Features': results_features, 167 | 'Alignments': results_alignments, 168 | 'SNPs': results_snps, 169 | } 170 | return results 171 | --------------------------------------------------------------------------------
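A minimal usage sketch for the dnadiff wrapper above, mirroring the call pattern in wub/tests/test_wrappers_dnadiff.py; it assumes mummer's dnadiff binary is on the PATH, and reference.fas / assembly.fas are placeholder input files rather than files shipped with the package.

from wub.util import cmd as cmd_util
from wub.wrappers import dnadiff

# Abort with a message if mummer's dnadiff is not installed:
cmd_util.ensure_executable('dnadiff')

# Run dnadiff on a reference and an assembly (placeholder file names) and
# parse the out.report file; returns the parsed results, the raw report text
# and the captured log:
results, report, log = dnadiff.dnadiff('reference.fas', 'assembly.fas')

# Parsed report entries are Property/PropertyWithPerc namedtuples, so individual
# values are reached through their ref and query fields:
avg_identity = results['Alignments']['1-to-1']['AvgIdentity']
print("1-to-1 average identity (ref/query): {}/{}".format(avg_identity.ref, avg_identity.query))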