├── .circleci └── config.yml ├── .gitignore ├── .gitlab-ci.yml ├── COPYRIGHT ├── LICENSE.md ├── MANIFEST.in ├── Makefile ├── ONT_logo.png ├── README.md ├── docs ├── .gitignore ├── Makefile ├── _static │ └── .gitignore ├── cmd_tools.py ├── conf.py ├── index.rst ├── make.bat ├── requirements.txt └── usage.rst ├── pytest.ini ├── scripts ├── __init__.py ├── _template_script.py ├── add_errors.py ├── annotate_length.py ├── bam_accuracy.py ├── bam_alignment_length.py ├── bam_alignment_qc.py ├── bam_alignments_compare.py ├── bam_count_reads.py ├── bam_cov.py ├── bam_fill_unaligned.py ├── bam_frag_coverage.py ├── bam_gc_vs_qual.py ├── bam_multi_qc.py ├── bam_ref_base_coverage.py ├── bam_ref_tab.py ├── bam_score_filter.py ├── bam_soft_clips_tab.py ├── bias_explorer.py ├── calculate_coverage.py ├── compare_genomes_dnadiff.py ├── compare_genomes_lastal.py ├── convert_alphabet.py ├── correlate_counts.py ├── fasta_to_mock_fastq.py ├── fastq_qual_tab.py ├── fastq_time_slice.py ├── fastq_time_tab.py ├── fastx_ends_tab.py ├── fastx_grep.py ├── fastx_length_tab.py ├── length_normalise_counts.py ├── merge_tsvs.py ├── multi_length_hist.py ├── pickle_cat.py ├── plot_counts_correlation.py ├── plot_gffcmp_stats.py ├── plot_qualities.py ├── plot_sequence_properties.py ├── reads_across_time.py ├── reads_stats.py ├── reverse_fastq.py ├── sequence_filter.py ├── sequence_subtract.py ├── simulate_errors.py ├── simulate_genome.py ├── simulate_sequences.py ├── simulate_sequencing_simple.py └── split_fastx.py ├── setup.cfg ├── setup.py └── wub ├── __init__.py ├── bam ├── __init__.py ├── common.py ├── compare.py ├── filter.py ├── read_counter.py ├── sam_writer.py └── stats.py ├── mappers ├── __init__.py └── lastal.py ├── parsers ├── __init__.py ├── blastn.py └── mummer.py ├── read_stats ├── __init__.py └── contig_stats.py ├── simulate ├── __init__.py ├── dist.py ├── genome.py └── seq.py ├── tests ├── __init__.py ├── data │ ├── test_bam_stats │ │ ├── stat_ref.fas │ │ ├── stat_test.bam │ │ └── stat_test.bam.bai │ ├── test_blastn_parse │ │ └── blastn_test.coords │ └── test_nucmer_parse │ │ └── nucmer_test.coords ├── test_bam_compare.py ├── test_bam_read_counter.py ├── test_bam_stats.py ├── test_blastn_coord_parse.py ├── test_contig_stats.py ├── test_example.py ├── test_mappers_lastal.py ├── test_nucmer_coord_parse.py ├── test_simulate_genome.py ├── test_simulate_seq.py ├── test_util_parse.py ├── test_util_seq.py └── test_wrappers_dnadiff.py ├── util ├── __init__.py ├── cmd.py ├── misc.py ├── parse.py └── seq.py ├── vis ├── __init__.py └── report.py └── wrappers ├── __init__.py └── dnadiff.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # This configuration was automatically generated from a CircleCI 1.0 config. 2 | # It should include any build commands you had along with commands that CircleCI 3 | # inferred from your project structure. We strongly recommend you read all the 4 | # comments in this file to understand the structure of CircleCI 2.0, as the idiom 5 | # for configuration has changed substantially in 2.0 to allow arbitrary jobs rather 6 | # than the prescribed lifecycle of 1.0. In general, we recommend using this generated 7 | # configuration as a reference rather than using it in production, though in most 8 | # cases it should duplicate the execution of your original 1.0 config. 
9 | version: 2 10 | jobs: 11 | build: 12 | working_directory: ~/nanoporetech/wub 13 | parallelism: 1 14 | shell: /bin/bash --login 15 | # CircleCI 2.0 does not support environment variables that refer to each other the same way as 1.0 did. 16 | # If any of these refer to each other, rewrite them so that they don't or see https://circleci.com/docs/2.0/env-vars/#interpolating-environment-variables-to-set-other-environment-variables . 17 | environment: 18 | CIRCLE_ARTIFACTS: /tmp/circleci-artifacts 19 | CIRCLE_TEST_REPORTS: /tmp/circleci-test-results 20 | # In CircleCI 1.0 we used a pre-configured image with a large number of languages and other packages. 21 | # In CircleCI 2.0 you can now specify your own image, or use one of our pre-configured images. 22 | # The following configuration line tells CircleCI to use the specified docker image as the runtime environment for you job. 23 | # We have selected a pre-built image that mirrors the build environment we use on 24 | # the 1.0 platform, but we recommend you choose an image more tailored to the needs 25 | # of each job. For more information on choosing an image (or alternatively using a 26 | # VM instead of a container) see https://circleci.com/docs/2.0/executor-types/ 27 | # To see the list of pre-built images that CircleCI provides for most common languages see 28 | # https://circleci.com/docs/2.0/circleci-images/ 29 | docker: 30 | - image: circleci/python:3.6.1 31 | command: /sbin/init 32 | steps: 33 | # Machine Setup 34 | # If you break your build into multiple jobs with workflows, you will probably want to do the parts of this that are relevant in each 35 | # The following `checkout` command checks out your code to your working directory. In 1.0 we did this implicitly. In 2.0 you can choose where in the course of a job your code should be checked out. 36 | - checkout 37 | # Prepare for artifact and test results collection equivalent to how it was done on 1.0. 38 | # In many cases you can simplify this from what is generated here. 39 | # 'See docs on artifact collection here https://circleci.com/docs/2.0/artifacts/' 40 | - run: mkdir -p $CIRCLE_ARTIFACTS $CIRCLE_TEST_REPORTS 41 | # Dependencies 42 | # This would typically go in either a build or a build-and-test job when using workflows 43 | # Restore the dependency cache 44 | - restore_cache: 45 | keys: 46 | # This branch if available 47 | - v1-dep-{{ .Branch }}- 48 | # Default branch if not 49 | - v1-dep-master- 50 | # Any branch if there are none on the default branch - this should be unnecessary if you have your default branch configured correctly 51 | - v1-dep- 52 | # The following line was run implicitly in your 1.0 builds based on what CircleCI inferred about the structure of your project. In 2.0 you need to be explicit about which commands should be run. In some cases you can discard inferred commands if they are not relevant to your project. 
53 | # You can remove the conditional and simply install the requirements file you use 54 | #- run: #if [ -e requirements.txt ]; then pip install -r requirements.txt; else pip install -r requirements.pip; fi 55 | - run: sudo apt-get update 56 | - run: sudo apt-get install python3 python3-pip 57 | - run: sudo pip install -e ./ 58 | # Save dependency cache 59 | - save_cache: 60 | key: v1-dep-{{ .Branch }}-{{ epoch }} 61 | paths: 62 | # This is a broad list of cache paths to include many possible development environments 63 | # You can probably delete some of these entries 64 | - vendor/bundle 65 | - ~/virtualenvs 66 | - ~/.m2 67 | - ~/.ivy2 68 | - ~/.bundle 69 | - ~/.go_workspace 70 | - ~/.gradle 71 | - ~/.cache/bower 72 | # Test 73 | # This would typically be a build job when using workflows, possibly combined with build 74 | # The following line was run implicitly in your 1.0 builds based on what CircleCI inferred about the structure of your project. In 2.0 you need to be explicit about which commands should be run. In some cases you can discard inferred commands if they are not relevant to your project. 75 | # Python test inference is not supported by the config translator. 76 | # Put the commands that you use to run your tests here. 77 | # If run your tests in parallel containers you can use this third party nose plugin 78 | # https://github.com/dlanger/nose-parallel 79 | # If you use django you can switch to the nose django runner to make use of this. 80 | - run: sudo make test 81 | # Teardown 82 | # If you break your build into multiple jobs with workflows, you will probably want to do the parts of this that are relevant in each 83 | # Save test results 84 | - store_test_results: 85 | path: /tmp/circleci-test-results 86 | # Save artifacts 87 | - store_artifacts: 88 | path: /tmp/circleci-artifacts 89 | - store_artifacts: 90 | path: /tmp/circleci-test-results 91 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #misc 62 | .env/ 63 | .idea/ 64 | ._.DS_Store 65 | *.DS_Store 66 | *.pyc -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: ubuntu:xenial 2 | 3 | stages: 4 | - test 5 | - pages 6 | - release 7 | 8 | before_script: 9 | - apt-get update 10 | - apt-get install -y software-properties-common 11 | - apt-add-repository universe 12 | - apt-get update 13 | - apt-get install -y python3-pip make python3-numpy python3-matplotlib python3-biopython python3-pandas mummer last-align cython zlib1g-dev lbzip2 libbz2-dev liblzma-dev libhdf5-serial-dev 14 | - pip3 install --upgrade sphinx sphinx-argparse sphinx_rtd_theme pytest pycmd futures packaging appdirs pysam 15 | - pip3 install -e ./ 16 | 17 | 18 | do_testing: 19 | stage: test 20 | script: 21 | - alias python=python3;make test 22 | except: 23 | - tags 24 | 25 | pages: 26 | stage: pages 27 | script: 28 | - make docs 29 | - mv docs/_build/html public 30 | artifacts: 31 | paths: 32 | - public/ 33 | only: 34 | - master 35 | except: 36 | - tags 37 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | 2 | (c) 2016 Oxford Nanopore Technologies Ltd. 3 | 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | include README.md 3 | include LICENSE 4 | 5 | recursive-exclude * __pycache__ 6 | recursive-exclude * *.py[co] 7 | 8 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | MODULE=wub 2 | 3 | .PHONY: clean clean-test clean-pyc clean-build docs com help 4 | 5 | .DEFAULT_GOAL := help 6 | 7 | define PRINT_HELP_PYSCRIPT 8 | import re, sys 9 | 10 | for line in sys.stdin: 11 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 12 | if match: 13 | target, help = match.groups() 14 | print("%-20s %s" % (target, help)) 15 | endef 16 | export PRINT_HELP_PYSCRIPT 17 | 18 | help: 19 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 20 | 21 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 22 | 23 | 24 | clean-build: ## remove build artifacts 25 | rm -fr build/ 26 | rm -fr dist/ 27 | rm -fr .eggs/ 28 | find . -name '*.egg-info' -exec rm -fr {} + 29 | find . -name '*.egg' -exec rm -f {} + 30 | 31 | clean-pyc: ## remove Python file artifacts 32 | find . -name '*.pyc' -exec rm -f {} + 33 | find . -name '*.pyo' -exec rm -f {} + 34 | find . -name '*~' -exec rm -f {} + 35 | find . 
-name '__pycache__' -exec rm -fr {} + 36 | 37 | clean-test: ## remove test and coverage artifacts 38 | rm -f .coverage 39 | rm -fr htmlcov/ 40 | 41 | lint: ## check style with flake8 42 | @(flake8 --max-line-length=120 $(MODULE) | grep -v "E501 line too long") || true 43 | @(flake8 --max-line-length=120 scripts/*.py | grep -v "E501 line too long") || true 44 | 45 | test: ## run tests quickly with the default Python 46 | py.test -s 47 | 48 | coverage: ## check code coverage quickly with the default Python 49 | coverage run --source $(MODULE) --omit="*/tests/*,*__init__.py" `which py.test` 50 | coverage report -m --omit="*/tests/*,*__init__.py" 51 | coverage html 52 | 53 | docs: ## generate Sphinx HTML documentation, including API docs 54 | @cd docs; make clean html 55 | 56 | servedocs: docs ## compile the docs watching for changes 57 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . 58 | 59 | release: clean ## package and upload a release 60 | python setup.py sdist upload 61 | python setup.py bdist_wheel upload 62 | 63 | dist: clean ## builds source and wheel package 64 | python setup.py sdist 65 | python setup.py bdist_wheel 66 | ls -l dist 67 | 68 | install: clean ## install the package to the active Python's site-packages 69 | python setup.py install 70 | 71 | com: ## commit all changes to git 72 | git commit -a 73 | -------------------------------------------------------------------------------- /ONT_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/ONT_logo.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![ONT_logo](/ONT_logo.png) 2 | ----------------------------- 3 | 4 | This repository is now unsupported and we do not recommend its use. Please contact Oxford Nanopore: support@nanoporetech.com for help with your application. 5 | 6 | Wub 7 | ================================================================== 8 | 9 | [![CircleCI](https://circleci.com/gh/nanoporetech/wub.svg?style=svg)](https://circleci.com/gh/nanoporetech/wub) [![Documentation Status](https://readthedocs.org/projects/wub/badge/?version=latest)](http://wub.readthedocs.io/en/latest/?badge=latest) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square)](http://bioconda.github.io/recipes/wub/README.html) 10 | 11 | Tools and software library developed by the Oxford Nanopore Technologies Applications group. 12 | 13 | ## Features: 14 | 15 | - Simple sequence and error simulation tools. 16 | - Tools to visualise basic sequence properties. 17 | - Fastq and fasta utilities. 18 | - Tools to calculate read and genome assembly accuracy. 19 | - Transcriptome alignment QC tools. 20 | - Read counting and related utilities. 21 | - BAM utilities. 22 | - Miscellaneous utilities.
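As a brief illustration of the library side of the package, the sketch below chains together the same calls that `scripts/add_errors.py` (shown further down in this listing) uses to read records, inject errors and write them back out. It is only a sketch: the input and output file names are placeholders, and the exact signatures should be checked against `wub/util/seq.py` and `wub/simulate/seq.py`.

```python
# Illustrative sketch only -- mirrors the calls made in scripts/add_errors.py.
# The file names below are placeholders, not files shipped with the repository.
from Bio.Seq import Seq

from wub.simulate import seq as sim_seq
from wub.util import seq as seq_util


def add_fixed_errors(records, nr_errors=5, error_type='substitution'):
    """Yield each SeqRecord with a fixed number of random errors added."""
    for record in records:
        record.seq = Seq(sim_seq.add_errors(record.seq, nr_errors, error_type))
        yield record


with open('reads.fasta') as in_fh, open('reads_with_errors.fasta', 'w') as out_fh:
    records = seq_util.read_seq_records(in_fh, format='fasta')
    seq_util.write_seq_records(add_fixed_errors(records), out_fh, format='fasta')
```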
23 | 24 | Getting Started 25 | =============== 26 | 27 | ## Installation 28 | 29 | Set up a virtual environment: 30 | 31 | ``` 32 | virtualenv --system-site-packages wub_env 33 | source wub_env/bin/activate 34 | pip install --upgrade pip 35 | pip install requests[security] 36 | ``` 37 | 38 | Then install the package via pip: 39 | 40 | ``` 41 | pip install git+https://github.com/nanoporetech/wub.git 42 | ``` 43 | 44 | If you installed the package in a virtual environment then do not forget to 45 | activate it before using the package: 46 | 47 | ``` 48 | source wub_env/bin/activate 49 | ``` 50 | 51 | Run the following to leave the virtual environment: 52 | 53 | ``` 54 | deactivate 55 | ``` 56 | 57 | You can also clone the repository and install using `setup.py`: 58 | 59 | ``` 60 | git clone https://github.com/nanoporetech/wub.git 61 | cd wub 62 | python setup.py install 63 | ``` 64 | 65 | Install the package in developer mode: 66 | 67 | ``` 68 | python setup.py develop 69 | ``` 70 | 71 | Run the tests: 72 | 73 | ``` 74 | make test 75 | ``` 76 | 77 | Build the documentation: 78 | 79 | ``` 80 | make docs 81 | ``` 82 | 83 | Issue `make help` to get a list of `make` targets. 84 | 85 | Documentation 86 | ----------------- 87 | 88 | Online documentation is available at [wub.readthedocs.io](http://wub.readthedocs.io/en/latest/). 89 | 90 | Contributing 91 | ---------------- 92 | 93 | - Please fork the repository and create a merge request to contribute. 94 | - Please respect the structure outlined in `scripts/_template_script.py` for command line tools so documentation can be generated automatically. 95 | - All non-trivial functions should have at least one test (with the exception of plotting functions). 96 | - Use your best judgement when deciding whether to put a piece of code in a script or make it more reusable by incorporating it into the module. 97 | - Use [bumpversion](http://bit.ly/2cSUryt) to manage package versioning. 98 | - The code should be [PEP8](https://www.python.org/dev/peps/pep-0008) compliant, which can be tested by `make lint`. 99 | 100 | Help 101 | ==== 102 | 103 | ## Licence and Copyright 104 | 105 | (c) 2016 Oxford Nanopore Technologies Ltd. 106 | 107 | This Source Code Form is subject to the terms of the Mozilla Public 108 | License, v. 2.0. If a copy of the MPL was not distributed with this 109 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 110 | 111 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | *.rst 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | MODULE = wub 10 | 11 | # User-friendly check for sphinx-build 12 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 13 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 14 | endif 15 | 16 | # Internal variables.
17 | PAPEROPT_a4 = -D latex_paper_size=a4 18 | PAPEROPT_letter = -D latex_paper_size=letter 19 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 20 | # the i18n builder cannot share the environment and doctrees with the others 21 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 22 | 23 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 24 | 25 | help: 26 | @echo "Please use \`make ' where is one of" 27 | @echo " html to make standalone HTML files" 28 | @echo " dirhtml to make HTML files named index.html in directories" 29 | @echo " singlehtml to make a single large HTML file" 30 | @echo " pickle to make pickle files" 31 | @echo " json to make JSON files" 32 | @echo " htmlhelp to make HTML files and a HTML help project" 33 | @echo " qthelp to make HTML files and a qthelp project" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | rm -f $(MODULE).rst 54 | rm -f modules.rst 55 | 56 | html: 57 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 58 | @echo 59 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 60 | 61 | dirhtml: 62 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 63 | @echo 64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 65 | 66 | singlehtml: 67 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 68 | @echo 69 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 70 | 71 | pickle: 72 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 73 | @echo 74 | @echo "Build finished; now you can process the pickle files." 75 | 76 | json: 77 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 78 | @echo 79 | @echo "Build finished; now you can process the JSON files." 80 | 81 | htmlhelp: 82 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 83 | @echo 84 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 85 | ".hhp project file in $(BUILDDIR)/htmlhelp." 86 | 87 | qthelp: 88 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 89 | @echo 90 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 91 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 92 | @echo "# 93 | @echo "To view the help file:" 94 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/wub.qhc" 95 | 96 | devhelp: 97 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 98 | @echo 99 | @echo "Build finished." 
100 | @echo "To view the help file:" 101 | @echo "# mkdir -p $$HOME/.local/share/devhelp/wub" 102 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/wub" 103 | @echo "# devhelp" 104 | 105 | epub: 106 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 107 | @echo 108 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 109 | 110 | latex: 111 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 112 | @echo 113 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 114 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 115 | "(use \`make latexpdf' here to do that automatically)." 116 | 117 | latexpdf: 118 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 119 | @echo "Running LaTeX files through pdflatex..." 120 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 121 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 122 | 123 | latexpdfja: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through platex and dvipdfmx..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | text: 130 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 131 | @echo 132 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 133 | 134 | man: 135 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 136 | @echo 137 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 138 | 139 | texinfo: 140 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 141 | @echo 142 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 143 | @echo "Run \`make' in that directory to run these through makeinfo" \ 144 | "(use \`make info' here to do that automatically)." 145 | 146 | info: 147 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 148 | @echo "Running Texinfo files through makeinfo..." 149 | make -C $(BUILDDIR)/texinfo info 150 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 151 | 152 | gettext: 153 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 154 | @echo 155 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 156 | 157 | changes: 158 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 159 | @echo 160 | @echo "The overview file is in $(BUILDDIR)/changes." 161 | 162 | linkcheck: 163 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 164 | @echo 165 | @echo "Link check complete; look for any errors in the above output " \ 166 | "or in $(BUILDDIR)/linkcheck/output.txt." 167 | 168 | doctest: 169 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 170 | @echo "Testing of doctests in the sources finished, look at the " \ 171 | "results in $(BUILDDIR)/doctest/output.txt." 172 | 173 | xml: 174 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 175 | @echo 176 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 177 | 178 | pseudoxml: 179 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 180 | @echo 181 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
182 | -------------------------------------------------------------------------------- /docs/_static/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/docs/_static/.gitignore -------------------------------------------------------------------------------- /docs/cmd_tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Helper script to generate cmd_line.rst file for all scripts in bin which 4 | have and parser object defined in their global scope - taken from tang 5 | 6 | """ 7 | from __future__ import print_function 8 | import sys 9 | import os 10 | import imp 11 | 12 | scripts_rel = 'scripts' 13 | attr_name = 'parser' 14 | blacklist = ['__init__.py'] 15 | 16 | location = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..') 17 | scripts_abs = os.path.join(location, scripts_rel) 18 | scripts = sorted(filter(lambda s: s[0] != '.' and s not in blacklist, os.listdir(scripts_abs))) 19 | 20 | sys.stderr.write("Found following scripts:\n{}\n{}\n{}\n".format( 21 | location, scripts_abs, scripts 22 | )) 23 | 24 | 25 | print (""" 26 | .. _command_line_tools: 27 | 28 | Command line tools 29 | ================== 30 | """) 31 | 32 | for script in scripts: 33 | script_name, script_ext = os.path.splitext(script) 34 | if script_ext == '.pyc': 35 | continue 36 | 37 | try: 38 | mod_name = '{}.{}'.format(scripts_rel, script_name) 39 | # mod = __import__(mod_name, globals(), locals(), [attr_name]) 40 | mod = imp.load_source(script_name, os.path.join(scripts_abs, script)) 41 | script = script.replace('.py', '') 42 | 43 | print ('.. _{}:\n\n{}\n{}'.format(script, script, '-' * len(script))) 44 | if hasattr(mod, attr_name): 45 | print (""" 46 | .. argparse:: 47 | :ref: {}.{} 48 | :prog: {} 49 | """.format(mod_name, attr_name, script_name)) 50 | else: 51 | print ('No documentation available') 52 | 53 | except Exception as e: 54 | # Wha' yer' gonna do? 55 | sys.stderr.write('Error making docs for {}:\n{}\n'.format(script_name, e)) 56 | pass 57 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | Welcome to the documentation of the wub package! 3 | ================================================ 4 | 5 | 6 | Command line tools 7 | ------------------ 8 | 9 | .. toctree:: 10 | 11 | cmd_list 12 | 13 | 14 | Full API reference 15 | ------------------ 16 | 17 | .. toctree:: 18 | :maxdepth: 4 19 | 20 | usage 21 | modules 22 | 23 | 24 | Indices and tables 25 | ================== 26 | 27 | * :ref:`genindex` 28 | * :ref:`modindex` 29 | * :ref:`search` 30 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 
116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\wub.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\wub.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 
231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mock 2 | sphinx-argparse 3 | pysam 4 | tqdm 5 | biopython 6 | numpy 7 | pandas 8 | editdistance 9 | matplotlib 10 | h5py 11 | futures 12 | git+https://github.com/nanoporetech/wub.git 13 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | Usage 3 | ===== 4 | 5 | To use wub package in a project:: 6 | 7 | import wub 8 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = wub 3 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /scripts/_template_script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | # Parse command line arguments: 7 | parser = argparse.ArgumentParser( 8 | description='Template script.') 9 | parser.add_argument( 10 | '-i', metavar='input', type=str, help="Input.") 11 | 12 | 13 | if __name__ == '__main__': 14 | args = parser.parse_args() 15 | -------------------------------------------------------------------------------- /scripts/add_errors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from Bio.Seq import Seq 8 | 9 | from wub.simulate import seq as sim_seq 10 | from wub.util import seq as seq_util 11 | 12 | # Parse command line arguments: 13 | parser = argparse.ArgumentParser( 14 | description="""Add a specified number of errors to random sites for each input sequence.""") 15 | parser.add_argument('-n', metavar='nr_errors', type=int, 16 | help="Number of errors to introduce (0).", default=0) 17 | parser.add_argument('-t', metavar='error_type', type=str, 18 | help="Error type: substitution, insertion or deletion.", choices=['substitution', 'insertion', 'deletion'], default='substitution') 19 | parser.add_argument('input_fasta', nargs='?', help='Input fasta (default: stdin).', 20 | type=argparse.FileType('r'), default=sys.stdin) 21 | parser.add_argument('output_fasta', nargs='?', help='Output fasta (default: stdout)', 22 | type=argparse.FileType('w'), default=sys.stdout) 23 | 24 | 25 | def add_fixed_errors(input_iter, nr_errors, error_type): 26 | """Simulate sequencing errors for each SeqRecord object in the input iterator. 27 | 28 | :param input_iter: Iterator of SeqRecord objects. 29 | :para nr_errors: Number of errors to introduce. 30 | :param error_type: Error type: substitution, insertion or deletion. 31 | :returns: Generator of SeqRecord objects. 
32 | :rtype: generator 33 | """ 34 | for record in input_iter: 35 | mutated_seq = sim_seq.add_errors(record.seq, nr_errors, error_type) 36 | record.seq = Seq(mutated_seq) 37 | yield record 38 | 39 | 40 | if __name__ == '__main__': 41 | args = parser.parse_args() 42 | 43 | input_iterator = seq_util.read_seq_records( 44 | args.input_fasta, format='fasta') 45 | 46 | simulation_iterator = add_fixed_errors(input_iterator, args.n, args.t) 47 | 48 | seq_util.write_seq_records( 49 | simulation_iterator, args.output_fasta, format='fasta') 50 | -------------------------------------------------------------------------------- /scripts/annotate_length.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Add sequence length to sequence record descriptions.') 12 | parser.add_argument( 13 | '-i', metavar='in_format', type=str, help="Input format (fastq).", default='fastq') 14 | parser.add_argument( 15 | '-o', metavar='out_format', type=str, help="Output format (fastq).", default='fastq') 16 | parser.add_argument('input_fastx', nargs='?', help='Input file (default: stdin).', 17 | type=argparse.FileType('r'), default=sys.stdin) 18 | parser.add_argument('output_fastx', nargs='?', help='Output file (default: stdout).', 19 | type=argparse.FileType('w'), default=sys.stdout) 20 | 21 | 22 | def _record_annotate_length(input_iter): 23 | """ Add sequence length to record description. 24 | """ 25 | for record in input_iter: 26 | record.description = record.description + " seq_length={}".format(len(record.seq)) 27 | yield record 28 | 29 | 30 | if __name__ == '__main__': 31 | args = parser.parse_args() 32 | 33 | if args.i == 'fasta' and args.o == 'fastq': 34 | sys.stderr.write( 35 | "Cannot produce fastq output from fasta! Use fasta_to_mock_fastq.py instead.\n") 36 | sys.exit(1) 37 | 38 | input_iterator = seq_util.read_seq_records( 39 | args.input_fastx, format=args.i) 40 | 41 | output_iterator = _record_annotate_length(input_iterator) 42 | 43 | seq_util.write_seq_records(output_iterator, args.output_fastx, format=args.o) 44 | -------------------------------------------------------------------------------- /scripts/bam_accuracy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | import os 7 | import sys 8 | import pandas as pd 9 | from collections import OrderedDict 10 | from scipy.stats import gaussian_kde 11 | from scipy.optimize import minimize_scalar 12 | 13 | from wub.util import misc 14 | from wub.vis import report 15 | from wub.bam import stats 16 | 17 | # Parse command line arguments: 18 | parser = argparse.ArgumentParser( 19 | description="""Produce accuracy statistics of the input BAM file. Calculates global accuracy and identity and various per-read statistics. 20 | The input BAM file must be sorted by coordinates and indexed. 
21 | """) 22 | parser.add_argument( 23 | '-c', metavar='region', type=str, help="BAM region (None).", required=False, default=None) 24 | parser.add_argument( 25 | '-g', metavar='global_tsv', type=str, default=None, help="Tab separated file to save global statistics (None).", required=False) 26 | parser.add_argument( 27 | '-l', metavar='read_tsv', type=str, default=None, help="Tab separated file to save per-read statistics (None).", required=False) 28 | parser.add_argument( 29 | '-t', metavar='bam_tag', type=str, default=None, help="Dataset tag (BAM basename).", required=False) 30 | parser.add_argument( 31 | '-q', metavar='aqual', type=int, default=0, help="Minimum alignment quality (0).") 32 | parser.add_argument( 33 | '-e', action="store_true", default=False, help="Include hard and soft clipps in alignment length when calculating accuracy (False).") 34 | parser.add_argument( 35 | '-r', metavar='report_pdf', type=str, help="Report PDF (bam_accuracy.pdf).", default="bam_accuracy.pdf") 36 | parser.add_argument( 37 | '-p', metavar='results_pickle', type=str, help="Save pickled results in this file (None).", default=None) 38 | parser.add_argument( 39 | '-Q', action="store_true", help="Be quiet and do not print progress bar (False).", default=False) 40 | parser.add_argument( 41 | 'bam', metavar='bam', type=str, help="Input BAM file.") 42 | 43 | 44 | def estimate_mode(acc): 45 | """ Estimate the mode of a set of float values between 0 and 1. 46 | 47 | :param acc: Data. 48 | :returns: The mode of the sample 49 | :rtype: float 50 | """ 51 | # Taken from sloika. 52 | if len(acc) > 1: 53 | da = gaussian_kde(acc) 54 | optimization_result = minimize_scalar(lambda x: -da(x), bounds=(0, 1), method='brent') 55 | if optimization_result.success: 56 | try: 57 | mode = optimization_result.x[0] 58 | except IndexError: 59 | mode = optimization_result.x 60 | except TypeError: 61 | mode = optimization_result.x 62 | else: 63 | sys.stderr.write("Mode computation failed") 64 | mode = 0 65 | else: 66 | mode = acc[0] 67 | return mode 68 | 69 | 70 | def base_stats_qc(st, report): 71 | """ Plot base statistics. 72 | 73 | :param st: Statistics dict. 74 | :param report: Plotter object. 75 | :returns: None 76 | """ 77 | 78 | bs = st.copy() 79 | del bs['accuracy'] 80 | del bs['identity'] 81 | plotter.plot_bars_simple( 82 | bs, title="Basewise statistics", xlab="Type", ylab="Count") 83 | plotter.plot_bars_simple(OrderedDict([('Identity ({})'.format(st['identity']), st['identity']), ('Accuracy ({})'.format( 84 | st['accuracy']), st['accuracy'])]), title="Precision statistics: length weighted means", xlab="Type", ylab="Count") 85 | 86 | 87 | def read_precision_qc(st, report): 88 | """ Plot read precision statistics. 89 | 90 | :param st: Statistics dict. 91 | :param report: Plotter object. 92 | :returns: Mode of accuracy and identity. 
93 | :rtype: dict 94 | """ 95 | accuracy_mode = estimate_mode(st['accuracy']) 96 | report.plot_histograms(OrderedDict([('Accuracy', st[ 97 | 'accuracy'])]), title="Distribution of per-read accuracies", xlab="Accuracy", ylab="Count", legend=True, 98 | vlines={'Mode:{0:.4f}'.format(accuracy_mode): accuracy_mode}) 99 | 100 | identity_mode = estimate_mode(st['identity']) 101 | report.plot_histograms(OrderedDict([('Identity', st[ 102 | 'identity'])]), title="Distribution of per-read identitities", xlab="Identity", ylab="Count", legend=True, 103 | vlines={'Mode:{0:.4f}'.format(identity_mode): identity_mode}) 104 | 105 | modes = {'accuracy_mode': accuracy_mode, 'identity_mode': identity_mode} 106 | return modes 107 | 108 | 109 | if __name__ == '__main__': 110 | args = parser.parse_args() 111 | tag = args.t if args.t is not None else os.path.basename(args.bam) 112 | 113 | plotter = report.Report(args.r) 114 | 115 | read_stats = stats.read_stats( 116 | args.bam, region=args.c, min_aqual=args.q, with_clipps=args.e, verbose=not args.Q) 117 | read_stats['tag'] = tag 118 | base_stats = read_stats['base_stats'] 119 | precision_stats = read_stats['read_stats'] 120 | 121 | base_stats_qc(base_stats, plotter) 122 | modes = read_precision_qc(precision_stats, plotter) 123 | 124 | plotter.close() 125 | 126 | global_stats = OrderedDict([ 127 | ('Accuracy', [read_stats['base_stats']['accuracy']]), 128 | ('AccuracyMode', modes['accuracy_mode']), 129 | ('Identity', [read_stats['base_stats']['identity']]), 130 | ('IdentityMode', modes['identity_mode']), 131 | ('Mapped', [read_stats['mapped']]), 132 | ('Unmapped', [read_stats['unmapped']]), 133 | ('Tag', [read_stats['tag']]), ]) 134 | global_stats = pd.DataFrame(global_stats) 135 | 136 | if args.g is not None: 137 | global_stats.to_csv(args.g, sep="\t", index=False) 138 | 139 | if args.l is not None: 140 | read_df = pd.DataFrame(precision_stats) 141 | read_df.to_csv(args.l, sep="\t", index=False) 142 | 143 | if args.p is not None: 144 | misc.pickle_dump(read_stats, args.p) 145 | -------------------------------------------------------------------------------- /scripts/bam_alignment_length.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | import sys 7 | import pandas as pd 8 | from collections import OrderedDict 9 | import tqdm 10 | 11 | from wub.bam import common as bam_common 12 | 13 | # Parse command line arguments: 14 | parser = argparse.ArgumentParser( 15 | description="""Produce a tab separated file of alignment lengths and other information. 16 | Rows are sorted by number of aligned reference bases unless the -x option is specified. 
17 | """) 18 | parser.add_argument( 19 | '-t', metavar='tsv_file', type=str, help="Tab separated file to save alignment lengths (bam_alignment_length.tsv).", required=False, default="bam_alignment_length.tsv") 20 | parser.add_argument( 21 | '-q', metavar='aqual', type=int, default=0, help="Minimum alignment quality (0).") 22 | parser.add_argument( 23 | '-x', action="store_true", help="Sort by number of read bases instead of number of aligned reference bases.", default=False) 24 | parser.add_argument( 25 | '-Q', action="store_true", help="Be quiet and do not print progress bar (False).", default=False) 26 | parser.add_argument( 27 | 'bam', metavar='bam', type=str, help="Input BAM file.") 28 | 29 | 30 | if __name__ == '__main__': 31 | args = parser.parse_args() 32 | verbose = not args.Q 33 | 34 | bam_reader = bam_common.pysam_open(args.bam, in_format='BAM') 35 | 36 | if verbose: 37 | sys.stdout.write( 38 | "Gathering read and alignment lengths from file: {}\n".format(args.bam)) 39 | try: 40 | total_reads = bam_reader.mapped + bam_reader.unmapped 41 | except: 42 | total_reads = None 43 | bam_reader = tqdm.tqdm(bam_reader, total=total_reads) 44 | 45 | read_names = [] 46 | ref_names = [] 47 | ref_lengths = [] 48 | read_lengths = [] 49 | aln_lengths = [] 50 | mapping_quals = [] 51 | 52 | # Gather alignment information: 53 | for record in bam_reader: 54 | if (not record.is_unmapped) and (record.mapping_quality > args.q): 55 | read_names.append(record.query_name) 56 | ref_names.append(record.reference_name) 57 | read_lengths.append(len(record.query_sequence)) 58 | aln_lengths.append(record.query_alignment_length) 59 | ref_lengths.append(record.reference_length) 60 | mapping_quals.append(record.mapping_quality) 61 | 62 | # Construct data frame: 63 | data = OrderedDict([('read_name', read_names), 64 | ('aligned_ref_bases', ref_lengths), 65 | ('aligned_read_bases', aln_lengths), 66 | ('read_length', read_lengths), 67 | ('reference', ref_names), 68 | ('mapping_quality', mapping_quals) 69 | ]) 70 | 71 | df = pd.DataFrame(data) 72 | del data, read_names, ref_names, mapping_quals 73 | del read_lengths, aln_lengths, ref_lengths 74 | 75 | # Sort data frame and save tsv: 76 | sort_by = 'aligned_ref_bases' 77 | if args.x: 78 | sort_by = 'aligned_read_bases' 79 | 80 | df.sort_values([sort_by], ascending=[0], inplace=True) 81 | df.to_csv(args.t, sep="\t", index=False) 82 | -------------------------------------------------------------------------------- /scripts/bam_alignments_compare.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import six 5 | import argparse 6 | import os 7 | import pandas as pd 8 | from collections import OrderedDict 9 | from wub.util import misc 10 | from wub.bam import compare as bam_compare 11 | from wub.vis import report 12 | 13 | # Parse command line arguments: 14 | parser = argparse.ArgumentParser( 15 | description="""Compare alignments stored in two BAM files. 16 | The two BAM files must have the same set of reads in the same order (name sorted). 
17 | """) 18 | parser.add_argument( 19 | '-w', metavar='coarse_tolerance', type=int, help="Tolerance when performing coarse comparison of alignments (50).", default=50) 20 | parser.add_argument( 21 | '-g', action="store_true", help="Do strict comparison of alignment flags.", default=False) 22 | parser.add_argument( 23 | '-r', metavar='report_pdf', type=str, help="Report PDF (bam_alignments_compare.pdf).", default="bam_alignments_compare.pdf") 24 | parser.add_argument( 25 | '-p', metavar='results_pickle', type=str, help="Save pickled results in this file (bam_alignments_compare.pk).", default="bam_alignments_compare.pk") 26 | parser.add_argument( 27 | '-t', metavar='tsv_file', type=str, help="Save results in tsv format in this file (None).", default=None) 28 | parser.add_argument( 29 | '-f', metavar='format', type=str, help="Input format (BAM).", default='BAM') 30 | parser.add_argument( 31 | '-Q', action="store_true", help="Be quiet and do not print progress bar (False).", default=False) 32 | parser.add_argument( 33 | 'bam_one', metavar='bam_one', type=str, help="First input BAM file.") 34 | parser.add_argument( 35 | 'bam_two', metavar='bam_two', type=str, help="Second input BAM file.") 36 | 37 | if __name__ == '__main__': 38 | args = parser.parse_args() 39 | 40 | stats = bam_compare.bam_compare( 41 | args.bam_one, args.bam_two, coarse_tolerance=args.w, strict_flags=args.g, in_format=args.f, verbose=not args.Q) 42 | 43 | plotter = report.Report(args.r) 44 | 45 | query_stats = OrderedDict((field, stats[field]) for field in ( 46 | 'TotalQueries', 'CoarseMatches', 'DirectionMismatch', 'SeqMismatch')) 47 | plotter.plot_bars_simple( 48 | query_stats, title="Per query statistics", xlab="Field", ylab="Count", auto_limit=False) 49 | 50 | query_stats = OrderedDict((field, stats[field]) for field in ( 51 | 'TotalQueries', 'StrictFlagMismatch', 'RefMismatch')) 52 | plotter.plot_bars_simple( 53 | query_stats, title="Per query statistics (continued)", xlab="Field", ylab="Count", auto_limit=False) 54 | 55 | aligned_bases = OrderedDict( 56 | (os.path.basename(bam), stats[bam]['AlignedBases']) for bam in stats['BamFiles']) 57 | plotter.plot_bars_simple( 58 | aligned_bases, title="Aligned bases", xlab="BAM", ylab="Bases", auto_limit=False) 59 | 60 | aligned_queries = OrderedDict( 61 | (os.path.basename(bam), stats[bam]['AlignedQueries']) for bam in stats['BamFiles']) 62 | plotter.plot_bars_simple( 63 | aligned_queries, title="Aligned queries", xlab="BAM", ylab="Bases", auto_limit=False) 64 | 65 | unaligned_queries = OrderedDict( 66 | (os.path.basename(bam), stats[bam]['UnalignedQueries']) for bam in stats['BamFiles']) 67 | plotter.plot_bars_simple( 68 | unaligned_queries, title="Unaligned queries", xlab="BAM", ylab="Bases", auto_limit=False) 69 | 70 | unaligned_queries = OrderedDict( 71 | (os.path.basename(bam), stats[bam]['HardClippedBases']) for bam in stats['BamFiles']) 72 | plotter.plot_bars_simple( 73 | unaligned_queries, title="Hard clipped bases", xlab="BAM", ylab="Bases", auto_limit=False) 74 | 75 | base_stats = OrderedDict((field, stats[field]) 76 | for field in ('CommonAlignedBases', 'CommonMatchingBases')) 77 | plotter.plot_bars_simple( 78 | base_stats, title="Common aligned base statistics", xlab="Field", ylab="Count", auto_limit=False) 79 | 80 | sim_stats = OrderedDict((field, stats[field]) for field in ['AlignedSimilarity']) 81 | plotter.plot_bars_simple( 82 | sim_stats, title="Proportion of bases with matching alignment ({})".format(sim_stats.values()[0]), xlab="Field", ylab="Count", 
auto_limit=False) 83 | 84 | plotter.plot_histograms({'PerQuerySim': stats[ 85 | 'PerQueryBaseSim']}, title="Distribution of percent bases with matched alignment", 86 | xlab="Percent bases with matched alignment", ylab="Count", legend=False) 87 | 88 | plotter.plot_histograms({'PerQuerySimClipped': stats[ 89 | 'PerQueryBaseSimClipped']}, title="Distribution of percent bases with matched alignment (with clipping)", 90 | xlab="Percent bases with matched alignment", ylab="Count", legend=False) 91 | 92 | plotter.close() 93 | 94 | if args.p is not None: 95 | misc.pickle_dump(dict(stats), args.p) 96 | 97 | if args.t is not None: 98 | data_map = stats.copy() 99 | del data_map['PerQueryBaseSim'] 100 | del data_map['PerQueryBaseSimClipped'] 101 | for bam in data_map['BamFiles']: 102 | del data_map[bam] 103 | del data_map['BamFiles'] 104 | data_map = OrderedDict((key, [value]) for key, value in six.iteritems(data_map)) 105 | data_frame = pd.DataFrame(data_map) 106 | data_frame.to_csv(args.t, sep="\t", index=False) 107 | -------------------------------------------------------------------------------- /scripts/bam_count_reads.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import six 5 | import argparse 6 | import sys 7 | import pandas as pd 8 | from collections import OrderedDict, defaultdict 9 | from wub.bam import read_counter 10 | from wub.util import misc 11 | from wub.util import seq as seq_util 12 | 13 | import tqdm 14 | 15 | # Parse command line arguments: 16 | parser = argparse.ArgumentParser( 17 | description="""Count reads mapping to each reference in a BAM file.""") 18 | parser.add_argument( 19 | '-a', metavar='min_aqual', type=int, help="Minimum mapping quality (0).", default=0) 20 | parser.add_argument( 21 | '-f', metavar='in_format', type=str, help="Input format (BAM).", default='BAM') 22 | parser.add_argument( 23 | '-z', metavar='ref_fasta', type=str, help="Reference fasta. GC content and length columns are added if present (None).", default=None) 24 | parser.add_argument( 25 | '-k', metavar="words", type=str, help="Include word frequencies of specifed length in output (None).", default=None) 26 | parser.add_argument( 27 | '-g', action="store_true", help="Include mean GC content of reads mapped to each reference (False).", default=False) 28 | parser.add_argument( 29 | '-p', metavar='results_pickle', type=str, help="Save pickled results in this file (None).", default=None) 30 | parser.add_argument( 31 | '-t', metavar='tsv_file', type=str, help="Save results in tsv format in this file (bam_count_reads.tsv).", default="bam_count_reads.tsv") 32 | parser.add_argument( 33 | '-Q', action="store_true", help="Be quiet and do not print progress bar (False).", default=False) 34 | parser.add_argument( 35 | '-R', action="store_true", help="Count reads from SAM stream in stdin. Only read count fields are written. Header required! (False).", default=False) 36 | parser.add_argument( 37 | '-F', metavar='yield_freq', type=int, help="Yield counts after every -Fth mapped record when doing online counting (100).", default=100) 38 | parser.add_argument('bam', nargs='?', help='Input file (default: stdin).', 39 | type=argparse.FileType('r'), default=sys.stdin) 40 | 41 | 42 | def _offline_counter(args): 43 | """ Offline counting from SAM/BAM file. 
""" 44 | # Offline counting from SAM/BAM file: 45 | counts, gc_means = read_counter.count_reads( 46 | args.bam.name, in_format=args.f, min_aln_qual=args.a, verbose=not args.Q, reads_gc=args.g) 47 | counts = OrderedDict(six.iteritems(counts)) 48 | 49 | if args.k is not None: 50 | calc_words = [int(k) for k in args.k.split(",")] 51 | 52 | data = OrderedDict() 53 | 54 | # Calculate sequence properties: 55 | if args.z is not None: 56 | lengths, gc_contents, word_freqs = {}, {}, defaultdict( 57 | lambda: defaultdict(dict)) 58 | ref_iter = seq_util.read_seq_records(args.z) 59 | if not args.Q: 60 | sys.stderr.write("Calculating sequence features:\n") 61 | ref_iter = tqdm.tqdm(ref_iter) 62 | 63 | for ref in ref_iter: 64 | # Augment counts dictionary with missing reference entries: 65 | if ref.id not in counts: 66 | counts[ref.id] = 0 67 | lengths[ref.id] = len(ref) 68 | gc_contents[ref.id] = seq_util.gc_content(str(ref.seq)) 69 | if args.k is not None: 70 | for word_size in calc_words: 71 | bf = seq_util.word_composition(ref.seq, word_size) 72 | for word, count in six.iteritems(bf): 73 | word_freqs[word_size][ref.id][ 74 | word] = float(count) / len(ref) 75 | 76 | data['Length'] = [lengths[tr] for tr in six.iterkeys(counts)] 77 | data['GC_content'] = [gc_contents[tr] for tr in six.iterkeys(counts)] 78 | 79 | data['Reference'] = list(counts.keys()) 80 | data['Count'] = list(counts.values()) 81 | 82 | # Calculate word frequencies: 83 | if args.k is not None and args.z: 84 | for ks in calc_words: 85 | for word in next(iter((word_freqs[ks].values()))).keys(): 86 | tmp = [] 87 | for ref in counts.keys(): 88 | tmp.append(word_freqs[ks][ref][word]) 89 | data[word] = tmp 90 | 91 | data_frame = pd.DataFrame(data) 92 | 93 | if args.g: 94 | gc_frame = pd.DataFrame({'Reference': list(gc_means.keys()), 'ReadGC': list(gc_means.values())}) 95 | data_frame = pd.merge(data_frame, gc_frame, how='inner', on='Reference') 96 | 97 | data_frame = data_frame.sort_values(['Count', 'Reference'], ascending=False) 98 | data_frame = data_frame[data_frame.Count > 0] 99 | 100 | if args.t is not None: 101 | data_frame.to_csv(args.t, sep='\t', index=False) 102 | 103 | if args.p is not None: 104 | misc.pickle_dump(data, args.p) 105 | 106 | 107 | def _online_counter(args): 108 | """ Online counting from SAM stream. """ 109 | # Open counts stream: 110 | counts_iter = read_counter.count_reads_realtime( 111 | alignment_file='-', in_format=args.f, min_aln_qual=args.a, verbose=not args.Q, yield_freq=args.F) 112 | 113 | for counts in counts_iter: 114 | data_frame = pd.DataFrame( 115 | OrderedDict([('Reference', list(counts.keys())), ('Count', list(counts.values()))])) 116 | data_frame = data_frame.sort_values(['Count', 'Reference'], ascending=False) 117 | 118 | if args.t is not None: 119 | data_frame.to_csv(args.t, sep='\t', index=False) 120 | if args.p is not None: 121 | misc.pickle_dump(counts, args.p) 122 | 123 | 124 | if __name__ == '__main__': 125 | args = parser.parse_args() 126 | 127 | if not args.R: 128 | # Offline counting from SAM/BAM file: 129 | if args.bam == sys.stdin: 130 | raise Exception("Input file not specified!") 131 | _offline_counter(args) 132 | else: 133 | # Online counting from SAM on stdin. 
134 | _online_counter(args) 135 | -------------------------------------------------------------------------------- /scripts/bam_cov.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import tqdm 6 | import sys 7 | from Bio import SeqIO 8 | from wub.util import misc 9 | from wub.bam import common as bam_common 10 | 11 | # Parse command line arguments: 12 | parser = argparse.ArgumentParser( 13 | description="""Produce refrence coverage table.""") 14 | parser.add_argument( 15 | '-f', metavar='reference', type=str, help="Reference fasta.", required=True) 16 | parser.add_argument( 17 | '-c', metavar='region', type=str, help="BAM region (None).", required=False, default=None) 18 | parser.add_argument( 19 | '-t', metavar='tsv', type=str, default="bam_cov.tsv", help="Output TSV (bam_cov.tsv).", required=False) 20 | parser.add_argument( 21 | '-q', metavar='aqual', type=int, default=0, help="Minimum alignment quality (0).") 22 | parser.add_argument( 23 | '-Q', action="store_true", help="Be quiet and do not show progress bars.", default=False) 24 | parser.add_argument('bam', metavar='bam', type=str, help="Input BAM file.") 25 | 26 | 27 | def _process_bam(bam, out_tsv, chrom_lengths, region=None, min_aqual=0, verbose=True): 28 | bam_reader = bam_common.pysam_open(bam, in_format='BAM') 29 | ue = True 30 | if region is not None: 31 | ue = False 32 | bam_iter = bam_reader.fetch(region=region, until_eof=ue) 33 | 34 | try: 35 | total_reads = bam_reader.mapped + bam_reader.unmapped 36 | except: 37 | total_reads = None 38 | if verbose and region is None: 39 | sys.stdout.write( 40 | "Gathering fragment statistics from file: {}\n".format(bam)) 41 | bam_iter = tqdm.tqdm(bam_iter, total=total_reads) 42 | 43 | tsv = open(out_tsv, "w") 44 | tsv.write( 45 | "Read\tRef\tStrand\tRefCov\tReadCov\tReadLength\tReadAlnLength\tRefLength\tRefAlnLength\tMapQual\n") 46 | 47 | for r in bam_iter: 48 | # Skip unmapped reads: 49 | if r.is_unmapped: 50 | continue 51 | # Skip if mapping quality is too low: 52 | if r.mapq < min_aqual: 53 | continue 54 | strand = '-' if r.is_reverse else '+' 55 | ref = r.reference_name 56 | ref_cov = r.reference_length / float(chrom_lengths[ref]) 57 | read = r.query_name 58 | read_length = r.infer_read_length() 59 | mapq = r.mapping_quality 60 | read_aln_len = r.query_alignment_length 61 | read_cov = read_aln_len / float(read_length) 62 | ref_aln_length = r.reference_length 63 | 64 | tsv.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(read, ref, strand, ref_cov, 65 | read_cov, read_length, read_aln_len, chrom_lengths[ref], ref_aln_length, mapq)) 66 | 67 | tsv.flush() 68 | tsv.close() 69 | 70 | 71 | if __name__ == '__main__': 72 | args = parser.parse_args() 73 | verbose = not args.Q 74 | 75 | # Load reference lengths: 76 | references = SeqIO.index(args.f, format='fasta') 77 | chrom_lengths = {name: len(so) for name, so in references.items()} 78 | 79 | # Parse fragments: 80 | _process_bam(args.bam, args.t, chrom_lengths, args.c, args.q, verbose=verbose) 81 | -------------------------------------------------------------------------------- /scripts/bam_fill_unaligned.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | from wub.bam import common as bam_common 7 | from wub.bam import sam_writer 8 | from wub.util import seq as seq_util 9 | 10 | # Parse command line 
arguments: 11 | parser = argparse.ArgumentParser( 12 | description="""Generate SAM records for the reads present in the input fastq but missing from 13 | the input SAM/BAM. 14 | """) 15 | parser.add_argument( 16 | '-f', metavar='format', type=str, help="Input/output format (SAM).", default='SAM') 17 | parser.add_argument( 18 | '-q', metavar='fastq', type=str, help="Input fastq.", required=True) 19 | parser.add_argument( 20 | 'infile', metavar='input_file', type=str, help="Input file.") 21 | parser.add_argument( 22 | 'outfile', metavar='output_file', type=str, help="Output SAM file.") 23 | 24 | if __name__ == '__main__': 25 | args = parser.parse_args() 26 | 27 | input_iter = bam_common.pysam_open(args.infile, args.f).fetch(until_eof=True) 28 | 29 | # Get SAM record names: 30 | sam_names = [record.query_name for record in input_iter] 31 | 32 | writer = sam_writer.SamWriter(args.outfile) 33 | 34 | for read in seq_util.read_seq_records(args.q, 'fastq'): 35 | if read.id not in sam_names: 36 | qual = seq_util.quality_array_to_string(read.letter_annotations["phred_quality"]) 37 | sam_record = writer.new_sam_record(qname=read.id, flag=4, rname="*", pos=0, mapq=0, cigar="*", rnext="*", 38 | pnext=0, tlen=0, seq=str(read.seq), qual=qual, tags="AS:i:0") 39 | writer.write(sam_record) 40 | 41 | writer.close() 42 | -------------------------------------------------------------------------------- /scripts/bam_gc_vs_qual.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import print_function 5 | import argparse 6 | import tqdm 7 | 8 | import os 9 | import sys 10 | import pandas as pd 11 | 12 | import pysam 13 | from wub.vis import report 14 | from wub.util import seq as seq_util 15 | 16 | import warnings 17 | warnings.simplefilter("ignore") 18 | import seaborn as sns 19 | 20 | # Parse command line arguments: 21 | parser = argparse.ArgumentParser( 22 | description=""" 23 | Produce a plot of GC content of aligned read and reference portion versus their mean quality values. 24 | """) 25 | parser.add_argument('-f', metavar='reference', type=str, help="Reference fasta.", required=True) 26 | parser.add_argument( 27 | '-q', metavar='aqual', type=int, default=0, help="Minimum alignment quality (0).") 28 | parser.add_argument( 29 | '-r', metavar='report_pdf', type=str, help="Report PDF (bam_gc_vs_qual.pdf).", default="bam_gc_vs_qual.pdf") 30 | parser.add_argument( 31 | '-t', metavar='tsv', type=str, help="Tab separated file to save results (bam_gc_vs_qual.tsv).", default="bam_gc_vs_qual.tsv") 32 | parser.add_argument( 33 | '-Q', action="store_true", help="Be quiet and do not show progress bars.", default=False) 34 | parser.add_argument( 35 | 'bam', metavar='bam', type=str, help="Input BAM file.") 36 | 37 | 38 | def _process_reads(alignment_file, refs, in_format='BAM', min_aln_qual=0, verbose=False): 39 | """ 40 | Gather information about the GC content and mean quality value of aligned portions of the reads. 41 | """ 42 | if in_format == 'BAM': 43 | mode = "rb" 44 | elif in_format == 'SAM': 45 | mode = "r" 46 | else: 47 | raise Exception("Invalid format: {}".format(in_format)) 48 | 49 | aln_iter = pysam.AlignmentFile(alignment_file, mode) 50 | 51 | if verbose and in_format == "BAM": 52 | try: 53 | total_reads = aln_iter.mapped + aln_iter.unmapped 54 | except: 55 | total_reads = None 56 | sys.stdout.write( 57 | "Gathering GC content vs. 
quality information from file: {}\n".format(alignment_file)) 58 | if in_format == "BAM": 59 | aln_iter = tqdm.tqdm(aln_iter, total=total_reads) 60 | 61 | rgcs, gcs, quals = [], [], [] 62 | ref_lengths = [] 63 | for segment in aln_iter: 64 | if segment.is_unmapped: 65 | continue 66 | if segment.mapping_quality >= min_aln_qual: 67 | # Calculate GC content of aligned read portion: 68 | aln_seq = segment.query_alignment_sequence 69 | gcs.append(seq_util.gc_content(aln_seq)) 70 | 71 | # Calculate GC content of aligned reference: 72 | ref_seq = refs[segment.reference_name].seq[segment.reference_start:segment.reference_end] 73 | rgcs.append(seq_util.gc_content(ref_seq)) 74 | ref_lengths.append(segment.reference_length) 75 | 76 | # Calculate mean quality score of aligned read portion: 77 | aln_quals = segment.query_alignment_qualities 78 | quals.append(seq_util.mean_qscore(aln_quals, qround=False)) 79 | 80 | aln_iter.close() 81 | 82 | df = pd.DataFrame({'GC_content': gcs, 'MeanQuality': quals, 'GC_content_ref': rgcs, 'RefAlnLength': ref_lengths}) 83 | 84 | return df 85 | 86 | 87 | if __name__ == '__main__': 88 | args = parser.parse_args() 89 | verbose = not args.Q 90 | tag = os.path.basename(args.bam) 91 | 92 | references = seq_util.read_seq_records_dict(args.f) 93 | data = _process_reads(args.bam, references, min_aln_qual=args.q, verbose=verbose) 94 | 95 | data.to_csv(args.t, sep="\t", index=False) 96 | 97 | # Plot GC content of aligned read portion vs. mean quality. 98 | plotter = report.Report(args.r) 99 | sns.jointplot("GC_content", "MeanQuality", kind="reg", data=data) 100 | plotter.plt.tight_layout() 101 | plotter.pages.savefig() 102 | plotter.plt.clf() 103 | 104 | # Plot GC content of aligned reference portion vs. mean quality. 105 | sns.jointplot("GC_content_ref", "MeanQuality", kind="reg", data=data) 106 | plotter.plt.tight_layout() 107 | plotter.pages.savefig() 108 | plotter.plt.clf() 109 | 110 | plotter.close() 111 | -------------------------------------------------------------------------------- /scripts/bam_ref_base_coverage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import six 5 | import argparse 6 | import os 7 | import pandas as pd 8 | from Bio import SeqIO 9 | from wub.bam import stats as bam_stats 10 | 11 | # Parse command line arguments: 12 | parser = argparse.ArgumentParser( 13 | description="""Calculate percent covered reference lengths.""") 14 | parser.add_argument( 15 | '-f', metavar='reference', type=str, help="Reference fasta.", required=True) 16 | parser.add_argument( 17 | '-c', metavar='region', type=str, help="BAM region (None).", required=False, default=None) 18 | parser.add_argument( 19 | '-t', metavar='tsv', type=str, default="bam_ref_base_coverage.tsv", help="Output tab separated file (bam_ref_base_coverage.tsv).", required=False) 20 | parser.add_argument( 21 | '-m', metavar='min_cov', type=int, default=1, help="Minimum base coverage for a position to be counted (1).") 22 | parser.add_argument( 23 | '-Q', action="store_true", help="Be quiet and do not show progress bars.", default=False) 24 | parser.add_argument( 25 | 'bam', metavar='bam', type=str, help="Input BAM file.") 26 | 27 | 28 | if __name__ == '__main__': 29 | args = parser.parse_args() 30 | verbose = not args.Q 31 | tag = args.t 32 | if tag is None: 33 | tag = os.path.basename(args.bam) 34 | 35 | # Load reference lengths: 36 | references = SeqIO.index(args.f, format='fasta') 37 | chrom_lengths = {name: 
len(so) for name, so in six.iteritems(references)} 38 | # Parse fragments: 39 | st = bam_stats.pileup_stats(args.bam, region=args.c, verbose=verbose, with_quals=False)['coverage'] 40 | 41 | res = {} 42 | for chrom, chrom_length in six.iteritems(chrom_lengths): 43 | # No coverage: 44 | if chrom not in st: 45 | res[chrom] = 0.0 46 | else: 47 | nr_hits = 0 48 | # Iterate over covered positions and count valid hits: 49 | for pos, cov in six.iteritems(st[chrom]): 50 | if cov >= args.m: 51 | nr_hits += 1 52 | # Calculate percent covered reference length: 53 | res[chrom] = float(nr_hits * 100) / chrom_length 54 | 55 | # Convert results to sorted data frame: 56 | df = pd.DataFrame({'Chrom': list(res.keys()), 'Percent_cov': list(res.values())}) 57 | df.sort_values(['Percent_cov'], ascending=[0], inplace=True) 58 | df.to_csv(args.t, sep="\t", index=False) 59 | -------------------------------------------------------------------------------- /scripts/bam_ref_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | import pandas as pd 8 | import pysam 9 | import tqdm 10 | from collections import OrderedDict 11 | 12 | # Parse command line arguments: 13 | parser = argparse.ArgumentParser( 14 | description="""Produce a tab separated file with read identifiers and the corresponding references, sorted by reference.""") 15 | parser.add_argument( 16 | '-t', metavar='read_tsv', type=str, default="bam_ref_tab.tsv", help="Tab separated file to save reference table.", required=False) 17 | parser.add_argument( 18 | '-Q', action="store_true", help="Be quiet and do not print progress bar (False).", default=False) 19 | parser.add_argument( 20 | '-s', action="store_true", help="Save read strand in output (False).", default=False) 21 | parser.add_argument( 22 | 'bam', metavar='bam', type=str, help="Input BAM file.") 23 | 24 | 25 | def process_reads(alignment_file, in_format='BAM', save_strand=False, verbose=False): 26 | """Process reads and extract the corresponding reference. 27 | 28 | :param alignment_file: BAM file. 29 | :param verbose: Verbosity flag. 30 | :returns: pandas dataframe with reads and references. 
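    :param in_format: Input format, 'BAM' or 'SAM'.
    :param save_strand: If True, also record the mapping strand of each read.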
31 | :rtype: dict 32 | """ 33 | reads, refs, strands = [], [], [] 34 | if in_format == 'BAM': 35 | mode = "rb" 36 | elif in_format == 'SAM': 37 | mode = "r" 38 | else: 39 | raise Exception("Invalid format: {}".format(in_format)) 40 | 41 | aln_iter = pysam.AlignmentFile(alignment_file, mode) 42 | 43 | if verbose and in_format == "BAM": 44 | try: 45 | total_reads = aln_iter.mapped + aln_iter.unmapped 46 | except: 47 | total_reads = None 48 | sys.stdout.write( 49 | "Gathering read statistics from file: {}\n".format(alignment_file)) 50 | if in_format == "BAM": 51 | aln_iter = tqdm.tqdm(aln_iter, total=total_reads) 52 | 53 | for segment in aln_iter: 54 | if segment.is_unmapped: 55 | continue 56 | refs.append(segment.reference_name) 57 | reads.append(segment.query_name) 58 | if save_strand: 59 | strand = "-" if segment.is_reverse else "+" 60 | strands.append(strand) 61 | 62 | aln_iter.close() 63 | 64 | data = OrderedDict([('Read', reads), ('Reference', refs)]) 65 | if save_strand: 66 | data['Strand'] = strands 67 | df = pd.DataFrame(data) 68 | 69 | return df 70 | 71 | 72 | if __name__ == '__main__': 73 | args = parser.parse_args() 74 | verbose = not args.Q 75 | 76 | df = process_reads(args.bam, save_strand=args.s, verbose=verbose) 77 | df.sort_values(['Reference'], ascending=[0], inplace=True) 78 | df.to_csv(args.t, sep="\t", index=False) 79 | -------------------------------------------------------------------------------- /scripts/bam_score_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | import pysam 7 | from wub.bam import filter as bam_filter 8 | from wub.bam import common as bam_common 9 | 10 | # Parse command line arguments: 11 | parser = argparse.ArgumentParser( 12 | description="""Filter SAM/BAM records by score or other criteria. 13 | WARNING: the input records must be sorted by name or the filtering will not work 14 | as expected. 
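    Name-sorted input can be produced with, for example, 'samtools sort -n'.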
15 | """) 16 | parser.add_argument( 17 | '-f', metavar='format', type=str, help="Input/output format (SAM).", default='SAM') 18 | parser.add_argument( 19 | '-s', metavar='strategy', type=str, help="Filtering strategy: top_per_query, query_coverage, ref_coverage (top_per_query).", 20 | default="top_per_query", choices=['top_per_query', 'query_coverage', 'ref_coverage']) 21 | parser.add_argument( 22 | '-q', metavar='query_cover', type=float, help="Minimum query coverage fraction (0.8).", default=0.8) 23 | parser.add_argument( 24 | 'infile', metavar='input_file', type=str, help="Input file.") 25 | parser.add_argument( 26 | 'outfile', metavar='output_file', type=str, help="Output SAM file.") 27 | 28 | if __name__ == '__main__': 29 | args = parser.parse_args() 30 | 31 | input_iter = bam_common.pysam_open(args.infile, args.f) 32 | 33 | if args.s == 'top_per_query': 34 | output_iter = bam_filter.filter_top_per_query(input_iter.fetch(until_eof=True)) 35 | elif args.s == 'query_coverage': 36 | output_iter = bam_filter.filter_query_coverage(input_iter.fetch(until_eof=True), args.q) 37 | elif args.s == 'ref_coverage': 38 | output_iter = bam_filter.filter_ref_coverage(input_iter.fetch(until_eof=True), args.q, input_iter.header) 39 | else: 40 | raise Exception('Filtering strategy not implemented!') 41 | 42 | writer = pysam.AlignmentFile(args.outfile, "wh", template=input_iter, header=input_iter.header) 43 | for record in output_iter: 44 | writer.write(record) 45 | 46 | writer.close() 47 | -------------------------------------------------------------------------------- /scripts/bam_soft_clips_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | import pandas as pd 8 | import pysam 9 | import tqdm 10 | from collections import OrderedDict 11 | 12 | # Parse command line arguments: 13 | parser = argparse.ArgumentParser( 14 | description="""Produce a tab separated file with read identifiers and number of soft clipped bases at each end (relative to the original sequence in the fastq).""") 15 | parser.add_argument( 16 | '-t', metavar='tsv', type=str, default="bam_soft_clips_tab.tsv", help="Output tab separated file.", required=False) 17 | parser.add_argument( 18 | '-Q', action="store_true", help="Be quiet and do not print progress bar (False).", default=False) 19 | parser.add_argument( 20 | 'bam', metavar='bam', type=str, help="Input BAM file.") 21 | 22 | 23 | def _get_clips(cigar, is_reverse): 24 | """ Get clips at the start and end relative to the original sequence. """ 25 | clip_start, clip_end = 0, 0 26 | 27 | # Consider the first CIGAR tuple: 28 | if cigar[0][0] == 4: 29 | clip_start = cigar[0][1] 30 | 31 | # Consider the last CIGAR tuple: 32 | if cigar[-1][0] == 4: 33 | clip_end = cigar[-1][1] 34 | 35 | # Reverse orientation if necessary: 36 | if is_reverse: 37 | clip_start, clip_end = clip_end, clip_start 38 | return clip_start, clip_end 39 | 40 | 41 | def process_reads(alignment_file, in_format='BAM', verbose=False): 42 | """Process reads and extract the corresponding information. 43 | 44 | :param alignment_file: BAM file. 45 | :param verbose: Verbosity flag. 46 | :returns: pandas dataframe with reads and soft clip lengths. 
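    :param in_format: Input format, 'BAM' or 'SAM'.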
47 | :rtype: pandas.DataFrame 48 | """ 49 | reads, strand, clip_start, clip_end = [], [], [], [] 50 | if in_format == 'BAM': 51 | mode = "rb" 52 | elif in_format == 'SAM': 53 | mode = "r" 54 | else: 55 | raise Exception("Invalid format: {}".format(in_format)) 56 | 57 | aln_iter = pysam.AlignmentFile(alignment_file, mode) 58 | 59 | if verbose and in_format == "BAM": 60 | try: 61 | total_reads = aln_iter.mapped + aln_iter.unmapped 62 | except: 63 | total_reads = None 64 | sys.stdout.write( 65 | "Gathering read statistics from file: {}\n".format(alignment_file)) 66 | if in_format == "BAM": 67 | aln_iter = tqdm.tqdm(aln_iter, total=total_reads) 68 | 69 | for segment in aln_iter: 70 | if segment.is_unmapped: 71 | continue 72 | reads.append(segment.query_name) 73 | strand.append('-' if segment.is_reverse else '+') 74 | cs, ce = _get_clips(segment.cigartuples, segment.is_reverse) 75 | clip_start.append(cs) 76 | clip_end.append(ce) 77 | 78 | aln_iter.close() 79 | 80 | data = OrderedDict([('Read', reads), ('Strand', strand), ('ClipStart', clip_start), ('ClipEnd', clip_end)]) 81 | df = pd.DataFrame(data) 82 | 83 | return df 84 | 85 | 86 | if __name__ == '__main__': 87 | args = parser.parse_args() 88 | verbose = not args.Q 89 | 90 | df = process_reads(args.bam, verbose=verbose) 91 | df.to_csv(args.t, sep="\t", index=False) 92 | -------------------------------------------------------------------------------- /scripts/bias_explorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import numpy as np 6 | import pandas as pd 7 | from wub.vis import report 8 | import warnings 9 | with warnings.catch_warnings(): 10 | warnings.simplefilter("ignore") 11 | import seaborn as sns 12 | warnings.resetwarnings() 13 | _ = sns 14 | 15 | # Parse command line arguments: 16 | parser = argparse.ArgumentParser( 17 | description=""" 18 | Simple tool for exploring biases in transcript counts. Takes as input count files generated by bam_count_reads.py (with the -z flag) 19 | and performs linear regression of log counts against transcript length and GC content. 
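    The input is expected to contain the Count, Length and GC_content columns that
    bam_count_reads.py adds when a reference fasta is supplied with -z.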
20 | """) 21 | parser.add_argument( 22 | '-r', metavar='report_pdf', type=str, help="Report PDF (bias_explorer.pdf).", default="bias_explorer.pdf") 23 | parser.add_argument('-x', action="store_true", 24 | help="Exclude transcripts with zero counts.", default=False) 25 | parser.add_argument( 26 | 'count_file', metavar='count_file', type=str, help="Input counts file with length ang GC content features.") 27 | 28 | 29 | if __name__ == '__main__': 30 | args = parser.parse_args() 31 | 32 | data = pd.read_csv(args.count_file, sep="\t") 33 | data["logCount"] = np.log(np.array(data["Count"]) + 1.0) 34 | 35 | if args.x: 36 | data = data[data.Count > 0] 37 | 38 | plotter = report.Report(args.r) 39 | 40 | sns.jointplot("GC_content", "logCount", kind="reg", data=data) 41 | plotter.plt.tight_layout() 42 | plotter.pages.savefig() 43 | plotter.plt.clf() 44 | 45 | sns.jointplot("Length", "logCount", kind="reg", data=data) 46 | plotter.plt.tight_layout() 47 | plotter.pages.savefig() 48 | plotter.plt.clf() 49 | 50 | plotter.close() 51 | -------------------------------------------------------------------------------- /scripts/calculate_coverage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | from wub.util import misc 9 | 10 | 11 | # Parse command line arguments: 12 | parser = argparse.ArgumentParser( 13 | description='Calculate total number of bases and genome coverage if genome size is given.') 14 | parser.add_argument( 15 | '-f', metavar='format', type=str, help="Input format (fastq).", default='fastq') 16 | parser.add_argument( 17 | '-s', metavar='genome_size', type=int, help="Genome size (None).", default=None) 18 | parser.add_argument( 19 | '-p', metavar='results_pickle', type=str, help="Save pickled results in this file.", default=None) 20 | parser.add_argument('input_fastx', nargs='?', help='Input (default: stdin).', 21 | type=argparse.FileType('r'), default=sys.stdin) 22 | 23 | 24 | if __name__ == '__main__': 25 | args = parser.parse_args() 26 | 27 | in_format = args.f 28 | input_iterator = seq_util.read_seq_records( 29 | args.input_fastx, format=in_format) 30 | 31 | total_bases = 0 32 | for record in input_iterator: 33 | total_bases += len(record) 34 | results = {'total_bases': total_bases} 35 | print("Total bases\t{}".format(total_bases)) 36 | 37 | if args.s is not None: 38 | results['genome_size'] = args.s 39 | results['coverage'] = float(total_bases) / args.s 40 | print("Genome size\t{}".format(results['genome_size'])) 41 | print("Coverage\t{}".format(results['coverage'])) 42 | 43 | if args.p is not None: 44 | misc.pickle_dump(results, args.p) 45 | -------------------------------------------------------------------------------- /scripts/compare_genomes_dnadiff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import print_function 5 | import six 6 | import sys 7 | import argparse 8 | from wub.util import cmd as cmd_util 9 | from wub.wrappers import dnadiff 10 | from wub.util import misc 11 | 12 | # Parse command line arguments: 13 | parser = argparse.ArgumentParser( 14 | description="""Compare a set of reference sequences (genome) to another set (target assembly) using mummer's dnadiff. 15 | It prints the alignment results to stdout. All parsed results can be saved in a pickle file. 
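    The MUMmer tools (dnadiff, nucmer, delta-filter, show-diff, show-snps and
    show-coords) must be available on the PATH.
    Example invocation (hypothetical file names):
        compare_genomes_dnadiff.py -r report.txt reference.fas assembly.fas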
16 | """) 17 | parser.add_argument( 18 | '-p', metavar='results_pickle', type=str, help="Save pickled results in this file (None).", default=None) 19 | parser.add_argument( 20 | '-r', metavar='raw_file', type=str, help="Save dnadiff report in this file (None).", default=None) 21 | parser.add_argument( 22 | '-d', metavar='work_dir', type=str, help="Use this working directory instead of a temporary directory (None).", default=None) 23 | parser.add_argument( 24 | '-k', action="store_true", help="Keep dnadiff result files (False).", default=False) 25 | parser.add_argument( 26 | '-v', action="store_true", help="Print out dnadiff output (False).", default=False) 27 | parser.add_argument( 28 | 'ref', metavar='reference_fasta', type=str, help="Reference fasta.") 29 | parser.add_argument( 30 | 'target', metavar='target_fasta', type=str, help="Target fasta.") 31 | 32 | if __name__ == '__main__': 33 | args = parser.parse_args() 34 | 35 | cmd_util.ensure_executable('dnadiff') 36 | cmd_util.ensure_executable('delta-filter') 37 | cmd_util.ensure_executable('show-diff') 38 | cmd_util.ensure_executable('show-snps') 39 | cmd_util.ensure_executable('show-coords') 40 | cmd_util.ensure_executable('nucmer') 41 | 42 | results, raw_report, log = dnadiff.dnadiff(args.ref, args.target, args.d, not args.k) 43 | 44 | if args.v: 45 | sys.stdout.write(log) 46 | 47 | if args.r is not None: 48 | with open(args.r, 'w') as out_handle: 49 | out_handle.write(raw_report) 50 | 51 | if args.p is not None: 52 | misc.pickle_dump(results, args.p) 53 | 54 | for section, properties in six.iteritems(results['Alignments']): 55 | print(section, ":\t\tref\tquery") 56 | for name, prop in properties.iteritems(): 57 | print("\t{}\t{}\t{}".format(name, prop.ref, prop.query)) 58 | -------------------------------------------------------------------------------- /scripts/compare_genomes_lastal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | 7 | from wub.mappers import lastal 8 | from wub.util import parse 9 | from wub.util import cmd as cmd_util 10 | from wub.vis import report 11 | from wub.util import misc 12 | 13 | import warnings 14 | with warnings.catch_warnings(): 15 | warnings.simplefilter("ignore") 16 | import seaborn as sns 17 | warnings.resetwarnings() 18 | _ = sns 19 | 20 | # Parse command line arguments: 21 | parser = argparse.ArgumentParser( 22 | description="""Compare a set of reference sequences (genome) to another set (target assembly) using lastal alignment. 23 | Accuracy is the total number of matched bases divided by total alignment length. Coverage is total reference covered 24 | by alignment divided by total length of reference. 25 | 26 | Caveats: 27 | - The lastal alignments are filtered by default (use -f to disable) so only the best scoring alignment is kept per query. Hence some shorter valid 28 | alignments might be discarded causing an underestimation of coverage. 29 | - The estimated accuracy is dependent on the scoring of gaps and mismatches. By default gap open and gap extend penalties are set to equal. 30 | """) 31 | parser.add_argument( 32 | '-p', metavar='results_pickle', type=str, help="Save pickled results in this file (None).", default=None) 33 | parser.add_argument( 34 | '-l', metavar='lastal_args', type=str, help="Parameters passed to lastal in the :value,... 
format (a:1,b:1).", default="a:1,b:1") 35 | parser.add_argument( 36 | '-t', metavar='details_tsv', type=str, help="Save details of lastal alignment in this tab-separated file (None).", default=None) 37 | parser.add_argument( 38 | '-f', help="Do *not* filter for best alignment per query.", default=False, action="store_true") 39 | parser.add_argument( 40 | '-r', metavar='report_pdf', type=str, help="Report with alignment details plot (None).", default=None) 41 | parser.add_argument( 42 | 'ref', metavar='reference_fasta', type=str, help="Reference fasta.") 43 | parser.add_argument( 44 | 'target', metavar='target_fasta', type=str, help="Target fasta.") 45 | 46 | if __name__ == '__main__': 47 | args = parser.parse_args() 48 | 49 | cmd_util.ensure_executable('lastal') 50 | cmd_util.ensure_executable('lastdb') 51 | 52 | filter_alignments = not args.f 53 | lastal_args = parse.args_string_to_dict(args.l) 54 | stats = lastal.compare_genomes_lastal( 55 | args.ref, args.target, lastal_options=lastal_args, filter_alns=filter_alignments, cleanup=True) 56 | 57 | global_accuracy = (stats['aln_length'].sum() - stats['substitutions'].sum() - 58 | stats['deletions'].sum() - stats['insertions'].sum()) / float(stats['aln_length'].sum()) 59 | global_coverage = stats[ 60 | 'ref_aln_len'].sum() / float(stats['ref_len'].sum()) 61 | 62 | sys.stdout.write("Accuracy\tCoverage\n") 63 | sys.stdout.write("{}\t{}\n".format(global_accuracy, global_coverage)) 64 | 65 | if args.t is not None: 66 | stats.to_csv(args.t, sep='\t', index=False) 67 | 68 | if args.r is not None: 69 | plotter = report.Report(args.r) 70 | data = {'': (stats['coverage'], stats['accuracy'])} 71 | plotter.plot_arrays( 72 | data, title="Alignment properties", xlab='Coverage', ylab='Accuracy', legend=False) 73 | plotter.close() 74 | 75 | if args.p is not None: 76 | res_data = {'Accuracy': global_accuracy, 'Coverage': global_coverage} 77 | misc.pickle_dump(res_data, args.p) 78 | -------------------------------------------------------------------------------- /scripts/convert_alphabet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Convert between DNA and RNA alphabets.') 12 | parser.add_argument( 13 | '-i', metavar='in_format', type=str, help="Input format (fastq).", default='fastq') 14 | parser.add_argument( 15 | '-o', metavar='out_format', type=str, help="Output format (fastq).", default='fastq') 16 | parser.add_argument( 17 | '-D', action='store_true', help="RNA->DNA alphabet conversion.", default=False) 18 | parser.add_argument( 19 | '-R', action='store_true', help="DNA->RNA alphabet conversion.", default=False) 20 | parser.add_argument('input_fastx', nargs='?', help='Input file (default: stdin).', 21 | type=argparse.FileType('r'), default=sys.stdin) 22 | parser.add_argument('output_fastx', nargs='?', help='Output file (default: stdout).', 23 | type=argparse.FileType('w'), default=sys.stdout) 24 | 25 | 26 | def record_filter(input_iter, in_format, to_alphabet): 27 | """ Filter SeqRecord objects by length and mean quality. 28 | 29 | :param input_iter: Iterator of SeqRecord objects. 30 | :param in_format: Input format. 31 | :param to_alphabet: Convert to this alphabet. 32 | :returns: SeqRecord object. 
33 | :rtype: generator 34 | """ 35 | for record in input_iter: 36 | if to_alphabet == 'DNA': 37 | yield seq_util.rna_record_to_dna(record) 38 | elif to_alphabet == 'RNA': 39 | yield seq_util.dna_record_to_rna(record) 40 | else: 41 | raise Exception('Invalid alphabet type') 42 | 43 | 44 | if __name__ == '__main__': 45 | args = parser.parse_args() 46 | 47 | input_iterator = seq_util.read_seq_records( 48 | args.input_fastx, format=args.i) 49 | 50 | to_alphabet = None 51 | if args.D and args.R: 52 | sys.stderr.write("-D and -R are mutually exclusive!\n") 53 | sys.exit(1) 54 | elif not args.D and not args.R: 55 | sys.stderr.write("Either -D or -R must be specified!\n") 56 | sys.exit(1) 57 | elif args.D: 58 | to_alphabet = 'DNA' 59 | elif args.R: 60 | to_alphabet = 'RNA' 61 | 62 | output_iterator = record_filter(input_iterator, args.i, to_alphabet) 63 | 64 | seq_util.write_seq_records(output_iterator, args.output_fastx, format=args.o) 65 | -------------------------------------------------------------------------------- /scripts/correlate_counts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import six 5 | import argparse 6 | import sys 7 | import numpy as np 8 | from scipy import stats 9 | from collections import OrderedDict 10 | import pandas as pd 11 | from os import path 12 | from wub.vis import report 13 | from functools import reduce 14 | 15 | import warnings 16 | with warnings.catch_warnings(): 17 | warnings.simplefilter("ignore") 18 | import seaborn as sns 19 | warnings.resetwarnings() 20 | _ = sns 21 | 22 | # Parse command line arguments: 23 | parser = argparse.ArgumentParser( 24 | description="""Correlate counts produced by multiple runs of bam_count_reads.py.""") 25 | parser.add_argument( 26 | '-r', metavar='report_pdf', type=str, help="Report PDF (bam_multi_qc.pdf).", default="correlate_counts.pdf") 27 | parser.add_argument( 28 | '-c', metavar='corr_type', type=str, help="Correlation statistic - spearman or pearson (spearman).", default="spearman") 29 | parser.add_argument( 30 | '-L', action="store_true", help="Log transform data.", default=False) 31 | parser.add_argument( 32 | '-o', action="store_true", help="Omit lower diagonal.", default=False) 33 | parser.add_argument( 34 | 'counts', metavar='input_counts', nargs='*', type=str, help="Input counts as tab separated files.") 35 | 36 | 37 | def load_counts(counts, log_transform): 38 | """Load statistics from tsv files. 39 | 40 | :param counts: List of count files. 41 | :returns: OrderedDict of count data frames per dataset. 42 | :rtype: OrderedDict 43 | """ 44 | stats = OrderedDict() 45 | for count_file in counts: 46 | name = path.basename(count_file).rsplit('.', 1)[0] 47 | if log_transform: 48 | name = 'log(' + name + '+1)' 49 | 50 | tmp = pd.read_csv(count_file, sep="\t")[["Reference", "Count"]] 51 | tmp = tmp[tmp.Count > 0] 52 | tmp = tmp.rename(columns={"Count": name}) 53 | stats[name] = tmp 54 | if log_transform: 55 | stats[name][name] = np.log(stats[name][name] + 1) 56 | return stats 57 | 58 | 59 | def _get_reference_set(dfs): 60 | """Get list of all references.""" 61 | references = set() 62 | for df in six.itervalues(dfs): 63 | references = references.union(set(df['Reference'])) 64 | return sorted(list(references)) 65 | 66 | 67 | def join_counts(counts): 68 | """Join count data frames. 69 | :param counts: Dictionary of data frames. 70 | :returns: Merged data frame. 
71 | :rtype: DataFrame 72 | """ 73 | df_merged = reduce(lambda left, right: pd.merge(left, right, how="outer", on=["Reference"]), counts.values()) 74 | df_merged = df_merged.fillna(0.0) 75 | return df_merged 76 | 77 | 78 | def _corrfunc(x, y, **kws): 79 | """ Annotate grid with correaltion coefficient. 80 | Solution from http://stackoverflow.com/a/30942817 81 | """ 82 | if args.c == 'spearman': 83 | r, _ = stats.spearmanr(x, y) 84 | corr_type = 'Rho' 85 | elif args.c == 'pearson': 86 | r, _ = stats.pearsonr(x, y) 87 | corr_type = 'r' 88 | else: 89 | raise Exception('Invalid correlation statistic.') 90 | correlations.append(r) 91 | ax = plotter.plt.gca() 92 | ax.annotate("{} = {:.2f}".format(corr_type, r), 93 | xy=(.1, .9), xycoords=ax.transAxes) 94 | 95 | 96 | if __name__ == '__main__': 97 | args = parser.parse_args() 98 | plotter = report.Report(args.r) 99 | 100 | if len(args.counts) == 0: 101 | sys.stderr.write("No count files given!\n") 102 | sys.exit(1) 103 | 104 | counts = load_counts(args.counts, args.L) 105 | joint_df = join_counts(counts) 106 | correlations = [] 107 | 108 | # Solution from http://stackoverflow.com/a/30942817 109 | g = sns.PairGrid(joint_df, palette=["red"]) 110 | g.map_upper(plotter.plt.scatter, s=10) 111 | g.map_diag(sns.distplot, kde=False) 112 | if not args.o: 113 | g.map_lower(sns.kdeplot, cmap="Blues_d") 114 | g.map_lower(_corrfunc) 115 | g.map_upper(_corrfunc) 116 | plotter.plt.tight_layout() 117 | plotter.pages.savefig() 118 | 119 | plotter.plt.clf() 120 | correlations = pd.DataFrame( 121 | {"Distribution of correlation coefficients": correlations}) 122 | sns.boxplot(data=correlations) 123 | plotter.plt.tight_layout() 124 | plotter.pages.savefig() 125 | 126 | plotter.close() 127 | -------------------------------------------------------------------------------- /scripts/fasta_to_mock_fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq 8 | from Bio import SeqIO 9 | 10 | # Parse command line arguments: 11 | parser = argparse.ArgumentParser( 12 | description='Convert fasta file to fastq with mock qualities.') 13 | parser.add_argument( 14 | '-q', metavar='mock_quals', type=int, help="Mock quality value (40).", default=40) 15 | parser.add_argument('input_fasta', nargs='?', help='Input fasta (default: stdin).', 16 | type=argparse.FileType('r'), default=sys.stdin) 17 | parser.add_argument('output_fastq', nargs='?', help='Output fastq (default: stdout)', 18 | type=argparse.FileType('w'), default=sys.stdout) 19 | 20 | 21 | if __name__ == '__main__': 22 | args = parser.parse_args() 23 | 24 | mock_qual = args.q 25 | 26 | input_iterator = SeqIO.parse(args.input_fasta, 'fasta') 27 | output_iterator = (seq.mock_qualities(record, mock_qual) for record in input_iterator) 28 | SeqIO.write(output_iterator, args.output_fastq, 'fastq') 29 | -------------------------------------------------------------------------------- /scripts/fastq_qual_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | import pandas as pd 9 | from collections import OrderedDict 10 | 11 | # Parse command line arguments: 12 | parser = argparse.ArgumentParser( 13 | description='Generate a table of read names and mean quality values.') 14 | parser.add_argument( 15 | '-t', 
metavar='tsv', type=str, help="Output tab separated file.", default='fastq_qual_tab.tsv') 16 | parser.add_argument('input_fastq', nargs='?', help='Input fastq (default: stdin).', 17 | type=argparse.FileType('r'), default=sys.stdin) 18 | 19 | 20 | if __name__ == '__main__': 21 | args = parser.parse_args() 22 | 23 | input_iterator = seq_util.read_seq_records( 24 | args.input_fastq, format='fastq') 25 | 26 | read = [] 27 | mean_qualities = [] 28 | 29 | for record in input_iterator: 30 | read.append(record.id) 31 | mean_quality = seq_util.mean_qscore(record.letter_annotations["phred_quality"], qround=False) 32 | mean_qualities.append(mean_quality) 33 | 34 | df = pd.DataFrame(OrderedDict([('Read', read), ('MeanQual', mean_qualities)])) 35 | df = df.set_index("Read") 36 | df.to_csv(args.t, sep="\t") 37 | -------------------------------------------------------------------------------- /scripts/fastq_time_slice.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | import pandas as pd 8 | from wub.util import seq as seq_util 9 | import datetime 10 | 11 | # Parse command line arguments: 12 | parser = argparse.ArgumentParser( 13 | description="""Filter a fastq file by starting time.""") 14 | parser.add_argument( 15 | '-t', metavar='time_tsv', type=str, help="Tab separeted file produced by fastq_time_tab.py.", required=True) 16 | parser.add_argument( 17 | '-s', metavar='start_perc', type=float, help="Start of slice as percent of total time.", required=False, default=0.0) 18 | parser.add_argument( 19 | '-e', metavar='end_perc', type=float, help="End of slice as percent of total time.", required=False, default=100.0) 20 | parser.add_argument('input_fastq', nargs='?', help='Input fastq (default: stdin).', 21 | type=argparse.FileType('r'), default=sys.stdin) 22 | parser.add_argument('output_fastq', nargs='?', help='Output fastq (default: stdout)', 23 | type=argparse.FileType('w'), default=sys.stdout) 24 | 25 | 26 | def _time_slice(input_iter, start_perc, end_perc, time_df): 27 | """ Filter for fastq records falling in the specified time range. 
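    The time window is derived from the StartTime index of the table written by
    fastq_time_tab.py: start_perc and end_perc are interpreted as percentages of
    the interval between the earliest and latest start times, and only records
    whose start time (parsed from the read description) falls inside the window
    are yielded.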
""" 28 | first = time_df.index.min() 29 | last = time_df.index.max() 30 | 31 | s = first + ((last - first) * start_perc) / 100.0 32 | e = first + ((last - first) * end_perc) / 100.0 33 | 34 | for rec in input_iter: 35 | desc = rec.description.split() 36 | tmp_start = desc[4].split("=")[1] 37 | start_time = datetime.datetime.strptime(tmp_start, "%Y-%m-%dT%H:%M:%SZ") 38 | if start_time >= s and start_time <= e: 39 | yield rec 40 | 41 | 42 | if __name__ == '__main__': 43 | args = parser.parse_args() 44 | time_df = pd.read_csv(args.t, sep="\t", parse_dates=True, index_col="StartTime") 45 | 46 | input_iterator = seq_util.read_seq_records(args.input_fastq, format='fastq') 47 | 48 | output_iterator = _time_slice(input_iterator, args.s, args.e, time_df) 49 | 50 | seq_util.write_seq_records(output_iterator, args.output_fastq, format='fastq') 51 | -------------------------------------------------------------------------------- /scripts/fastq_time_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | import pandas as pd 7 | from collections import OrderedDict 8 | from wub.util import seq as seq_util 9 | import datetime 10 | 11 | # Parse command line arguments: 12 | parser = argparse.ArgumentParser( 13 | description="""Produce a tab separated file with read start times, read and channel numbers sorted by start time.""") 14 | parser.add_argument( 15 | '-t', metavar='read_tsv', type=str, default="fastq_time_tab.tsv", help="Tab separated file to save read time table.", required=False) 16 | parser.add_argument( 17 | 'fastq', metavar='fastq', type=str, help="Input fastq file.") 18 | 19 | 20 | if __name__ == '__main__': 21 | args = parser.parse_args() 22 | 23 | input_iterator = seq_util.read_seq_records(args.fastq, format='fastq') 24 | 25 | read, read_nr, channel, start, length = [], [], [], [], [] 26 | 27 | for rec in input_iterator: 28 | read.append(rec.id) 29 | length.append(len(rec.seq)) 30 | desc = rec.description.split() 31 | 32 | # Parse out read number: 33 | tmp_read_nr = int(desc[2].split("=")[1]) 34 | read_nr.append(tmp_read_nr) 35 | 36 | # Parse out channel: 37 | tmp_channel = int(desc[3].split("=")[1]) 38 | channel.append(tmp_channel) 39 | 40 | # Parse out start time: 41 | tmp_start = desc[4].split("=")[1] 42 | tmp_start = datetime.datetime.strptime(tmp_start, "%Y-%m-%dT%H:%M:%SZ") 43 | start.append(tmp_start) 44 | 45 | df = pd.DataFrame(OrderedDict([('Read', read), ('Channel', channel), 46 | ('ReadNumber', read_nr), ('StartTime', start), ("ReadLength", length)])) 47 | 48 | df.sort_values(by="StartTime", inplace=True) 49 | df = df.set_index("StartTime") 50 | 51 | df.to_csv(args.t, sep="\t") 52 | -------------------------------------------------------------------------------- /scripts/fastx_ends_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Generate a tab separated file with the first and last -n bases of the sequences.') 12 | parser.add_argument( 13 | '-i', metavar='in_format', type=str, help="Input format (fastq).", default='fastq') 14 | parser.add_argument( 15 | '-n', metavar='nr_bases', type=int, help=".", default=100) 16 | parser.add_argument('input_fastx', nargs='?', help='Input file (default: 
stdin).', 17 | type=argparse.FileType('r'), default=sys.stdin) 18 | parser.add_argument('output_tsv', nargs='?', help='Output file (default: stdout).', 19 | type=argparse.FileType('w'), default=sys.stdout) 20 | 21 | 22 | if __name__ == '__main__': 23 | args = parser.parse_args() 24 | 25 | input_iterator = seq_util.read_seq_records( 26 | args.input_fastx, format=args.i) 27 | 28 | args.output_tsv.write("Read\tStartSeq\tEndSeq\n") 29 | for rec in input_iterator: 30 | args.output_tsv.write("{}\t{}\t{}\n".format(rec.id, rec.seq[0:args.n], rec.seq[-args.n:])) 31 | 32 | args.output_tsv.flush() 33 | args.output_tsv.close() 34 | -------------------------------------------------------------------------------- /scripts/fastx_grep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Filter sequence files by read name.') 12 | parser.add_argument( 13 | '-i', metavar='in_format', type=str, help="Input format (fastq).", default='fastq') 14 | parser.add_argument( 15 | '-o', metavar='out_format', type=str, help="Output format (fastq).", default='fastq') 16 | parser.add_argument( 17 | '-n', metavar='read_names', type=str, help="Comma separated list of read names to select.", default="") 18 | parser.add_argument('input_fastx', nargs='?', help='Input file (default: stdin).', 19 | type=argparse.FileType('r'), default=sys.stdin) 20 | parser.add_argument('output_fastx', nargs='?', help='Output file (default: stdout).', 21 | type=argparse.FileType('w'), default=sys.stdout) 22 | 23 | 24 | def record_filter(input_iter, in_format, read_names): 25 | """ Filter SeqRecord objects by length and mean quality. 26 | 27 | :param input_iter: Iterator of SeqRecord objects. 28 | :param in_format: Input format. 29 | :param to_alphabet: Convert to this alphabet. 30 | :returns: SeqRecord object. 
31 | :rtype: generator 32 | """ 33 | for record in input_iter: 34 | if record.id in read_names: 35 | yield record 36 | 37 | 38 | if __name__ == '__main__': 39 | args = parser.parse_args() 40 | 41 | input_iterator = seq_util.read_seq_records( 42 | args.input_fastx, format=args.i) 43 | 44 | names = args.n.split(',') 45 | 46 | output_iterator = record_filter(input_iterator, args.i, names) 47 | 48 | seq_util.write_seq_records(output_iterator, args.output_fastx, format=args.o) 49 | -------------------------------------------------------------------------------- /scripts/fastx_length_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Generate a tab separated file with the sequence lengths in the input file.') 12 | parser.add_argument( 13 | '-i', metavar='in_format', type=str, help="Input format (fasta).", default='fasta') 14 | parser.add_argument('input_fastx', nargs='?', help='Input file (default: stdin).', 15 | type=argparse.FileType('r'), default=sys.stdin) 16 | parser.add_argument('output_tsv', nargs='?', help='Output file (default: stdout).', 17 | type=argparse.FileType('w'), default=sys.stdout) 18 | 19 | if __name__ == '__main__': 20 | args = parser.parse_args() 21 | 22 | input_iterator = seq_util.read_seq_records( 23 | args.input_fastx, format=args.i) 24 | 25 | args.output_tsv.write("Reference\tLength\n") 26 | for rec in input_iterator: 27 | args.output_tsv.write("{}\t{}\n".format(rec.id, len(rec.seq))) 28 | 29 | args.output_tsv.flush() 30 | args.output_tsv.close() 31 | -------------------------------------------------------------------------------- /scripts/length_normalise_counts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import pandas as pd 6 | import numpy as np 7 | from collections import OrderedDict 8 | from wub.util import seq as seq_util 9 | 10 | # Parse command line arguments: 11 | parser = argparse.ArgumentParser( 12 | description="""Calculate RPKM values from raw counts and a transcriptome reference.""") 13 | parser.add_argument( 14 | '-f', metavar='in_trs', type=str, help="Input transcriptome.", required=True) 15 | parser.add_argument('input_counts', nargs=1, help='Input count file.', 16 | type=str, default=None) 17 | parser.add_argument('output_count', nargs=1, help='Output RPKM file.', 18 | type=str, default=None) 19 | 20 | 21 | def _load_transcript_lengths(fasta): 22 | """ Load transcript lengths. 
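    :param fasta: Transcriptome fasta file.
    :returns: Dictionary mapping transcript identifiers to their lengths.
    :rtype: dict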
""" 23 | res = {} 24 | for record in seq_util.read_seq_records(fasta): 25 | res[record.id] = len(record.seq) 26 | return res 27 | 28 | 29 | if __name__ == '__main__': 30 | args = parser.parse_args() 31 | 32 | # Load transcript lengths: 33 | trs_lens = _load_transcript_lengths(args.f) 34 | 35 | # Load input counts: 36 | in_df = pd.read_csv(args.input_counts[0], sep="\t") 37 | 38 | # Calculate scaling factor: 39 | million_factor = np.sum(in_df["Count"]) / float(10**6) 40 | 41 | # Normalise counts: 42 | refs, rpkms = [], [] 43 | for row in in_df.itertuples(): 44 | refs.append(row.Reference) 45 | rpkms.append(row.Count / (million_factor * 46 | (trs_lens[row.Reference] / 1000.0))) 47 | 48 | out_data = OrderedDict([('Reference', refs), ('Count', rpkms)]) 49 | out_df = pd.DataFrame(out_data) 50 | out_df.to_csv(args.output_count[0], sep="\t", index=False) 51 | -------------------------------------------------------------------------------- /scripts/merge_tsvs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import pandas as pd 6 | from functools import reduce 7 | 8 | # Parse command line arguments: 9 | parser = argparse.ArgumentParser( 10 | description="""Merge tab separated files on a given field using pandas.""") 11 | parser.add_argument( 12 | '-j', metavar='join', type=str, help="Join type (outer).", default="outer") 13 | parser.add_argument( 14 | '-f', metavar='field', type=str, help="Join on this field (Read).", default="Read") 15 | parser.add_argument( 16 | '-o', metavar='out_tsv', type=str, help="Output tsv (merge_tsvs.tsv).", default="merge_tsvs.tsv") 17 | parser.add_argument( 18 | '-z', action="store_true", help="Fill NA values with zero.", default=False) 19 | parser.add_argument( 20 | 'tsvs', metavar='input_tsvs', nargs='*', type=str, help="Input tab separated files.") 21 | 22 | 23 | if __name__ == '__main__': 24 | args = parser.parse_args() 25 | 26 | dfs = [pd.read_csv(x, sep="\t") for x in args.tsvs] 27 | 28 | df_merged = reduce(lambda left, right: pd.merge(left, right, on=args.f, how=args.j), dfs) 29 | if args.z: 30 | df_merged = df_merged.fillna(0) 31 | 32 | df_merged.to_csv(args.o, sep="\t", index=False) 33 | -------------------------------------------------------------------------------- /scripts/multi_length_hist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | import numpy as np 7 | from os import path 8 | from wub.vis import report 9 | from wub.util import seq as seq_util 10 | 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | # Parse command line arguments: 15 | parser = argparse.ArgumentParser( 16 | description="""Plot histograms of length distributions from multiple sequence files.""") 17 | parser.add_argument( 18 | '-r', metavar='report_pdf', type=str, help="Report PDF.", default="multi_length_hist.pdf") 19 | parser.add_argument( 20 | '-f', metavar='in_format', type=str, help="Input format (fastq).", default="fastq") 21 | parser.add_argument( 22 | '-b', metavar='nr_bins', type=int, help="Number of bins (50).", default=50) 23 | parser.add_argument( 24 | '-l', metavar='min_len', type=int, help="Minimum read length (None).", default=None) 25 | parser.add_argument( 26 | '-u', metavar='max_len', type=int, help="Maximum read length (None).", default=None) 27 | parser.add_argument( 28 | '-L', action="store_true", 
help="Log transform lengths.", default=False) 29 | parser.add_argument( 30 | 'in_files', metavar='input_counts', nargs='*', type=str, help="Input sequence files.") 31 | 32 | 33 | def _get_lengths(in_file, in_format, min_length, max_length, do_log): 34 | """ Iterate over input and accumulate sequence lengths. """ 35 | input_iterator = seq_util.read_seq_records(in_file, format=in_format) 36 | lengths = [] 37 | for record in input_iterator: 38 | length = len(record) 39 | # Filter for minimum read length: 40 | if (min_length is not None) and (length < min_length): 41 | continue 42 | # Filter for maximum read length: 43 | if (max_length is not None) and (length > max_length): 44 | continue 45 | if do_log: 46 | length = np.log(length) 47 | lengths.append(length) 48 | input_iterator.close() 49 | return lengths 50 | 51 | 52 | if __name__ == '__main__': 53 | args = parser.parse_args() 54 | plotter = report.Report(args.r) 55 | 56 | if len(args.in_files) == 0: 57 | sys.stderr.write("No input files given!\n") 58 | sys.exit(1) 59 | 60 | data_map = {} 61 | for in_file in args.in_files: 62 | name = path.basename(in_file).rsplit('.', 1)[0] 63 | data_map[name] = _get_lengths(in_file, args.f, args.l, args.u, args.L) 64 | 65 | if args.L: 66 | xlab = 'log(read length)' 67 | else: 68 | xlab = 'read length' 69 | 70 | plotter.plot_histograms(data_map, title='Read length distributions', xlab=xlab, ylab='Count', bins=args.b, alpha=0.7, legend_loc='best', legend=True, vlines=None) 71 | 72 | plotter.close() 73 | -------------------------------------------------------------------------------- /scripts/pickle_cat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import pprint 6 | 7 | from wub.util import misc 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description="""Pretty print the contents of a pickle file.""") 12 | parser.add_argument( 13 | 'pickle', metavar='pickle_file', type=str, help="Input pickle file.") 14 | 15 | if __name__ == '__main__': 16 | args = parser.parse_args() 17 | 18 | pprint.pprint(misc.pickle_load(args.pickle)) 19 | -------------------------------------------------------------------------------- /scripts/plot_counts_correlation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import pandas as pd 6 | import os 7 | from collections import OrderedDict 8 | from wub.vis import report 9 | from matplotlib import pyplot as plt 10 | import seaborn as sns 11 | from scipy.stats import spearmanr 12 | 13 | # Parse command line arguments: 14 | parser = argparse.ArgumentParser( 15 | description='Scatter plot of two set of counts.') 16 | parser.add_argument( 17 | '-r', metavar='report_pdf', type=str, help="Report PDF.", required=False, default="plot_counts_correlation.pdf") 18 | parser.add_argument( 19 | '-T', metavar='tags', type=str, help="Data tags: tag1,tag2.", required=False, default=None) 20 | parser.add_argument( 21 | '-t', metavar='merged_data', type=str, help="Merged data TSV.", required=False, default=None) 22 | parser.add_argument( 23 | '-o', metavar='Correlation_tsv', type=str, help="Correlation TSV.", required=False, default=None) 24 | parser.add_argument( 25 | 'counts_one', metavar='counts_one', type=str, help="Input tab separated file.") 26 | parser.add_argument( 27 | 'counts_two', metavar='counts_two', type=str, help="Input 
tab separated file.") 28 | 29 | 30 | def _create_tagged_column(df, tag): 31 | df[tag] = df["Count"] 32 | df = df.drop("Count", axis=1) 33 | return df 34 | 35 | 36 | if __name__ == '__main__': 37 | args = parser.parse_args() 38 | 39 | data_one = pd.read_csv(args.counts_one, sep="\t") 40 | data_two = pd.read_csv(args.counts_two, sep="\t") 41 | 42 | # Set data tags: 43 | tags = args.T 44 | if tags is not None: 45 | tags = args.T.split(",") 46 | else: 47 | t1 = os.path.basename(args.counts_one).rsplit(".", 1)[0] 48 | t2 = os.path.basename(args.counts_two).rsplit(".", 1)[0] 49 | tags = [t1, t2] 50 | 51 | # Set column names: 52 | data_one = _create_tagged_column(data_one, tags[0]) 53 | data_two = _create_tagged_column(data_two, tags[1]) 54 | 55 | data_merged = pd.merge(data_one, data_two, on=["Reference"], how="outer") 56 | data_merged = data_merged.fillna(0.0) 57 | 58 | plotter = report.Report(args.r) 59 | 60 | g = sns.jointplot(tags[0], tags[1], data=data_merged, stat_func=spearmanr, kind="reg") 61 | plt.tight_layout() 62 | plotter.pages.savefig() 63 | 64 | plotter.close() 65 | 66 | if args.t is not None: 67 | data_merged.to_csv(args.t, sep="\t", index=False) 68 | 69 | if args.o is not None: 70 | rho, pval = spearmanr(data_merged[tags[0]], data_merged[tags[1]]) 71 | res = pd.DataFrame(OrderedDict([("rho", [rho]), ("pval", [pval])])) 72 | res.to_csv(args.o, sep="\t", index=False) 73 | -------------------------------------------------------------------------------- /scripts/plot_qualities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | import numpy as np 7 | 8 | from wub.util import seq as seq_util 9 | from wub.vis import report 10 | 11 | import warnings 12 | with warnings.catch_warnings(): 13 | warnings.simplefilter("ignore") 14 | import seaborn as sns 15 | warnings.resetwarnings() 16 | _ = sns 17 | 18 | # Parse command line arguments: 19 | parser = argparse.ArgumentParser( 20 | description='Plot the mean quality values across non-overlapping windows in the input sequences.') 21 | parser.add_argument( 22 | '-w', metavar='win_size', type=int, help="Window size (50).", default=50) 23 | parser.add_argument( 24 | '-r', metavar='report_pdf', type=str, help="Report pdf (plot_qualities.pdf).", default='plot_qualities.pdf') 25 | parser.add_argument('input_fastx', nargs='?', help='Input (default: stdin).', 26 | type=argparse.FileType('r'), default=sys.stdin) 27 | 28 | 29 | def _smooth_qualitites(quals, winsize): 30 | """ Smooth out qualities by taking average of non-overlapping windows. 
""" 31 | smooth_quals = [] 32 | for i in range(0, len(quals) - winsize, winsize): 33 | smooth_quals.append(np.mean(quals[i:i + winsize])) 34 | smooth_quals = np.array(smooth_quals, dtype=float) 35 | return smooth_quals 36 | 37 | 38 | if __name__ == '__main__': 39 | args = parser.parse_args() 40 | 41 | input_iterator = seq_util.read_seq_records( 42 | args.input_fastx, format="fastq") 43 | 44 | plotter = report.Report(args.r) 45 | 46 | for record in input_iterator: 47 | quals = np.array(record.letter_annotations["phred_quality"]) 48 | smooth_quals = _smooth_qualitites(quals, args.w) 49 | pos = np.arange(len(smooth_quals)) 50 | data_map = {'Mean qualities': (pos, smooth_quals)} 51 | plotter.plot_arrays(data_map, marker='-', title=record.id, xlab="Window", ylab="Mean quality") 52 | 53 | plotter.close() 54 | -------------------------------------------------------------------------------- /scripts/plot_sequence_properties.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | import numpy as np 7 | 8 | from wub.util import seq as seq_util 9 | from wub.vis import report 10 | 11 | import warnings 12 | with warnings.catch_warnings(): 13 | warnings.simplefilter("ignore") 14 | import seaborn as sns 15 | warnings.resetwarnings() 16 | _ = sns 17 | 18 | # Parse command line arguments: 19 | parser = argparse.ArgumentParser( 20 | description='Plot histograms of lengths and quality values.') 21 | parser.add_argument( 22 | '-f', metavar='format', type=str, help="Input format (fastq).", default='fastq') 23 | parser.add_argument( 24 | '-b', metavar='bins', type=int, help="Number of bins on histograms (50).", default=50) 25 | parser.add_argument( 26 | '-r', metavar='report_pdf', type=str, help="Report pdf (plot_sequence_properties.pdf).", default='plot_sequence_properties.pdf') 27 | parser.add_argument( 28 | '-j', help="Produce joint plot of lengths and mean quality values (False).", default=False, action="store_true") 29 | parser.add_argument('input_fastx', nargs='?', help='Input (default: stdin).', 30 | type=argparse.FileType('r'), default=sys.stdin) 31 | 32 | 33 | if __name__ == '__main__': 34 | args = parser.parse_args() 35 | 36 | in_format = args.f 37 | input_iterator = seq_util.read_seq_records( 38 | args.input_fastx, format=in_format) 39 | 40 | # Could be more efficient with dictionaries if we did not have to 41 | # deal with the joint plot. 42 | lengths = [] 43 | mean_qualities = [] 44 | 45 | for record in input_iterator: 46 | lengths.append(len(record)) 47 | if in_format == 'fastq': 48 | mean_quality = seq_util.mean_qscore(record.letter_annotations["phred_quality"]) 49 | mean_qualities.append(mean_quality) 50 | 51 | plotter = report.Report(args.r) 52 | 53 | plotter.plot_histograms( 54 | {'lengths': lengths}, title="Distribution of sequence lengths (mean={0:.3f})".format(np.mean(lengths)), xlab="Length", ylab="Count", legend=False) 55 | 56 | if in_format == 'fastq': 57 | plotter.plot_histograms( 58 | {'qualities': mean_qualities}, title="Distribution of mean base qualities (mean={0:.3f})".format(np.mean(mean_qualities)), xlab="Mean base quality", ylab="Count", legend=False) 59 | if args.j: 60 | plotter.plot_arrays({'scatter': (lengths, mean_qualities)}, title="Sequence length vs. 
mean base quality", 61 | xlab="Sequence length", ylab="Mean base quality", legend=False) 62 | 63 | plotter.close() 64 | -------------------------------------------------------------------------------- /scripts/reads_across_time.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import pandas as pd 6 | 7 | from wub.vis import report 8 | import matplotlib.pyplot as plt 9 | import warnings 10 | with warnings.catch_warnings(): 11 | warnings.simplefilter("ignore") 12 | import seaborn as sns 13 | warnings.resetwarnings() 14 | _ = sns 15 | 16 | # Parse command line arguments: 17 | parser = argparse.ArgumentParser( 18 | description=""" 19 | Plot read and alignment properties across time. 20 | """) 21 | 22 | parser.add_argument( 23 | '-i', metavar='time_tab', type=str, help="Tab separated file generated by fastq_time_tab.py", required=True) 24 | parser.add_argument( 25 | '-a', metavar='aln_tab', type=str, help="Tab separated file generated by bam_alignment_length.py", required=True) 26 | parser.add_argument( 27 | '-w', metavar='res_freq', type=float, help="Resampling frequency in minutes.", required=False, default=5) 28 | parser.add_argument( 29 | '-r', metavar='report_pdf', type=str, help="Report PDF (reads_across_time.pdf).", default="reads_across_time.pdf") 30 | parser.add_argument( 31 | '-t', metavar='out_tsv', type=str, help="Output tsv (reads_across_time.tsv).", default="reads_across_time.tsv") 32 | 33 | 34 | if __name__ == '__main__': 35 | args = parser.parse_args() 36 | 37 | freq = str(args.w) + "T" 38 | 39 | time_tab = pd.read_csv(args.i, sep="\t", parse_dates=True) 40 | 41 | aln_tab = pd.read_csv(args.a, sep="\t") 42 | aln_tab = aln_tab.rename(columns={"read_name": "Read", "aligned_ref_bases": "AlignedRefBases", 43 | "aligned_read_bases": "AlignedReadBases", "reference": "Reference", "mapping_quality": "MappingQuality"}) 44 | aln_tab.drop("read_length", axis=1, inplace=True) 45 | 46 | tt = time_tab.copy() 47 | tt = tt.set_index("StartTime").sort_index() 48 | tt.index = pd.DatetimeIndex(tt.index) 49 | 50 | df = pd.merge(aln_tab, time_tab, how='inner', on=['Read']) 51 | df = df.set_index("StartTime").sort_index() 52 | df.index = pd.DatetimeIndex(df.index) 53 | 54 | df["AlnRatio"] = df.AlignedReadBases / df.ReadLength 55 | 56 | df.to_csv(args.t, sep="\t") 57 | 58 | plotter = report.Report(args.r) 59 | 60 | tt.ReadLength.resample(freq).mean().plot() 61 | plt.ylabel("ReadLength") 62 | plotter.plt.tight_layout() 63 | plotter.pages.savefig() 64 | plotter.plt.clf() 65 | 66 | df.ReadLength.resample(freq).mean().plot() 67 | plt.ylabel("AlignedReadLength") 68 | plotter.plt.tight_layout() 69 | plotter.pages.savefig() 70 | plotter.plt.clf() 71 | 72 | df.ReadLength.resample(freq).count().plot() 73 | plt.ylabel("ReadCount") 74 | plotter.plt.tight_layout() 75 | plotter.pages.savefig() 76 | plotter.plt.clf() 77 | 78 | df.AlignedReadBases.resample(freq).mean().plot() 79 | plt.ylabel("AlignedReadBases") 80 | plotter.plt.tight_layout() 81 | plotter.pages.savefig() 82 | plotter.plt.clf() 83 | 84 | df.AlignedRefBases.resample(freq).mean().plot() 85 | plt.ylabel("AlignedRefBases") 86 | plotter.plt.tight_layout() 87 | plotter.pages.savefig() 88 | plotter.plt.clf() 89 | 90 | df.AlnRatio.resample(freq).mean().plot() 91 | plt.ylabel("AlignedReadBases / ReadLenght") 92 | plotter.plt.tight_layout() 93 | plotter.pages.savefig() 94 | plotter.plt.clf() 95 | 96 | 
df.MappingQuality.resample(freq).mean().plot() 97 | plt.ylabel("MappingQuality") 98 | plotter.plt.tight_layout() 99 | plotter.pages.savefig() 100 | plotter.plt.clf() 101 | 102 | plotter.close() 103 | -------------------------------------------------------------------------------- /scripts/reads_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | from wub.read_stats import contig_stats as cstats 5 | from wub.util import misc 6 | 7 | 8 | def main(): 9 | 10 | savepath = args.savepath 11 | fastx = args.fastx 12 | tag = args.tag 13 | 14 | if savepath is None: 15 | savepath = os.getcwd() 16 | else: 17 | savepath = misc.mkdir(savepath) 18 | 19 | if tag is None: 20 | tag = misc.get_fname(fastx) 21 | 22 | if misc._getextension(fastx) == 'fastq': 23 | fq = True 24 | else: 25 | fq = False 26 | 27 | rawdata = cstats.GC_per_read(cstats.readfast(fastx), fq=fq) 28 | 29 | # print os.path.join(savepath, '{}_summary.stats'.format(tag)) 30 | 31 | if args.raw: 32 | rawdata.to_csv(os.path.join(savepath, '{}_raw.stats'.format(tag))) 33 | 34 | summary = cstats.get_stats(df=rawdata) 35 | summary.to_csv(os.path.join(savepath, '{}_summary.stats'.format(tag))) 36 | # print summary.round(2).to_string() 37 | 38 | if args.report: 39 | from wub.vis import report 40 | Plotter = report.Report(os.path.join(savepath, '{}.pdf'.format(tag))) 41 | 42 | rawdata = rawdata.sort_values('Seqlen', ascending=True) 43 | 44 | rawdata['cumsum'] = rawdata["Seqlen"].cumsum() 45 | rawdata['norm'] = 100.0 * rawdata['cumsum'] / rawdata['cumsum'].max() 46 | 47 | Plotter.plot_line(data=rawdata, x='Seqlen', y='norm', 48 | title='Normalized cumulative plot', xlab='length (bp)', ylab="normalized (%)",) 49 | 50 | # df1.sort_values('Seqlen', ascending=False) 51 | # df1["cumsum1"] = df1['Seqlen'].cumsum() 52 | # Plotter.plot_line(data=rawdata, x='Cumsum1', y=df1.reset_index().index, title='Ordered cumulative sum plot', xlab="contigs ordered largest to smallest", ylab='cumulative sum') 53 | 54 | Plotter.plot_scatter(data=rawdata, x='GC content (%)', y='Seqlen', title='GC content vs length plot', 55 | xlab="GC content (%)", ylab="length (bp)", alpha=0.5, ylim=0, xlim=0) 56 | if 'mean_q' in rawdata: 57 | 58 | Plotter.plot_scatter(data=rawdata, x='mean_q', y='Seqlen', title='Mean Q score vs length', 59 | xlab='Mean Q', ylab='length', alpha=0.5, xlim=rawdata['mean_q'].min() - 0.5, ylim=rawdata['Seqlen'].min() - 0.5) 60 | 61 | Plotter.close() 62 | 63 | 64 | if __name__ == "__main__": 65 | parser = argparse.ArgumentParser( 66 | description='Calculates the GC content and N50') 67 | 68 | parser.add_argument('--fastx', '-i', 69 | metavar='FILE', 70 | required=True, 71 | help='input file fastq or fasta') 72 | 73 | parser.add_argument('--raw', '-a', 74 | action='store_true', 75 | required=False, 76 | help='save raw the gc content per read/contig. default[False]') 77 | 78 | parser.add_argument('--savepath', '-s', 79 | metavar='DIR', 80 | required=False, 81 | default=None, 82 | help='output dir. default[cwd]') 83 | 84 | parser.add_argument('--report', '-r', 85 | # metavar="TRUE", 86 | action='store_true', 87 | required=False, 88 | default=None, 89 | help="Report PDF default[False]") 90 | 91 | parser.add_argument('--tag', '-n', 92 | metavar='STR', 93 | required=False, 94 | default=None, 95 | help='output name or tag. 
default[input name]') 96 | 97 | args = parser.parse_args() 98 | # print args 99 | 100 | main() 101 | -------------------------------------------------------------------------------- /scripts/reverse_fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Reverse (but not complement!) sequences and qualities in fastq file.') 12 | parser.add_argument('input_fastq', nargs='?', help='Input fastq (default: stdin).', 13 | type=argparse.FileType('r'), default=sys.stdin) 14 | parser.add_argument('output_fastq', nargs='?', help='Output fastq (default: stdout)', 15 | type=argparse.FileType('w'), default=sys.stdout) 16 | 17 | 18 | def reverse_seq_records(input_iterator): 19 | """Reverse SeqRecord objects. 20 | 21 | :param input_iterator: Iterator of SeqRecord objects. 22 | :returns: Generator of reversed SeqRecord objects. 23 | :rtype: generator 24 | """ 25 | for record in input_iterator: 26 | yield record[::-1] 27 | 28 | 29 | if __name__ == '__main__': 30 | args = parser.parse_args() 31 | 32 | input_iterator = seq_util.read_seq_records( 33 | args.input_fastq, format='fastq') 34 | output_iterator = reverse_seq_records(input_iterator) 35 | seq_util.write_seq_records( 36 | output_iterator, args.output_fastq, format='fastq') 37 | -------------------------------------------------------------------------------- /scripts/sequence_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Filter sequences by length and mean quality value.') 12 | parser.add_argument( 13 | '-i', metavar='in_format', type=str, help="Input format (fastq).", default='fastq') 14 | parser.add_argument( 15 | '-o', metavar='out_format', type=str, help="Output format (fastq).", default='fastq') 16 | parser.add_argument( 17 | '-q', metavar='min_qual', type=float, help="Minimum mean quality value (0.0).", default=0.0) 18 | parser.add_argument( 19 | '-l', metavar='min_length', type=int, help="Minimum length (0).", default=0) 20 | parser.add_argument( 21 | '-c', action='store_true', help="Reverse complement sequences.", default=False) 22 | parser.add_argument( 23 | '-u', metavar='max_length', type=int, help="Maximum length (None).", default=None) 24 | parser.add_argument('input_fastx', nargs='?', help='Input file (default: stdin).', 25 | type=argparse.FileType('r'), default=sys.stdin) 26 | parser.add_argument('output_fastx', nargs='?', help='Output file (default: stdout).', 27 | type=argparse.FileType('w'), default=sys.stdout) 28 | 29 | 30 | def record_filter(input_iter, in_format, min_qual, min_len, max_len, rev_comp): 31 | """ Filter SeqRecord objects by length and mean quality. 32 | 33 | :param input_iter: Iterator of SeqRecord objects. 34 | :param in_format: Input format. 35 | :param min_qual: Minimum mean quality. 36 | :param min_len: Minimum length. 37 | :param max_len: Maximum length. 38 | :param rev_comp: Reverse complement sequences if True. 39 | :returns: SeqRecord object. 
40 | :rtype: generator 41 | """ 42 | for record in input_iter: 43 | # Quality filtering: 44 | if in_format == 'fastq': 45 | mean_quality = seq_util.mean_qscore(record.letter_annotations["phred_quality"]) 46 | if mean_quality < min_qual: 47 | continue 48 | # Length filtering: 49 | if len(record) < min_len: 50 | continue 51 | if max_len is not None and len(record) > max_len: 52 | continue 53 | if rev_comp: 54 | record = record.reverse_complement() 55 | yield record 56 | 57 | 58 | if __name__ == '__main__': 59 | args = parser.parse_args() 60 | 61 | if args.i == 'fasta' and args.o == 'fastq': 62 | sys.stderr.write( 63 | "Cannot produce fastq output from fasta! Use fasta_to_mock_fastq.py instead.\n") 64 | sys.exit(1) 65 | 66 | input_iterator = seq_util.read_seq_records( 67 | args.input_fastx, format=args.i) 68 | 69 | output_iterator = record_filter(input_iterator, args.i, args.q, args.l, args.u, args.c) 70 | 71 | seq_util.write_seq_records(output_iterator, args.output_fastx, format=args.o) 72 | -------------------------------------------------------------------------------- /scripts/sequence_subtract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | from wub.util import seq as seq_util 8 | 9 | # Parse command line arguments: 10 | parser = argparse.ArgumentParser( 11 | description='Filter out sequences present in the first file from the second file.') 12 | parser.add_argument( 13 | '-i', metavar='in_format', type=str, help="Input format (fastq).", default='fastq') 14 | parser.add_argument( 15 | '-o', metavar='out_format', type=str, help="Output format (fastq).", default='fastq') 16 | parser.add_argument('input_fastx_bait', nargs='?', help='First input file (default: stdin).', 17 | type=argparse.FileType('r'), default=sys.stdin) 18 | parser.add_argument('input_fastx_target', nargs='?', help='Second input file.', 19 | type=argparse.FileType('r'), default=sys.stdin) 20 | parser.add_argument('output_fastx', nargs='?', help='Output file (default: stdout).', 21 | type=argparse.FileType('w'), default=sys.stdout) 22 | 23 | 24 | def _record_filter(input_iter_bait, input_iter_target): 25 | """ Filter out SeqRecord objects present in the first iterator. """ 26 | bait_ids = [read.id for read in input_iter_bait] 27 | for record in input_iter_target: 28 | if record.id not in bait_ids: 29 | yield record 30 | 31 | 32 | if __name__ == '__main__': 33 | args = parser.parse_args() 34 | 35 | input_iterator_bait = seq_util.read_seq_records( 36 | args.input_fastx_bait, format=args.i) 37 | 38 | input_iterator_target = seq_util.read_seq_records( 39 | args.input_fastx_target, format=args.i) 40 | 41 | output_iterator = _record_filter(input_iterator_bait, input_iterator_target) 42 | 43 | seq_util.write_seq_records(output_iterator, args.output_fastx, format=args.o) 44 | -------------------------------------------------------------------------------- /scripts/simulate_errors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | import numpy as np 8 | from Bio.Seq import Seq 9 | 10 | from wub.simulate import seq as sim_seq 11 | from wub.util import parse as parse_util 12 | from wub.util import seq as seq_util 13 | 14 | # Parse command line arguments: 15 | parser = argparse.ArgumentParser( 16 | description="""Simulate sequencing errors for each input sequence. 
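    Example invocation (file names are illustrative):

        simulate_errors.py -e 0.1 -w 1,1,4 -z 42 reads.fasta reads_with_errors.fasta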
17 | """) 18 | parser.add_argument('-e', metavar='error_rate', type=float, 19 | help="Total rate of substitutions insertions and deletions (0.1).", default=0.1) 20 | parser.add_argument('-w', metavar='error_weights', type=str, 21 | help="Relative frequency of substitutions,insertions,deletions (1,1,4).", default="1,1,4") 22 | parser.add_argument('-z', metavar='random_seed', type=int, 23 | help="Random seed (None).", default=None) 24 | parser.add_argument('input_fasta', nargs='?', help='Input fasta (default: stdin).', 25 | type=argparse.FileType('r'), default=sys.stdin) 26 | parser.add_argument('output_fasta', nargs='?', help='Output fasta (default: stdout)', 27 | type=argparse.FileType('w'), default=sys.stdout) 28 | 29 | 30 | def simulate_errors(input_iter, error_rate, error_weights): 31 | """Simulate sequencing errors for each SeqRecord object in the input iterator. 32 | 33 | :param input_iter: Iterator of SeqRecord objects. 34 | :para error_rate: Total error rate of substitutions, insertions and deletions. 35 | :param error_weights: Relative frequency of substitutions,insertions,deletions. 36 | :returns: Generator of SeqRecord objects. 37 | :rtype: generator 38 | """ 39 | for record in input_iter: 40 | mutated_seq = sim_seq.simulate_sequencing_errors(record.seq, error_rate, error_weights).seq 41 | record.seq = Seq(mutated_seq) 42 | yield record 43 | 44 | 45 | if __name__ == '__main__': 46 | args = parser.parse_args() 47 | 48 | # Set random seed: 49 | if args.z is not None: 50 | np.random.seed(args.z) 51 | 52 | # Process error weights: 53 | error_weights = np.array(parse_util.separated_list_to_floats(args.w)) 54 | # Normalise error weights to probabilities: 55 | error_weights = parse_util.normalise_array(error_weights) 56 | error_weights = dict( 57 | zip(['substitution', 'insertion', 'deletion'], error_weights)) 58 | 59 | input_iterator = seq_util.read_seq_records(args.input_fasta, format='fasta') 60 | 61 | simulation_iterator = simulate_errors(input_iterator, args.e, error_weights) 62 | 63 | seq_util.write_seq_records( 64 | simulation_iterator, args.output_fasta, format='fasta') 65 | -------------------------------------------------------------------------------- /scripts/simulate_genome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | import numpy as np 8 | 9 | from wub.simulate import genome as sim_genome 10 | from wub.util import parse as parse_util 11 | from wub.util import seq as seq_util 12 | 13 | # Parse command line arguments: 14 | parser = argparse.ArgumentParser( 15 | description="""Simulate genome sequence with the specified number of chromosomes, 16 | length distribution (truncated gamma) and base composition.""") 17 | parser.add_argument( 18 | '-n', metavar='nr_chrom', type=int, help="Number of chromosomes (23).", default=23) 19 | parser.add_argument('-m', metavar='mean_length', type=int, 20 | help="Mean length of chromosomes (5000000).", default=5000000) 21 | parser.add_argument( 22 | '-a', metavar='gamma_shape', type=float, help="Gamma shape parameter (1).", default=1.0) 23 | parser.add_argument( 24 | '-l', metavar='low_trunc', type=int, help="Lower truncation point (None).", default=None) 25 | parser.add_argument( 26 | '-u', metavar='high_trunc', type=int, help="Upper truncation point (None).", default=None) 27 | parser.add_argument('-b', metavar='base_freqs', type=str, 28 | help="Relative base frequencies in A,C,G,T order (1,1,1,1) or 
\"random\".", default="1,1,1,1") 29 | parser.add_argument('-z', metavar='random_seed', type=int, 30 | help="Random seed (None).", default=None) 31 | parser.add_argument('output_fasta', nargs='?', help='Output fasta (default: stdout)', 32 | type=argparse.FileType('w'), default=sys.stdout) 33 | 34 | 35 | if __name__ == '__main__': 36 | args = parser.parse_args() 37 | 38 | # Set random seed: 39 | if args.z is not None: 40 | np.random.seed(args.z) 41 | 42 | if args.b == "random": 43 | base_frequencies = np.random.uniform(size=4) 44 | base_frequencies = base_frequencies / np.sum(base_frequencies) 45 | else: 46 | base_frequencies = np.array(parse_util.separated_list_to_floats(args.b)) 47 | # Normalise relative base frequencies to probabilities: 48 | base_frequencies = parse_util.normalise_array(base_frequencies) 49 | 50 | simulation_iterator = sim_genome.simulate_genome( 51 | args.n, args.m, args.a, args.l, args.u, base_frequencies) 52 | seq_util.write_seq_records(simulation_iterator, args.output_fasta, format='fasta') 53 | -------------------------------------------------------------------------------- /scripts/simulate_sequences.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | 7 | import numpy as np 8 | 9 | from wub.simulate import seq as sim_seq 10 | from wub.util import parse as parse_util 11 | from wub.util import seq as seq_util 12 | 13 | # Parse command line arguments: 14 | parser = argparse.ArgumentParser( 15 | description="""Simulate sequences of fixed length and specified base composition.""") 16 | parser.add_argument( 17 | '-n', metavar='nr_seq', type=int, help="Number of sequences (1).", default=1) 18 | parser.add_argument('-m', metavar='length', type=int, 19 | help="Length of simulated sequences (3000).", default=3000) 20 | parser.add_argument('-b', metavar='base_freqs', type=str, 21 | help="Relative base frequencies in A,C,G,T order (1,1,1,1).", default="1,1,1,1") 22 | parser.add_argument('-z', metavar='random_seed', type=int, 23 | help="Random seed (None).", default=None) 24 | parser.add_argument('output_fasta', nargs='?', help='Output fasta (default: stdout)', 25 | type=argparse.FileType('w'), default=sys.stdout) 26 | 27 | 28 | if __name__ == '__main__': 29 | args = parser.parse_args() 30 | 31 | # Set random seed: 32 | if args.z is not None: 33 | np.random.seed(args.z) 34 | 35 | base_frequencies = np.array(parse_util.separated_list_to_floats(args.b)) 36 | # Normalise relative base frequencies to probabilities: 37 | base_frequencies = parse_util.normalise_array(base_frequencies) 38 | 39 | simulation_iterator = (seq_util.new_dna_record(sim_seq.simulate_sequence(args.m, base_frequencies), "seq_{}".format(i)) for i in range(args.n)) 40 | 41 | seq_util.write_seq_records(simulation_iterator, args.output_fasta, format='fasta') 42 | -------------------------------------------------------------------------------- /scripts/split_fastx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import six 5 | import argparse 6 | import sys 7 | from os import path 8 | 9 | from wub.util import seq as seq_util 10 | 11 | # Parse command line arguments: 12 | parser = argparse.ArgumentParser( 13 | description='Split sequence records in file to one record per file or batches of records.') 14 | parser.add_argument( 15 | '-i', metavar='in_format', type=str, help="Input format 
(fastq).", default='fastq') 16 | parser.add_argument( 17 | '-o', metavar='out_format', type=str, help="Output format (fastq).", default='fastq') 18 | parser.add_argument( 19 | '-b', metavar='batch_size', type=int, help="Batch size (None).", default=None) 20 | parser.add_argument('input_fastx', nargs='?', help='Input file (default: stdin).', 21 | type=argparse.FileType('r'), default=sys.stdin) 22 | parser.add_argument('output_dir', nargs='?', help='Output directory (default: .)', default='.') 23 | 24 | 25 | def batch_iterator(iterator, batch_size): 26 | """Returns lists of length batch_size. 27 | Taken from the biopython wiki: http://biopython.org/wiki/Split_large_file 28 | 29 | This is a generator function, and it returns lists of the 30 | entries from the supplied iterator. Each list will have 31 | batch_size entries, although the final list may be shorter. 32 | 33 | :param iterator: Input iterator. 34 | :param batch_size: Batch size. 35 | :returns: Generator of lists. 36 | :rtype: generator 37 | """ 38 | entry = True # Make sure we loop once 39 | while entry: 40 | batch = [] 41 | while len(batch) < batch_size: 42 | try: 43 | entry = six.next(iterator) 44 | except StopIteration: 45 | entry = None 46 | if entry is None: 47 | # End of file 48 | break 49 | batch.append(entry) 50 | if batch: 51 | yield batch 52 | 53 | 54 | if __name__ == '__main__': 55 | args = parser.parse_args() 56 | 57 | input_iterator = seq_util.read_seq_records( 58 | args.input_fastx, format=args.i) 59 | 60 | if args.b is None: 61 | # Splitting one record per file: 62 | for record in input_iterator: 63 | bn = path.basename(args.input_fastx.name) 64 | ext = bn.rsplit('.', 1)[-1] 65 | fh = open(path.join(args.output_dir, "{}.{}".format(record.id, ext)), 'w') 66 | seq_util.write_seq_records([record], fh, format=args.o) 67 | fh.flush() 68 | fh.close() 69 | else: 70 | # Split into batches: 71 | input_iterator = batch_iterator(input_iterator, args.b) 72 | i = 0 73 | for records in input_iterator: 74 | bn = path.basename(args.input_fastx.name) 75 | fh = open(path.join(args.output_dir, "batch_{}_{}".format(i, bn)), 'w') 76 | seq_util.write_seq_records(records, fh, format=args.o) 77 | fh.flush() 78 | fh.close() 79 | i += 1 80 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.5.1 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:wub/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | 20 | [aliases] 21 | test = pytest 22 | 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup, find_packages 5 | from glob import glob 6 | 7 | with open('README.md') as readme_file: 8 | readme = readme_file.read() 9 | 10 | requirements = [ 11 | 'six', 12 | 'pytest', 13 | 'pycmd', 14 | 'biopython', 15 | 'numpy', 16 | 'matplotlib', 17 | 'seaborn', 18 | 'editdistance', 19 | 'pandas>=0.20.2', 20 | 'pysam', 21 | 'tqdm', 22 | 'statsmodels' 23 | ] 24 | 25 | test_requirements = [ 26 | 'pytest', 27 | 'pycmd', 28 | 
'editdistance', 29 | 'numpy', 30 | ] 31 | 32 | setup( 33 | name='Wub', 34 | version='0.5.1', 35 | description="Tools and software components developed by the ONT Applications group.", 36 | long_description=readme, 37 | author="ONT Applications Group", 38 | author_email='Apps@nanoporetech.com', 39 | url='', 40 | packages=find_packages(exclude=["scripts"]), 41 | package_dir={'wub': 42 | 'wub'}, 43 | include_package_data=True, 44 | install_requires=requirements, 45 | zip_safe=False, 46 | keywords='wub', 47 | classifiers=[ 48 | 'Development Status :: 2 - Pre-Alpha', 49 | 'Intended Audience :: Developers', 50 | 'Natural Language :: English', 51 | "Programming Language :: Python :: 2", 52 | 'Programming Language :: Python :: 2.6', 53 | 'Programming Language :: Python :: 2.7', 54 | 'Programming Language :: Python :: 3.4', 55 | ], 56 | tests_require=test_requirements, 57 | scripts=[x for x in glob('scripts/*.py') if x != 'scripts/__init__.py'] 58 | ) 59 | -------------------------------------------------------------------------------- /wub/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'ONT Applications Group' 4 | __email__ = 'Apps@nanoporetech.com' 5 | __version__ = '0.5.1' 6 | -------------------------------------------------------------------------------- /wub/bam/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/bam/__init__.py -------------------------------------------------------------------------------- /wub/bam/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pysam 4 | 5 | 6 | def pysam_open(alignment_file, in_format='BAM'): 7 | """Open SAM/BAM file using pysam. 8 | 9 | :param alignment_file: Input file. 10 | :param in_format: Format (SAM or BAM). 11 | :returns: pysam.AlignmentFile 12 | :rtype: pysam.AlignmentFile 13 | """ 14 | if in_format == 'BAM': 15 | mode = "rb" 16 | elif in_format == 'SAM': 17 | mode = "r" 18 | else: 19 | raise Exception("Invalid format: {}".format(in_format)) 20 | 21 | aln_iter = pysam.AlignmentFile(alignment_file, mode) 22 | return aln_iter 23 | -------------------------------------------------------------------------------- /wub/bam/filter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Filter SAM/BAM records by various criteria.""" 3 | 4 | import itertools 5 | 6 | 7 | def get_alignment_score(segement): 8 | """Get alignment score from pysam segment. 9 | 10 | :param segment: Pysam aligned segment. 11 | :returns: Alignment score. 12 | :rtype: int 13 | """ 14 | 15 | score = 0 16 | try: 17 | score = segement.get_tag('AS') 18 | except: 19 | pass 20 | return score 21 | 22 | 23 | def filter_top_per_query(records_iter): 24 | """Filter pysam records keeping top scoring per query. Assumes 25 | records are sorted by name. 26 | 27 | :param records_iter: Iterator of pysam aligned segments. 28 | :returns: Generator of filtered records. 
29 | :rtype: generator 30 | """ 31 | buff = [] 32 | for rec in itertools.chain(records_iter, [None]): 33 | if len(buff) == 0: 34 | buff.append(rec) 35 | elif rec is None or buff[-1].query_name != rec.query_name: 36 | sorted_buff = sorted(buff, key=get_alignment_score, reverse=True) 37 | buff = [rec] 38 | yield sorted_buff[0] 39 | else: 40 | buff.append(rec) 41 | 42 | 43 | def filter_query_coverage(records_iter, minimum_coverage): 44 | """Filter pysam records keeping the ones with sufficient query coverage. 45 | 46 | :param records_iter: Iterator of pysam aligned segments. 47 | :param minimum_coverage: Minimum fraction of covered query. 48 | :returns: Generator of filtered records. 49 | :rtype: generator 50 | """ 51 | for rec in records_iter: 52 | if rec.is_unmapped: 53 | yield rec 54 | elif (float(rec.query_alignment_length) / rec.infer_query_length()) >= minimum_coverage: 55 | yield rec 56 | 57 | 58 | def filter_ref_coverage(records_iter, minimum_coverage, header): 59 | """Filter pysam records keeping the ones with sufficient reference coverage. 60 | 61 | :param records_iter: Iterator of pysam aligned segments. 62 | :param minimum_coverage: Minimum fraction of covered reference. 63 | :param header: SAM header with reference lengths. 64 | :returns: Generator of filtered records. 65 | :rtype: generator 66 | """ 67 | ref_lengths = dict((h['SN'], int(h['LN'])) for h in header['SQ']) 68 | for rec in records_iter: 69 | if rec.is_unmapped: 70 | yield rec 71 | elif (float(rec.query_alignment_length) / ref_lengths[rec.reference_name]) >= minimum_coverage: 72 | yield rec 73 | -------------------------------------------------------------------------------- /wub/bam/read_counter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Count reads per reference in BAM/SAM file.""" 3 | import sys 4 | 5 | import pysam 6 | import numpy as np 7 | from collections import defaultdict 8 | from wub.util import seq as seq_util 9 | import tqdm 10 | 11 | 12 | def count_reads(alignment_file, in_format='BAM', min_aln_qual=0, verbose=False, reads_gc=False): 13 | """Count reads mapping to references in a BAM file. 14 | 15 | :param alignment_file: BAM file. 16 | :param min_aln_qual: Minimum mapping quality. 17 | :param verbose: Verbose if True. 18 | :param read_gc: Calculate mean GC content of reads for each reference. 19 | :returns: Dictionary with read counts per reference and read GC contents. 
20 | :rtype: tuple of dicts 21 | """ 22 | counts = defaultdict(int) 23 | gc_means = defaultdict(list) 24 | if in_format == 'BAM': 25 | mode = "rb" 26 | elif in_format == 'SAM': 27 | mode = "r" 28 | else: 29 | raise Exception("Invalid format: {}".format(in_format)) 30 | 31 | aln_iter = pysam.AlignmentFile(alignment_file, mode) 32 | 33 | if verbose and in_format == "BAM": 34 | try: 35 | total_reads = aln_iter.mapped + aln_iter.unmapped 36 | except: 37 | total_reads = None 38 | sys.stdout.write( 39 | "Gathering read statistics from file: {}\n".format(alignment_file)) 40 | if in_format == "BAM": 41 | aln_iter = tqdm.tqdm(aln_iter, total=total_reads) 42 | 43 | for segment in aln_iter: 44 | if segment.is_unmapped: 45 | continue 46 | if segment.mapping_quality >= min_aln_qual: 47 | counts[segment.reference_name] += 1 48 | if reads_gc: 49 | gc_means[segment.reference_name].append(seq_util.gc_content(segment.query_alignment_sequence)) 50 | 51 | gc_cont = {} 52 | if reads_gc: 53 | # Calculate mean of mean GC contents: 54 | for trs, gc_ms in gc_means.items(): 55 | gc_cont[trs] = np.mean(gc_ms) 56 | aln_iter.close() 57 | 58 | return dict(counts), gc_cont 59 | 60 | 61 | def count_reads_realtime(alignment_file='-', in_format='SAM', min_aln_qual=0, yield_freq=1, verbose=False): 62 | """Online counting of reads mapping to references in a SAM/BAM stream from stdin. 63 | 64 | :param alignment_file: BAM file (stdin). 65 | :param min_aln_qual: Minimum mapping quality. 66 | :param yield_freq: Yield frequency. 67 | :param verbose: Minimum mapping quality. 68 | :returns: Generator of dictionary with read counts per reference. 69 | :rtype: generator 70 | """ 71 | counts = defaultdict(int) 72 | if in_format == 'BAM': 73 | mode = "rb" 74 | elif in_format == 'SAM': 75 | mode = "r" 76 | else: 77 | raise Exception("Invalid format: {}".format(in_format)) 78 | 79 | aln_iter = pysam.AlignmentFile(alignment_file, mode) 80 | 81 | if verbose: 82 | sys.stdout.write( 83 | "Online counting of read statistics from file: {}\n".format(alignment_file)) 84 | aln_iter = iter(tqdm.tqdm(aln_iter)) 85 | 86 | nr_mapped = 0 87 | while True: 88 | try: 89 | segment = aln_iter.next() 90 | except StopIteration: 91 | # Final yield: 92 | yield counts 93 | return 94 | 95 | if segment.is_unmapped: 96 | continue 97 | if segment.mapping_quality >= min_aln_qual: 98 | counts[segment.reference_name] += 1 99 | nr_mapped += 1 100 | 101 | if nr_mapped % yield_freq == 0: 102 | yield counts 103 | 104 | aln_iter.close() 105 | -------------------------------------------------------------------------------- /wub/bam/sam_writer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import six 4 | from collections import OrderedDict 5 | 6 | 7 | class SamWriter: 8 | 9 | """ Simple class to write SAM files. 
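    Minimal usage sketch (file name, header and record values are illustrative):

        header = {'SQ': [{'SN': 'chr1', 'LN': 1000}]}
        writer = SamWriter('out.sam', header=header)
        rec = writer.new_sam_record('read_1', 0, 'chr1', 1, 60, '10M', '*', 0, 0,
                                    'ACGTACGTAC', 'IIIIIIIIII', 'NM:i:0')
        writer.write(rec)
        writer.close()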
""" 10 | 11 | def __init__(self, out_file, header=None): 12 | """ Initialise SAM writer object """ 13 | self.out_file = out_file 14 | self.header = header 15 | self.out_handler = open(out_file, 'w') 16 | if header is not None: 17 | self._write_header() 18 | 19 | def _write_header(self): 20 | """Write SAM header.""" 21 | for record_type, records in six.iteritems(self.header): 22 | for record in records: 23 | self.out_handler.write("@{}".format(record_type)) 24 | for key, value in six.iteritems(record): 25 | self.out_handler.write("\t{}:{}".format(key, value)) 26 | self.out_handler.write("\n") 27 | 28 | def new_sam_record(self, qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual, tags): 29 | """Create new SAM record structure. 30 | 31 | :param self: object 32 | :param qname: Read name. 33 | :param rname: Reference name. 34 | :param pos: Position in reference. 35 | :param mapq: Mapping quality. 36 | :param cigar: CIGAR string. 37 | :param rnext: Reference of next read. 38 | :param pnext: Position of next read. 39 | :param tlen: Template length. 40 | :param seq: Read sequence. 41 | :param qual: Base qualities. 42 | :param tags: Optional tags. 43 | :returns: SAM record. 44 | :rtype: OrderedDict 45 | """ 46 | record = OrderedDict() 47 | 48 | record['QNAME'] = qname 49 | record['FLAG'] = flag 50 | record['RNAME'] = rname 51 | record['POS'] = pos 52 | record['MAPQ'] = mapq 53 | record['CIGAR'] = cigar 54 | record['RNEXT'] = rnext 55 | record['PNEXT'] = pnext 56 | record['TLEN'] = tlen 57 | record['SEQ'] = seq 58 | record['QUAL'] = qual 59 | record['TAGS'] = tags 60 | 61 | return record 62 | 63 | def write(self, record): 64 | """Write SAM record to file. 65 | 66 | :param self: object 67 | :param record: SAM record. 68 | :returns: None 69 | :rtype: object 70 | """ 71 | self.out_handler.write("{}\n".format("\t".join(map(lambda x: str(x), six.itervalues(record))))) 72 | 73 | def close(self): 74 | """Close SAM file. 75 | 76 | :param self: object 77 | :returns: None 78 | :rtype: object 79 | """ 80 | self.out_handler.flush() 81 | self.out_handler.close() 82 | -------------------------------------------------------------------------------- /wub/mappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/mappers/__init__.py -------------------------------------------------------------------------------- /wub/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/parsers/__init__.py -------------------------------------------------------------------------------- /wub/parsers/blastn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ Parser functions for blastn outfmt 6. """ 3 | 4 | 5 | def _parse_coord_line(line): 6 | """ Parse a line from a blast outfmt 6 file. 
""" 7 | fields = line.split() 8 | aln_record = { 9 | 'query': fields[0], 10 | 'ref': fields[1], 11 | 'identity': float(fields[2]), 12 | 'aln_length': int(fields[3]), 13 | 'mismatch': int(fields[4]), 14 | 'gapopen': int(fields[5]), 15 | 'query_start': int(fields[6]), 16 | 'query_end': int(fields[7]), 17 | 'ref_start': int(fields[8]), 18 | 'ref_end': int(fields[9]), 19 | 'evalue': float(fields[10]), 20 | 'bitscore': float(fields[11]), 21 | 'strand': '+' 22 | } 23 | 24 | if aln_record['ref_start'] > aln_record['ref_end']: 25 | aln_record['strand'] = '-' 26 | aln_record['ref_start'], aln_record['ref_end'] = aln_record[ 27 | 'ref_end'], aln_record['ref_start'] 28 | return aln_record 29 | 30 | 31 | def parse_coords(input_object): 32 | """ Parse coordinates file produced by blastn outfmt 6. 33 | 34 | :param input_object: Input path or file hanlder. 35 | :returns: List of dictionaries with parsed records. 36 | :rtype: list 37 | """ 38 | if type(input_object) == str: 39 | input_object = open(input_object, 'r') 40 | records = [] 41 | for line in input_object: 42 | line = line.strip() 43 | records.append(_parse_coord_line(line)) 44 | return records 45 | -------------------------------------------------------------------------------- /wub/parsers/mummer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ Parser functions for mummer. """ 3 | 4 | 5 | def _parse_coord_line(line): 6 | """ Parse a line from a mummer coordinate file. """ 7 | fields = line.replace("|", "").split() 8 | aln_record = { 9 | 'ref_start': int(fields[0]), 10 | 'ref_end': int(fields[1]), 11 | 'query_start': int(fields[2]), 12 | 'query_end': int(fields[3]), 13 | 'ref_len': int(fields[4]), 14 | 'query_len': int(fields[5]), 15 | 'identity': float(fields[6]), 16 | 'ref': fields[7], 17 | 'query': fields[8], 18 | } 19 | return aln_record 20 | 21 | 22 | def parse_coords(input_object): 23 | """ Parse coordinates file produced by mummer. 24 | 25 | :param input_object: Input path or file hanlder. 26 | :returns: List of dictionaries with parsed records. 27 | :rtype: list 28 | """ 29 | if type(input_object) == str: 30 | input_object = open(input_object, 'r') 31 | records = [] 32 | for line in input_object: 33 | line = line.strip() 34 | if line.count('/') > 0: 35 | continue 36 | if line.count('NUCMER') > 0: 37 | continue 38 | if line.count('[') > 0: 39 | continue 40 | if line.count('=') > 0: 41 | continue 42 | if len(line) == 0: 43 | continue 44 | records.append(_parse_coord_line(line)) 45 | return records 46 | -------------------------------------------------------------------------------- /wub/read_stats/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/read_stats/__init__.py -------------------------------------------------------------------------------- /wub/read_stats/contig_stats.py: -------------------------------------------------------------------------------- 1 | from Bio import SeqIO 2 | import pandas as pd 3 | from wub.util.misc import _getextension 4 | from wub.util.seq import mean_qscore 5 | 6 | 7 | def readfast(fast): 8 | """ reads a fasta or fastq file. 
9 | 10 | :param fast: fastq or fasta 11 | :return: list of records with attr 12 | :rtype: generator object 13 | 14 | """ 15 | 16 | extension = _getextension(fast) 17 | for rec in SeqIO.parse(open(fast), extension): 18 | 19 | yield rec 20 | 21 | 22 | def _cumsum(df, col): 23 | ''' 24 | Calculates the cumulative sum of column 25 | 26 | :param df: dataframe with sequence length 27 | :param col: identify the sequence length 28 | :return: dataframe with cumulative sum of length 29 | :rtype: dataframe 30 | ''' 31 | df = df.sort_values(by=col, ascending=False).reset_index(drop=True) 32 | df['cumsum'] = df[col].cumsum() 33 | return df 34 | 35 | 36 | def N50(df, col, percent=50): 37 | """ Calculate the N50 by default however, by changing percent to 75, N75 can be calculated. 38 | 39 | :param df: dataframe with seqlen column 40 | :param col: column with sequence length 41 | :param percent: percentage to be calculated 42 | :return: N50 Value 43 | :rtype: int 44 | 45 | """ 46 | df1 = _cumsum(df, col) 47 | df1['cumsum'] = df1[col].cumsum() 48 | n50 = df1['cumsum'].max() * percent / 100 49 | return df1.where(df1['cumsum'] >= n50)[col].dropna().head(1).tolist()[0] 50 | 51 | 52 | def L50(df, col, percent=50): 53 | """ Calculate the L50 by default however, by changing percent to 75, N75 can be calculated 54 | 55 | :param df: dataframe with seqlen column 56 | :param col: column with sequence length 57 | :param percent: percentage to be calculated 58 | :return: N50 Value 59 | :rtype: int 60 | 61 | """ 62 | 63 | df1 = _cumsum(df, col).copy() 64 | return df1[df1 >= N50(df, col, percent)][col].count() 65 | 66 | 67 | def GC_per_read(seq_rec, fq=False): 68 | """ Calculates the number of bases per sequence, GC content and mean Q score if fastq is given 69 | 70 | :param seq_rec: sequence records with attr from biopython 71 | :param fq: boolean 72 | :return: dataframe 73 | :rtype: dataframe 74 | """ 75 | 76 | d = [] 77 | 78 | bases = ["A", "T", "C", "G", 'N'] 79 | # total_lengths = 0 80 | for rec in seq_rec: 81 | tmp = {"SeqID": rec.id, "Seqlen": len(rec.seq), "A": 0, "T": 0, "G": 0, "C": 0, "N": 0} 82 | for base in bases: 83 | tmp[base] += rec.seq.count(base) 84 | 85 | if fq: 86 | tmp['mean_q'] = round(mean_qscore(rec.letter_annotations[ 87 | "phred_quality"], qround=False), 2) 88 | 89 | d.append(tmp) 90 | 91 | raw = pd.DataFrame(d).set_index('SeqID') 92 | raw['GC content (%)'] = raw.apply(lambda x: float( 93 | (x['G']) + x['C']) / x['Seqlen'] * 100.0, axis=1) 94 | 95 | for base in bases: 96 | raw[base + ' (%)'] = (raw[base] / raw["Seqlen"]) * 100.0 97 | raw["other base"] = raw['Seqlen'] - raw[bases].sum(axis=1) 98 | return raw 99 | 100 | 101 | def get_stats(df): 102 | """ Calcualtes the summary stats 103 | 104 | :param df: dataframe from GC_per_read 105 | :return: summary Series 106 | :rtype: Series 107 | 108 | """ 109 | stats = pd.Series({}) 110 | df = df.copy() 111 | Mbase = 1000000.0 112 | 113 | bases = ["A", "T", "C", "G", 'N'] 114 | 115 | total_len = int(df["Seqlen"].sum()) 116 | total_bases = df[bases].sum().sum() 117 | 118 | stats['N75'] = N50(df, 'Seqlen', 75) 119 | stats['N50'] = N50(df, 'Seqlen', 50) 120 | stats['N25'] = N50(df, 'Seqlen', 25) 121 | 122 | stats['L75'] = L50(df, "Seqlen", 75) 123 | stats['L50'] = L50(df, "Seqlen", 50) 124 | stats['L25'] = L50(df, "Seqlen", 25) 125 | 126 | stats['Max contig'] = df['Seqlen'].max() 127 | stats['Min contig'] = df['Seqlen'].min() 128 | stats['Avg length'] = df['Seqlen'].mean() 129 | stats['Length SD'] = df['Seqlen'].std() 130 | stats['Total length (Mb)'] = 
total_len / Mbase 131 | 132 | stats['Total bases (Mb)'] = total_len / Mbase 133 | stats['Other bases (Mb)'] = (total_len - total_bases) / Mbase 134 | stats['No. contigs'] = df['Seqlen'].count() 135 | 136 | stats["Greater then 10 Kb"] = df[df['Seqlen'] >= 10000.0].Seqlen.count() 137 | stats["Greater then 100 Kb"] = df[df['Seqlen'] >= 100000.0].Seqlen.count() 138 | stats["Greater then 500 Kb"] = df[df['Seqlen'] >= 500000.0].Seqlen.count() 139 | stats["Greater then 1 Mb"] = df[df['Seqlen'] >= 1000000.0].Seqlen.count() 140 | 141 | stats['Yield > 10kb (Mb)'] = df[df['Seqlen'] >= 10000.0]['Seqlen'].sum() / Mbase 142 | stats['Yield > 50kb (Mb)'] = df[df['Seqlen'] >= 50000.0]['Seqlen'].sum() / Mbase 143 | 144 | if 'mean_q' in df.columns: 145 | stats['Max Qscore'] = df['mean_q'].max() 146 | stats['Min Qscore'] = df['mean_q'].min() 147 | stats['Avg Qscore'] = df['mean_q'].mean() 148 | stats['Qscore SD'] = df['mean_q'].std() 149 | stats['Yield >Q6 (Mb)'] = df[df['mean_q'] >= 6.0]['Seqlen'].sum() / Mbase 150 | stats['Yield >Q9 (Mb)'] = df[df['mean_q'] >= 9.0]['Seqlen'].sum() / Mbase 151 | 152 | stats["GC content"] = float(df[['G', "C"]].sum().sum()) / total_len * 100.0 153 | for base in bases: 154 | 155 | stats[base + ' (%)'] = float(df[base].sum()) / total_len * 100.0 156 | 157 | return stats.round(2) 158 | -------------------------------------------------------------------------------- /wub/simulate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/simulate/__init__.py -------------------------------------------------------------------------------- /wub/simulate/dist.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | 5 | """Sample from various distributions.""" 6 | 7 | 8 | def sample_truncated_gamma(mean, shape, low=None, high=None): 9 | """A naive rejection approach to sample from truncated gamma distribution. 10 | Note that truncation points ae included in the sample. 11 | 12 | :param mean: Mean of the distribution. 13 | :param shape: Shape parameter. 14 | :param low: Lower truncation point. 15 | :param high: Upper truncation point. 16 | :returns: Random sample from the specified distribution. 17 | :rtype: float 18 | 19 | """ 20 | 21 | scale = float(mean) / shape 22 | while True: 23 | sample = np.random.gamma(scale=scale, shape=shape, size=1) 24 | if low is not None and sample < low: 25 | continue 26 | if high is not None and sample > high: 27 | continue 28 | return float(sample) 29 | -------------------------------------------------------------------------------- /wub/simulate/genome.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import six 4 | import sys 5 | import numpy as np 6 | from collections import OrderedDict, namedtuple 7 | 8 | from wub.util import seq as seq_util 9 | from wub.simulate import seq as sim_seq 10 | from wub.simulate import dist 11 | 12 | Fragment = namedtuple('Fragment', 'chrom uid start end seq') 13 | 14 | 15 | def simulate_genome(number_chromosomes, mean_length, gamma_shape, low_truncation, high_truncation, base_frequencies): 16 | """Generator function for simulating chromosomes in a genome. 17 | Chromosome lengths are sampled from a truncated gamma distribution. 18 | 19 | :param number_chromosomes: Number of simulated chromosomes. 
20 | :param mean_length: Mean length of simulated chromosomes. 21 | :param gamma_shape: Shape parameter of the chromosome length distribution. 22 | :param low_truncation: Minimum chromosome length. 23 | :param high_truncation: Maximum chromosome length. 24 | :param base_frequencies: Array of base frequencies in the ACGT order. 25 | :returns: A generator of SeqRecord objects. 26 | :rtype: generator 27 | 28 | """ 29 | chrom_info = OrderedDict( 30 | ('chr' + str(i), 31 | int(dist.sample_truncated_gamma(mean_length, gamma_shape, low_truncation, high_truncation))) 32 | for i in range(number_chromosomes)) 33 | sim_iter = (seq_util.new_dna_record(sim_seq.simulate_sequence(length, base_frequencies), name) 34 | for name, length in six.iteritems(chrom_info)) 35 | return sim_iter 36 | 37 | 38 | def sample_chromosome(chromosomes): 39 | """Sample a random chromosome. 40 | 41 | :param chromosomes: A collection of SeqRecord object. 42 | :returns: A randomly sampled element from the input collection. 43 | :rtype: SeqRecord 44 | """ 45 | indexes = range(len(chromosomes)) 46 | pick = np.random.choice(indexes) 47 | return chromosomes[pick] 48 | 49 | 50 | def simulate_fragment(chromosome, mean_length, gamma_shape, low_truncation, high_truncation, fragment_number): 51 | """Simulate a fragment from a chromosome. 52 | 53 | :param chromosome: Chromosome to simulate fragment from, SeqRecord object. 54 | :param mean_length: Mean length of simulated fragment. 55 | :param gamma_shape: Shape parameter of length distribution. 56 | :param low_truncation: Minimum read length. 57 | :param high_truncation: Maximum read length. 58 | :param fragment_number: The unique identifier of fragment in simulation (number of fragment). 59 | :returns: A named tuple with chromosome id, fragment number, start, end and sequence. 60 | :rtype: namedtuple 61 | """ 62 | fragment_length = int(dist.sample_truncated_gamma( 63 | mean_length, gamma_shape, low_truncation, high_truncation)) 64 | upper_boundary = len(chromosome) - fragment_length 65 | # Special case when upper boundary is less than the read length. Maybe 66 | # should handle this by rejection? 67 | if upper_boundary < fragment_length: 68 | start = 0 69 | end = len(chromosome) 70 | else: 71 | start = np.random.randint(0, upper_boundary) 72 | end = start + fragment_length 73 | fragment_sequence = chromosome.seq[start:end] 74 | return Fragment(chromosome.id, fragment_number, start, end, fragment_sequence) 75 | 76 | 77 | def simulate_fragments(chromosomes, mean_length, gamma_shape, low_truncation, high_truncation, number_fragments): 78 | """Simulate a fragments from a set of chromosomes. Chromosomes are picked randomly for each fragment. 79 | 80 | :param chromosomes: Chromosomes to simulate fragment from, a list of SeqRecord objects. 81 | :param mean_length: Mean length of simulated fragments. 82 | :param gamma_shape: Shape parameter of length distribution. 83 | :param low_truncation: Minimum read length. 84 | :param high_truncation: Maximum read length. 85 | :param number_fragments: Number of fragments to simulate. 86 | :returns: An iterator named tuples with chromosome id, fragment number, start, end and sequence. 
87 | :rtype: generator 88 | """ 89 | fragment_uid = 0 90 | while True: 91 | if fragment_uid >= number_fragments: 92 | break 93 | chromosome = sample_chromosome(chromosomes) 94 | fragment = simulate_fragment( 95 | chromosome, mean_length, gamma_shape, low_truncation, high_truncation, fragment_uid) 96 | if (fragment.end - fragment.start) > 0: 97 | fragment_uid += 1 98 | yield fragment 99 | else: 100 | sys.stderr.write( 101 | "Skipped zero length fragment! Consider increase minimum read length!\n") 102 | -------------------------------------------------------------------------------- /wub/simulate/seq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import functools 5 | from collections import namedtuple 6 | 7 | from wub.util import seq as seq_util 8 | 9 | uniform_probs = [0.25, 0.25, 0.25, 0.25] 10 | 11 | strand_directions = ['+', '-'] 12 | 13 | MutatedSeq = namedtuple( 14 | 'MutatedSeq', 'seq real_qual real_subst real_del real_ins cigar') 15 | 16 | cigar_operations = {'match': 'M', 'substitution': 'M', 'insertion': 'I', 'deletion': 'D'} 17 | 18 | 19 | def sample_direction(forward_prob): 20 | return np.random.choice(strand_directions, p=[forward_prob, 1 - forward_prob]) 21 | 22 | 23 | def random_base(probs=uniform_probs): 24 | """Generate a random DNA base. 25 | 26 | :param probs: Probabilities of sampling a base, in the ACGT order. 27 | :returns: A sampled base. 28 | :rtype: str 29 | """ 30 | return np.random.choice(seq_util.bases, p=probs) 31 | 32 | 33 | def random_base_except(excluded, probs=uniform_probs): 34 | """Generate a random base according to the specified probabilities with the exclusion of the specified base. 35 | 36 | :param excluded: Exclude this base from sampling. 37 | :param probs: Base sampling probabilities in the ACGT order. 38 | :returns: A sampled base. 39 | :rtype: str 40 | """ 41 | if len(probs) != len(seq_util.bases): 42 | raise ValueError('Probability vector has wrong length!') 43 | # Filter out excluded base: 44 | bp_dict = dict((x, y) 45 | for x, y in zip(seq_util.bases, probs) if x != excluded) 46 | filtered_bases = list(bp_dict.keys()) 47 | norm_probs = np.array(list(bp_dict.values()), dtype=float) 48 | # Re-normalise probabilities: 49 | norm_probs = norm_probs / np.sum(norm_probs) 50 | return np.random.choice(filtered_bases, p=norm_probs) 51 | 52 | 53 | def simulate_sequence(length, probs=uniform_probs): 54 | """Simulate sequence of specified length and base composition. 55 | 56 | :param length: Length of simulated sequence. 57 | :param probs: Base composition vector in the ACGT order. 58 | :returns: Simulated sequence. 59 | :rtype: str 60 | """ 61 | return ''.join(np.random.choice(seq_util.bases, size=length, p=probs)) 62 | 63 | 64 | def sample_error_type(error_weights): 65 | """Sample error type from error weights dictionary. 66 | 67 | :param error_weights: A dcitionary with (type, probability) pairs. 68 | :returns: Error type 69 | :rtype: str 70 | """ 71 | return np.random.choice(list(error_weights.keys()), p=list(error_weights.values())) 72 | 73 | 74 | def cigar_list_to_string(cigar_list): 75 | """Sample error type from error weights dictionary. 76 | 77 | :param error_weights: A dcitionary with (type, probability) pairs. 
78 | :returns: CIGAR string. 79 | :rtype: str 80 | """ 81 | 82 | tmp = map(lambda x: str(x[0]) + str(x[1]), cigar_list) 83 | return ''.join(tmp) 84 | 85 | 86 | def compress_raw_cigar_list(raw_cigar): 87 | """Compress a raw CIGAR list by merging consecutive operations of the same type. 88 | 89 | :param raw_cigar: A list of (length, operation) tuples. 90 | :returns: Compressed list of (length, operation) tuples. 91 | :rtype: list 92 | """ 93 | 94 | raw_cigar[0] = [raw_cigar[0]] 95 | 96 | def cigar_op_compose(a, b): 97 | x = a.pop() 98 | if x[1] == b[1]: 99 | a.append((x[0] + b[0], x[1])) 100 | else: 101 | a.extend([x, b]) 102 | return a 103 | 104 | cigar = functools.reduce(cigar_op_compose, raw_cigar) 105 | return cigar 106 | 107 | 108 | def simulate_sequencing_errors(sequence, error_rate, error_weights): 109 | """Simulate substitutions, deletions and insertions. 110 | 111 | :param sequence: Input sequence. 112 | :param error_rate: Total error rate. 113 | :param error_weights: A dictionary with error types as keys and probabilities as values. 114 | The possible error types are: substitution, deletion, insertion. 115 | :returns: A named tuple with elements: mutated sequence, realised quality, number of realised substitutions, 116 | number of realised deletions, number of realised insertions, cigar string. 117 | :rtype: namedtuple 118 | """ 119 | if len(sequence) == 0: 120 | raise Exception('Cannot simulate sequencing errors on empty sequence!') 121 | 122 | new_bases = [] 123 | 124 | realised_substitutions = 0 125 | realised_deletions = 0 126 | realised_insertions = 0 127 | raw_cigar_list = [] 128 | 129 | for position, base in enumerate(sequence): 130 | if np.random.uniform() < error_rate: 131 | error_type = sample_error_type(error_weights) 132 | 133 | if error_type == 'substitution': 134 | new_base = random_base_except(base) 135 | realised_substitutions += 1 136 | raw_cigar_list.append((1, cigar_operations[error_type])) 137 | 138 | elif error_type == 'deletion': 139 | new_base = '' 140 | realised_deletions += 1 141 | raw_cigar_list.append((1, cigar_operations[error_type])) 142 | 143 | elif error_type == 'insertion': 144 | new_base = base + random_base() 145 | realised_insertions += 1 146 | raw_cigar_list.append((1, cigar_operations['match'])) 147 | raw_cigar_list.append((1, cigar_operations[error_type])) 148 | 149 | else: 150 | raise Exception("Unhandled error type: {}".format(error_type)) 151 | else: 152 | raw_cigar_list.append((1, cigar_operations['match'])) 153 | new_base = base 154 | new_bases.append(new_base) 155 | 156 | new_sequence = ''.join(new_bases) 157 | cigar = cigar_list_to_string(compress_raw_cigar_list(raw_cigar_list)) 158 | 159 | realised_events = realised_substitutions + \ 160 | realised_deletions + realised_insertions 161 | realised_quality = seq_util.prob_to_phred( 162 | round(float(realised_events) / float(len(sequence)), 3)) 163 | mutated_record = MutatedSeq( 164 | new_sequence, realised_quality, realised_substitutions, realised_deletions, realised_insertions, cigar) 165 | return mutated_record 166 | 167 | 168 | def add_errors(seq, nr_errors, error_type): 169 | """Introduce a specified number of errors in the target sequence at random positions. 170 | 171 | :param seq: Input DNA sequence. 172 | :param nr_errors: Number of errors to introduce. :param error_type: Type of error to introduce ('substitution', 'deletion' or 'insertion'). 173 | :returns: Mutated sequence.
174 | :rtype: str 175 | """ 176 | seq = list(seq) 177 | positions = np.random.choice(np.arange(len(seq)), size=nr_errors, replace=False) 178 | if error_type == 'substitution': 179 | for pos in positions: 180 | seq[pos] = random_base_except(seq[pos]) 181 | elif error_type == 'deletion': 182 | for pos in positions: 183 | seq[pos] = '' 184 | elif error_type == 'insertion': 185 | for pos in positions: 186 | seq[pos] = seq[pos] + random_base() 187 | else: 188 | raise Exception('Invalid error type') 189 | return ''.join(seq) 190 | -------------------------------------------------------------------------------- /wub/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /wub/tests/data/test_bam_stats/stat_ref.fas: -------------------------------------------------------------------------------- 1 | >seq_0 2 | GTACAGCGGACAGTATGAAGGAAACTGACAACGCGAGTCACGTAATGGAGATGGATCCCAGACTGTTGCCCGGACCGATTCAAGCACA 3 | -------------------------------------------------------------------------------- /wub/tests/data/test_bam_stats/stat_test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/tests/data/test_bam_stats/stat_test.bam -------------------------------------------------------------------------------- /wub/tests/data/test_bam_stats/stat_test.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/tests/data/test_bam_stats/stat_test.bam.bai -------------------------------------------------------------------------------- /wub/tests/data/test_blastn_parse/blastn_test.coords: -------------------------------------------------------------------------------- 1 | seq_0 seq_0 100.00 100 0 0 1 100 201 300 3e-50 181 2 | seq_0 seq_0 100.00 100 0 0 1 100 600 501 3e-50 181 3 | -------------------------------------------------------------------------------- /wub/tests/data/test_nucmer_parse/nucmer_test.coords: -------------------------------------------------------------------------------- 1 | /nfs/vnx-home/bsipos/gt/r2/scripts/read_err.fas /nfs/vnx-home/bsipos/gt/r2/scripts/adapter.fas 2 | NUCMER 3 | 4 | [S1] [E1] | [S2] [E2] | [LEN 1] [LEN 2] | [% IDY] | [TAGS] 5 | ===================================================================================== 6 | 454 500 | 1 49 | 47 49 | 90.00 | seq_0 adapter 7 | 956 1003 | 3 50 | 48 48 | 92.00 | seq_0 adapter 8 | -------------------------------------------------------------------------------- /wub/tests/test_bam_compare.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import tempfile 4 | import os 5 | 6 | from wub.bam import compare 7 | 8 | 9 | class TestBamCompare(unittest.TestCase): 10 | 11 | """Test BAM comparison test.""" 12 | 13 | def _generate_test_data(self): 14 | """Generate test data for dnadiff test.""" 15 | fh_sam_one = tempfile.NamedTemporaryFile(suffix=".sam", delete=False, mode='w') 16 | self.sam_one = fh_sam_one.name 17 | 18 | data = """@SQ SN:chr0 LN:827 19 | @SQ SN:chr1 LN:6379 20 | @PG ID:bwa PN:bwa VN:0.7.15-r1142-dirty CL:bwa mem genome.fas reads.fq 21 | r0_chr1_4118_4168_+/q17/s0/d0/i1 0 chr1 4119 60 8M1I42M * 0 0 CATTTGGTACCATTGTGATCCGCTCTTAGAAACTTTTGGCACTTTATCGCG 
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:50 AS:i:43 XS:i:0 22 | r1_chr1_72_122_+/q12/s0/d2/i1 4 * 0 0 * * 0 0 AGCGCAGTGGTCGACTTAGCTTATTCACGAGAGCCTTCCAACTGGCCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 23 | r2_chr0_279_329_-/q17/s0/d1/i0 16 chr0 280 60 16M1D33M * 0 0 AGAACTTGCAAGCGCGGCTCCAGCCTTTCAGGACGAGACCCTCCAAGAC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:16^C33 AS:i:42 XS:i:0 24 | r3_chr1_60_110_+/q14/s1/d1/i0 0 chr1 61 51 36M1D13M * 0 0 GGTGTTTTATATAGCGCAGTGTCGACTTAGCTTATTGCGACGAGCCTTC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:2 MD:Z:36^C0A12 AS:i:37 XS:i:0 25 | r4_chr1_1268_1318_+/q12/s1/d2/i0 0 chr1 1269 28 19M1D23M1D6M * 0 0 GTATTCCATCGAGCTGGATCAGTTTAGGAGTGTGCCTAGGTATATCCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:19^G5G17^C6 AS:i:30 XS:i:0 26 | r5_chr0_576_626_-/q12/s1/d2/i0 4 * 0 0 * * 0 0 GCAAATTTTACAGATGATAAAACACCGAATATTCAGACCGTGTAAATA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 27 | r6_chr0_509_559_-/q12/s0/d3/i0 16 chr0 510 60 20M1D27M * 0 0 TGTGGTAGGAGCGGAGCGGGCCCACACCCCCATCCCCCGCGAAATAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:20^A27 AS:i:40 XS:i:0 28 | r7_chr1_2417_2467_-/q12/s1/d1/i1 16 chr1 2418 41 6M1D37M1I6M * 0 0 AGCCGATCATCCCGTCCCTGTTCACTCCTACGTCTTGGCTTGGAAAGTGT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:6^G27G15 AS:i:34 XS:i:0 29 | r8_chr0_661_711_-/q11/s0/d3/i1 4 * 0 0 * * 0 0 GTCTGAGGCGCCATATTAGGCGGGCAAAATGGACTATGACTGTGGCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 30 | r9_chr0_523_573_-/q14/s0/d1/i1 16 chr0 524 59 3M1I25M1D21M * 0 0 AGCGGGGACCCACACCCCCATCCCCCGCGAATAATTCAACGTTCGCATTA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:2 MD:Z:28^A21 AS:i:39 XS:i:0 31 | r10_chr1_2417_2467_-/q12/s1/d1/i1 16 chr1 2418 41 6M1D37M1I6M * 0 0 AGCCGATCATCCCGTCCCTGTTCACTCCTACGTCTTGGCTTGGAAAGTGT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:6^G27G15 AS:i:34 XS:i:0 32 | """ 33 | 34 | fh_sam_one.write(data) 35 | fh_sam_one.flush() 36 | fh_sam_one.close() 37 | 38 | data = """@SQ SN:chr0 LN:827 39 | @SQ SN:chr1 LN:6379 40 | @PG ID:bwa PN:bwa VN:0.7.15-r1142-dirty CL:bwa mem genome.fas reads.fq 41 | r0_chr1_4118_4168_+/q17/s0/d0/i1 0 chr1 4119 60 8M1I42M * 0 0 CATTTGGTACCATTGTGATCCGCTCTTAGAAACTTTTGGCACTTTATCGCG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:50 AS:i:43 XS:i:0 42 | r1_chr1_72_122_+/q12/s0/d2/i1 4 * 0 0 * * 0 0 AGCGCAGTGGTCGACTTAGCTTATTCACGAGAGCCTTCCAACTGGCCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 43 | r2_chr0_279_329_-/q17/s0/d1/i0 16 chr0 280 60 16M1D33M * 0 0 AGAACTTGCAAGCGCGGCTCCAGCCTTTCAGGACGAGACCCTCCAAGAC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:16^C33 AS:i:42 XS:i:0 44 | r3_chr1_60_110_+/q14/s1/d1/i0 0 chr1 61 51 36M1D13M * 0 0 GGTGTTTTATATAGCGCAGTGTCGACTTAGCTTATTGCGACGAGCCTTC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:2 MD:Z:36^C0A12 AS:i:37 XS:i:0 45 | r4_chr1_1268_1318_+/q12/s1/d2/i0 0 chr1 1269 28 19M1D23M1D6M * 0 0 GTATTCCATCGAGCTGGATCAGTTTAGGAGTGTGCCTAGGTATATCCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:19^G5G17^C6 AS:i:30 XS:i:0 46 | r5_chr0_576_626_-/q12/s1/d2/i0 4 * 0 0 * * 0 0 GCAAATTTTACAGATGATAAAACACCGAATATTCAGACCGTGTAAATA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 47 | r6_chr0_509_559_-/q12/s0/d3/i0 16 chr0 510 60 20M1D27M * 0 0 TGTGGTAGGAGCGGAGCGGGCCCACACCCCCATCCCCCGCGAAATAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:20^A27 
AS:i:40 XS:i:0 48 | r7_chr1_2417_2467_-/q12/s1/d1/i1 16 chr1 2418 41 6M1D37M1I6M * 0 0 AGCCGATCATCCCGTCCCTGTTCACTCCTACGTCTTGGCTTGGAAAGTGT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:6^G27G15 AS:i:34 XS:i:0 49 | r8_chr0_661_711_-/q11/s0/d3/i1 4 * 0 0 * * 0 0 GTCTGAGGCGCCATATTAGGCGGGCAAAATGGACTATGACTGTGGCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 50 | r9_chr0_523_573_-/q14/s0/d1/i1 16 chr0 726 59 4M25M1D21M * 0 0 AGCGGGGACCCACACCCCCATCCCCCGCGAATAATTCAACGTTCGCATTA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:2 MD:Z:28^A21 AS:i:39 XS:i:0 51 | r10_chr1_2417_2467_-/q12/s1/d1/i1 16 chr1 2420 41 2H4M1D37M1I6M * 0 0 CCGATCATCCCGTCCCTGTTCACTCCTACGTCTTGGCTTGGAAAGTGT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:6^G27G15 AS:i:34 XS:i:0 52 | """ 53 | fh_sam_two = tempfile.NamedTemporaryFile(suffix=".sam", delete=False, mode='w') 54 | self.sam_two = fh_sam_two.name 55 | 56 | fh_sam_two.write(data) 57 | fh_sam_two.flush() 58 | fh_sam_two.close() 59 | 60 | def _cleanup_test_data(self): 61 | """Cleanup test dataset.""" 62 | os.unlink(self.sam_one) 63 | os.unlink(self.sam_two) 64 | 65 | def test_bam_read_counter(self): 66 | """Test read_counter wrapper.""" 67 | self._generate_test_data() 68 | res = compare.bam_compare(self.sam_one, self.sam_two, in_format='SAM') 69 | self.assertAlmostEqual(res['AlignedSimilarity'], 0.8680, places=3) 70 | self._cleanup_test_data() 71 | -------------------------------------------------------------------------------- /wub/tests/test_bam_read_counter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import tempfile 4 | import os 5 | 6 | from wub.bam import read_counter 7 | 8 | error_rate = 0.1 9 | ref_length = 5000 10 | 11 | 12 | class TestBamReadCounter(unittest.TestCase): 13 | 14 | """Test BAM read counter wrapper.""" 15 | 16 | def _generate_test_data(self): 17 | """Generate test data for dnadiff test.""" 18 | fh_sam = tempfile.NamedTemporaryFile(suffix=".sam", delete=False, mode='w') 19 | self.sam = fh_sam.name 20 | 21 | data = """@SQ SN:chr0 LN:827 22 | @SQ SN:chr1 LN:6379 23 | @PG ID:bwa PN:bwa VN:0.7.15-r1142-dirty CL:bwa mem genome.fas reads.fq 24 | r0_chr1_4118_4168_+/q17/s0/d0/i1 0 chr1 4119 60 8M1I42M * 0 0 CATTTGGTACCATTGTGATCCGCTCTTAGAAACTTTTGGCACTTTATCGCG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:50 AS:i:43 XS:i:0 25 | r1_chr1_72_122_+/q12/s0/d2/i1 4 * 0 0 * * 0 0 AGCGCAGTGGTCGACTTAGCTTATTCACGAGAGCCTTCCAACTGGCCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 26 | r2_chr0_279_329_-/q17/s0/d1/i0 16 chr0 280 60 16M1D33M * 0 0 AGAACTTGCAAGCGCGGCTCCAGCCTTTCAGGACGAGACCCTCCAAGAC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:16^C33 AS:i:42 XS:i:0 27 | r3_chr1_60_110_+/q14/s1/d1/i0 0 chr1 61 51 36M1D13M * 0 0 GGTGTTTTATATAGCGCAGTGTCGACTTAGCTTATTGCGACGAGCCTTC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:2 MD:Z:36^C0A12 AS:i:37 XS:i:0 28 | r4_chr1_1268_1318_+/q12/s1/d2/i0 0 chr1 1269 28 19M1D23M1D6M * 0 0 GTATTCCATCGAGCTGGATCAGTTTAGGAGTGTGCCTAGGTATATCCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:19^G5G17^C6 AS:i:30 XS:i:0 29 | r5_chr0_576_626_-/q12/s1/d2/i0 4 * 0 0 * * 0 0 GCAAATTTTACAGATGATAAAACACCGAATATTCAGACCGTGTAAATA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 30 | r6_chr0_509_559_-/q12/s0/d3/i0 16 chr0 510 60 20M1D27M * 0 0 TGTGGTAGGAGCGGAGCGGGCCCACACCCCCATCCCCCGCGAAATAA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:1 MD:Z:20^A27 
AS:i:40 XS:i:0 31 | r7_chr1_2417_2467_-/q12/s1/d1/i1 16 chr1 2418 41 6M1D37M1I6M * 0 0 AGCCGATCATCCCGTCCCTGTTCACTCCTACGTCTTGGCTTGGAAAGTGT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:3 MD:Z:6^G27G15 AS:i:34 XS:i:0 32 | r8_chr0_661_711_-/q11/s0/d3/i1 4 * 0 0 * * 0 0 GTCTGAGGCGCCATATTAGGCGGGCAAAATGGACTATGACTGTGGCAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII AS:i:0 XS:i:0 33 | r9_chr0_523_573_-/q14/s0/d1/i1 16 chr0 524 59 3M1I25M1D21M * 0 0 AGCGGGGACCCACACCCCCATCCCCCGCGAATAATTCAACGTTCGCATTA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:2 MD:Z:28^A21 AS:i:39 XS:i:0 34 | """ 35 | 36 | fh_sam.write(data) 37 | fh_sam.flush() 38 | fh_sam.close() 39 | 40 | def _cleanup_test_data(self): 41 | """Cleanup test dataset.""" 42 | os.unlink(self.sam) 43 | 44 | def test_bam_read_counter(self): 45 | """Test read_counter wrapper.""" 46 | self._generate_test_data() 47 | res = read_counter.count_reads(self.sam, in_format='SAM') 48 | self.assertEqual(res[0]['chr0'], 3) 49 | self.assertEqual(res[0]['chr1'], 4) 50 | self._cleanup_test_data() 51 | -------------------------------------------------------------------------------- /wub/tests/test_bam_stats.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import six 4 | from os import path 5 | from collections import OrderedDict 6 | from Bio import SeqIO 7 | from wub.bam import stats 8 | from wub.util import seq as seq_util 9 | 10 | 11 | class TestBamStats(unittest.TestCase): 12 | 13 | """Test BAM statistics functions.""" 14 | 15 | def test_error_and_read_stats(self): 16 | """Test the gathering of error and read statistics.""" 17 | top = path.dirname(__file__) 18 | ref_fasta = path.join(top, "data/test_bam_stats/stat_ref.fas") 19 | bam = path.join(top, "data/test_bam_stats/stat_test.bam") 20 | refs = seq_util.read_seq_records_dict(ref_fasta) 21 | res = stats.error_and_read_stats( 22 | bam, refs, context_sizes=(1, 1), region=None, min_aqual=0, verbose=False) 23 | 24 | # Test evenets: 25 | self.assertEqual(res['events']['AGA'], {'*': 1, 'G': 2}) 26 | self.assertEqual(res['events']['CGA'], {'-': 1, 'G': 2}) 27 | self.assertEqual(res['events']['ACA'], {'C': 2, 'T': 1}) 28 | 29 | # Test indel properties: 30 | self.assertEqual(res['indel_dists']['insertion_lengths'], {8: 1}) 31 | self.assertEqual(res['indel_dists']['insertion_composition'], {'G': 8}) 32 | self.assertEqual(res['indel_dists']['deletion_lengths'], {9: 1}) 33 | 34 | # Test read statistics: 35 | self.assertEqual(res['read_stats'], {'alignment_lengths': [87], 'mapping_quals': [47], 'unaligned_lengths': [], 'unaligned_quals': [], 'mqfail_alignment_lengths': [], 'mapped': 1, 'unmapped': 0, 'mqfail_aligned_quals': [], 'aligned_quals': [40], 'aligned_lengths': [87]}) 36 | 37 | def test_read_stats(self): 38 | """Test the gathering read statistics.""" 39 | top = path.dirname(__file__) 40 | bam = path.join(top, "data/test_bam_stats/stat_test.bam") 41 | res = stats.read_stats(bam, region=None, min_aqual=0, verbose=False) 42 | 43 | self.maxDiff = None 44 | target = {'aligned_lengths': [87], 'read_stats': OrderedDict([('name', ['r0_seq_1_0_87_+/q93/s0/d0/i0']), ('ref', ['seq_0']), ('coverage', [1.0]), ('direction', ['+']), ('aln_length', [96]), ('insertion', [8]), ('deletion', [9]), ('mismatch', [1]), ('match', [78]), ('identity', [0.9873417721518988]), ('accuracy', [0.8125]), ('clipps', [0])]), 'aligned_quals': [ 45 | 40], 'base_stats': {'deletion': 9, 'mismatch': 1, 'identity': 0.9873417721518988, 'insertion': 8, 'clipps': 0, 
'aln_length': 96, 'match': 78, 'accuracy': 0.8125}, 'mapping_quals': [47], 'mqfail_alignment_lengths': [], 'alignment_lengths': [87], 'mqfail_aligned_quals': [], 'unaligned_lengths': [], 'unaligned_quals': [], 'mapped': 1, 'unmapped': 0} 46 | self.assertEqual(res, target) 47 | 48 | def test_pileup_stats(self): 49 | """Test the gathering read statistics.""" 50 | top = path.dirname(__file__) 51 | bam = path.join(top, "data/test_bam_stats/stat_test.bam") 52 | res = stats.pileup_stats(bam, region=None, verbose=False) 53 | 54 | self.assertEqual(res, {'coverage': {'seq_0': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1, 28: 1, 29: 1, 30: 1, 31: 1, 32: 1, 33: 1, 34: 1, 35: 1, 36: 1, 37: 1, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1, 45: 1, 46: 1, 47: 1, 48: 1, 49: 1, 50: 1, 51: 1, 52: 1, 53: 1, 54: 1, 55: 1, 56: 1, 57: 1, 58: 1, 59: 1, 60: 1, 61: 1, 62: 1, 63: 1, 64: 1, 65: 1, 66: 1, 67: 1, 68: 1, 69: 1, 70: 1, 71: 1, 72: 1, 73: 1, 74: 1, 75: 1, 76: 1, 77: 1, 78: 1, 79: 1, 80: 1, 81: 1, 82: 1, 83: 1, 84: 1, 85: 1, 86: 1, 87: 1}}, 'qualities': {'seq_0': {0: [40], 1: [40], 2: [40], 3: [40], 4: [40], 5: [ 55 | 40], 6: [40], 7: [40], 8: [40], 9: [40], 10: [40], 11: [40], 12: [40], 13: [40], 14: [40], 15: [40], 16: [40], 17: [40], 18: [40], 19: [40], 20: [40], 21: [40], 22: [40], 23: [40], 24: [40], 34: [40], 35: [40], 36: [40], 37: [40], 38: [40], 39: [40], 40: [40], 41: [40], 42: [40], 43: [40], 44: [40], 45: [40], 46: [40], 47: [40], 48: [40], 49: [40], 50: [40], 51: [40], 52: [40], 53: [40], 54: [40], 55: [40], 56: [40], 57: [40], 58: [40], 59: [40], 60: [40], 61: [40], 62: [40], 63: [40], 64: [40], 65: [40], 66: [40], 67: [40], 68: [40], 69: [40], 70: [40], 71: [40], 72: [40], 73: [40], 74: [40], 75: [40], 76: [40], 77: [40], 78: [40], 79: [40], 80: [40], 81: [40], 82: [40], 83: [40], 84: [40], 85: [40], 86: [40], 87: [40]}}}) 56 | 57 | def test_fragment_stats(self): 58 | """Test the gathering of fragment statistics.""" 59 | top = path.dirname(__file__) 60 | bam = path.join(top, "data/test_bam_stats/stat_test.bam") 61 | ref = path.join(top, "data/test_bam_stats/stat_ref.fas") 62 | references = SeqIO.index(ref, format='fasta') 63 | chrom_lengths = {name: len(so) for name, so in six.iteritems(references)} 64 | res = stats.frag_coverage(bam, chrom_lengths, region=None, min_aqual=0, verbose=False) 65 | 66 | self.maxDiff = None 67 | target = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 70 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 71 | self.assertEqual(list(res['frags_fwd']['seq_0']), target) 72 | -------------------------------------------------------------------------------- /wub/tests/test_blastn_coord_parse.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from os import path 4 | from wub.parsers import blastn 5 | 6 | 7 | class TestBlastnCoordParse(unittest.TestCase): 8 | 9 | def test_nucmer_coord_parse(self): 10 | """Test blastn outfmt 6 cooridnate parsing.""" 11 | top = path.dirname(__file__) 12 | coord_file = path.join(top, "data/test_blastn_parse/blastn_test.coords") 13 | records = blastn.parse_coords(coord_file) 14 | self.assertEqual(records, [{'gapopen': 0, 'query_end': 100, 'mismatch': 0, 'ref_end': 
300, 'query': 'seq_0', 'identity': 100.0, 'bitscore': 181.0, 'query_start': 1, 'ref_start': 201, 'strand': '+', 'aln_length': 100, 'ref': 'seq_0', 'evalue': 3e-50}, {'gapopen': 0, 'query_end': 100, 'mismatch': 0, 'ref_end': 600, 'query': 'seq_0', 'identity': 100.0, 'bitscore': 181.0, 'query_start': 1, 'ref_start': 501, 'strand': '-', 'aln_length': 100, 'ref': 'seq_0', 'evalue': 3e-50}]) 15 | -------------------------------------------------------------------------------- /wub/tests/test_contig_stats.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from wub.read_stats import contig_stats 4 | import pandas as pd 5 | 6 | 7 | class TestContigStats(unittest.TestCase): 8 | """Test N50 utility function.""" 9 | 10 | def test_N50(self): 11 | """Test calculation of N50.""" 12 | sequence_lengths = pd.DataFrame({'dummy': [2, 3, 4, 5, 6, 7, 8, 9, 10]}) 13 | self.assertEqual(contig_stats.N50(sequence_lengths, 'dummy'), 8) 14 | -------------------------------------------------------------------------------- /wub/tests/test_example.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class ExampleTest(unittest.TestCase): 5 | 6 | def setUp(self): 7 | pass 8 | 9 | def tearDown(self): 10 | pass 11 | 12 | def test_success(self): 13 | self.assertTrue(True) 14 | -------------------------------------------------------------------------------- /wub/tests/test_mappers_lastal.py: -------------------------------------------------------------------------------- 1 | import six 2 | import unittest 3 | import tempfile 4 | import os 5 | 6 | from wub.mappers import lastal 7 | from wub.util import seq as seq_util 8 | from wub.simulate import seq as sim_seq 9 | from wub.util import cmd as cmd_util 10 | 11 | error_rate = 0.1 12 | ref_length = 1000 13 | 14 | 15 | class TestMappersLastal(unittest.TestCase): 16 | 17 | def setUp(self): 18 | fh_ref = tempfile.NamedTemporaryFile(suffix=".fas", delete=False, mode='w') 19 | self.ref_fasta = fh_ref.name 20 | fh_target = tempfile.NamedTemporaryFile(suffix=".fas", delete=False, mode='w') 21 | self.target_fasta = fh_target.name 22 | 23 | self.ref = sim_seq.simulate_sequence(ref_length) 24 | nr_errors = int(len(self.ref) * error_rate) 25 | self.target = sim_seq.add_errors(self.ref, nr_errors, 'substitution') 26 | 27 | left_flanking = sim_seq.simulate_sequence(50) 28 | right_flanking = sim_seq.simulate_sequence(50) 29 | 30 | self.ref = left_flanking + self.ref + right_flanking 31 | self.target = left_flanking + self.target + right_flanking 32 | 33 | fh_ref.write(">ref\n{}\n".format(self.ref)) 34 | fh_ref.flush() 35 | fh_ref.close() 36 | 37 | fh_target.write(">target\n{}\n".format(self.target)) 38 | fh_target.flush() 39 | fh_target.close() 40 | 41 | def tearDown(self): 42 | os.unlink(self.ref_fasta) 43 | os.unlink(self.target_fasta) 44 | 45 | def test_parse_lastal_identical(self): 46 | raw = """\ 47 | # batch 0 48 | a score=23 EG2=3.8e+06 E=5.2e-13 49 | s Simulomonas 0 23 + 23 ATGCGGGGGATAGGACCATATCT 50 | s tig00000000 0 23 + 23 ATGCGGGGGATAGGACCATATCT 51 | """ 52 | parsed = six.next(lastal.parse_lastal(raw)) 53 | acc = seq_util.alignment_stats(parsed.r_aln, parsed.q_aln).accuracy 54 | self.assertEqual(acc, 1.0) 55 | self.assertEqual(parsed.score, 23) 56 | 57 | def test_parse_lastal_difference(self): 58 | raw = """\ 59 | # batch 0 60 | a score=23 EG2=3.8e+06 E=5.2e-13 61 | s Simulomonas 0 23 + 23 TTGCGGGGGATAGGACCATATCT 62 | s tig00000000 0 23 + 23 
ATGCGGGGGATAGGACCATATCT 63 | """ 64 | parsed = six.next(lastal.parse_lastal(raw)) 65 | acc = seq_util.alignment_stats(parsed.r_aln, parsed.q_aln).accuracy 66 | self.assertAlmostEqual(acc, 0.9565, places=3) 67 | 68 | def test_parse_lastal_zero(self): 69 | raw = """\ 70 | # batch 0 71 | a score=23 EG2=3.8e+06 E=5.2e-13 72 | s Simulomonas 0 23 + 23 CCCTCCCCCCCCCCCTTCCCCAC 73 | s tig00000000 0 23 + 23 ATGCGGGGGATAGGACCATATCT 74 | """ 75 | parsed = six.next(lastal.parse_lastal(raw)) 76 | acc = seq_util.alignment_stats(parsed.r_aln, parsed.q_aln).accuracy 77 | self.assertAlmostEqual(acc, 0.0, places=3) 78 | 79 | @unittest.skipIf(not cmd_util.find_executable('lastal'), 80 | "Lastal binary not found, skipping integration tests.") 81 | def test_lastal_compare_genomes(self): 82 | tmp = lastal.compare_genomes_lastal( 83 | self.ref_fasta, self.target_fasta) 84 | substs = tmp['substitutions'][0] 85 | self.assertEqual(int(ref_length * error_rate), substs) 86 | -------------------------------------------------------------------------------- /wub/tests/test_nucmer_coord_parse.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from os import path 4 | from wub.parsers import mummer 5 | 6 | 7 | class TestNucmerCoordParse(unittest.TestCase): 8 | 9 | """Test numcmer cooridnate parsing.""" 10 | 11 | def test_nucmer_coord_parse(self): 12 | """Test parsing of nucmer coordinate files.""" 13 | top = path.dirname(__file__) 14 | coord_file = path.join(top, "data/test_nucmer_parse/nucmer_test.coords") 15 | records = mummer.parse_coords(coord_file) 16 | self.assertEqual(records, [{'query_len': 49, 'query_end': 49, 'ref_end': 500, 'ref_len': 47, 'query': 'adapter', 'query_start': 1, 'ref_start': 454, 'ref': 'seq_0', 'identity': 90.0}, {'query_len': 48, 'query_end': 50, 'ref_end': 1003, 'ref_len': 48, 'query': 'adapter', 'query_start': 3, 'ref_start': 956, 'ref': 'seq_0', 'identity': 92.0}]) 17 | -------------------------------------------------------------------------------- /wub/tests/test_simulate_genome.py: -------------------------------------------------------------------------------- 1 | import six 2 | import unittest 3 | 4 | from wub.simulate import genome as sim_genome 5 | 6 | 7 | class TestSimulateGenome(unittest.TestCase): 8 | 9 | """Test genome simulation utilities.""" 10 | 11 | def test_simulate_genome(self): 12 | """Test genome simulator.""" 13 | record = six.next(sim_genome.simulate_genome(number_chromosomes=1, mean_length=1000, 14 | gamma_shape=50, low_truncation=1000, high_truncation=1001, base_frequencies=[0.25] * 4)) 15 | self.assertEqual(len(record), 1000) 16 | 17 | def test_simulate_fragment(self): 18 | """Test fragment simulator.""" 19 | chrom = six.next(sim_genome.simulate_genome(number_chromosomes=1, mean_length=1000, 20 | gamma_shape=50, low_truncation=1000, high_truncation=1001, base_frequencies=[0.25] * 4)) 21 | frag = sim_genome. simulate_fragment( 22 | chrom, mean_length=50, gamma_shape=50, low_truncation=50, high_truncation=51, fragment_number=0) 23 | self.assertEqual(frag.end - frag.start, 50) 24 | 25 | def test_simulate_fragment_edge(self): 26 | """Test fragment simulator (edge case).""" 27 | chrom = six.next(sim_genome.simulate_genome(number_chromosomes=1, mean_length=1000, 28 | gamma_shape=50, low_truncation=1000, high_truncation=1001, base_frequencies=[0.25] * 4)) 29 | frag = sim_genome. 
simulate_fragment( 30 | chrom, mean_length=2000, gamma_shape=50, low_truncation=2000, high_truncation=2001, fragment_number=0) 31 | self.assertEqual(frag.end - frag.start, 1000) 32 | -------------------------------------------------------------------------------- /wub/tests/test_simulate_seq.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import editdistance 4 | import numpy as np 5 | from wub.simulate import seq as sim_seq 6 | from wub.util import seq as seq_util 7 | 8 | 9 | class TestSimulateSeq(unittest.TestCase): 10 | 11 | """Test sequence simulation utilities.""" 12 | 13 | def test_simulate_sequencing_errors(self): 14 | """Test function simulating sequencing errors.""" 15 | error_rate = 0.1 16 | error_weights = {'substitution': 1.0 / 6, 17 | 'insertion': 1.0 / 6, 18 | 'deletion': 4.0 / 6} 19 | sequence = sim_seq.simulate_sequence(5000) 20 | mutated_record = sim_seq.simulate_sequencing_errors( 21 | sequence, error_rate, error_weights) 22 | distance = editdistance.eval(sequence, mutated_record.seq) 23 | expected_errors = len(sequence) * error_rate 24 | errors_sd = np.sqrt(len(sequence) * error_rate * (1 - error_rate)) 25 | # Should pass 0.9973 proportion of cases: 26 | self.assertTrue(expected_errors - errors_sd * 3 < distance < expected_errors + 27 | errors_sd * 3, msg="expected: {} realised:{}".format(expected_errors, distance)) 28 | 29 | def test_add_errors(self): 30 | """Test function adding sequencing errors.""" 31 | seq = "ATGCATGCATGC" 32 | mut_seq = sim_seq.add_errors(seq, 6, 'substitution') 33 | self.assertSequenceEqual(seq_util.alignment_stats(seq, mut_seq), (12, 6, 0, 0, 0.5)) 34 | 35 | def test_compress_raw_cigar_list(self): 36 | """Test compression of raw cigar lists.""" 37 | cigar_list = [ 38 | (1, 'M'), (1, 'M'), (1, 'M'), (1, 'D'), (1, 'D'), (1, 'M'), (1, 'I'), (1, 'M')] 39 | compressed = sim_seq.compress_raw_cigar_list(cigar_list) 40 | expected = [(3, 'M'), (2, 'D'), (1, 'M'), (1, 'I'), (1, 'M')] 41 | self.assertSequenceEqual(compressed, expected) 42 | 43 | def test_cigar_list_to_string(self): 44 | """Test formatting of cigar strings.""" 45 | cigar_list = [(3, 'M'), (2, 'D'), (1, 'M'), (1, 'I'), (1, 'M')] 46 | cigar_string = sim_seq.cigar_list_to_string(cigar_list) 47 | expected = "3M2D1M1I1M" 48 | self.assertEqual(cigar_string, expected) 49 | -------------------------------------------------------------------------------- /wub/tests/test_util_parse.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from wub.util import parse 4 | import numpy as np 5 | 6 | 7 | class TestUtilParse(unittest.TestCase): 8 | 9 | """Test parsing utilities.""" 10 | 11 | def test_separated_list_to_floats(self): 12 | """Test parsing of separated lists.""" 13 | string = "0.1,0.2,0.3" 14 | parsed = (0.1, 0.2, 0.3) 15 | self.assertSequenceEqual(parse.separated_list_to_floats(string), parsed) 16 | 17 | def test_args_string_to_dict(self): 18 | """Test parsing of dictionaries encoded in separated strings.""" 19 | string = "a:0.1,b:0.2,c:0.3" 20 | parsed = (("a", "0.1"), ("b", "0.2"), ("c", "0.3")) 21 | self.assertSequenceEqual(tuple(parse.args_string_to_dict(string).items()), parsed) 22 | 23 | def test_args_string_to_dict_empty(self): 24 | """Test parsing of dictionaries encoded in separated strings (empty input).""" 25 | string = "" 26 | self.assertEqual(len(tuple(parse.args_string_to_dict(string).items())), 0) 27 | 28 | def test_normalise_array(self): 29 | """Test array 
normalization.""" 30 | a = np.array([2, 2, 2, 2]) 31 | a_norm = np.array([0.25, 0.25, 0.25, 0.25], dtype=float) 32 | self.assertTrue(all(parse.normalise_array(a) == a_norm)) 33 | -------------------------------------------------------------------------------- /wub/tests/test_util_seq.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from wub.util import seq 4 | from Bio.SeqRecord import SeqRecord 5 | 6 | 7 | class TestUtilSeq(unittest.TestCase): 8 | 9 | """Test sequence utilities.""" 10 | 11 | def test_new_dna_record(self): 12 | """Test the construction of new DNA SeqRecord.""" 13 | sequence = seq.new_dna_record("ATGC", "test") 14 | self.assertEqual(type(sequence), SeqRecord) 15 | 16 | def test_mock_qualities(self): 17 | """Test quality mocking function.""" 18 | sequence = seq.new_dna_record("ATGC", "test") 19 | mock_qual = 40 20 | qual_seq = seq.mock_qualities(sequence, mock_qual) 21 | self.assertSequenceEqual( 22 | qual_seq.letter_annotations["phred_quality"], [mock_qual] * len(qual_seq)) 23 | 24 | def test_reverse_complement(self): 25 | """Test reverse complementing.""" 26 | sequence = "ATGCNXatgcnx-" 27 | revcomp = "-xngcatXNGCAT" 28 | self.assertEqual(seq.reverse_complement(sequence), revcomp) 29 | 30 | def test_prob_to_phred(self): 31 | """Test error probability to phred score conversion.""" 32 | self.assertEqual(seq.prob_to_phred(0.5), 3) 33 | 34 | def test_prob_to_phred_max(self): 35 | """Test error probability to phred score conversion (very small error).""" 36 | self.assertEqual(seq.prob_to_phred(1 * 10 ** -10), 93) 37 | 38 | def test_phred_to_prob(self): 39 | """Test error probability to phred score conversion.""" 40 | self.assertAlmostEqual(seq.phred_to_prob(3), 0.5, places=2) 41 | 42 | def test_mean_qscore_large(self): 43 | """Test mean q score calculation (large identical input).""" 44 | scores = [30] * 5000 45 | self.assertEqual(seq.mean_qscore(scores), 30) 46 | 47 | def test_mean_qscore(self): 48 | """Test mean q score calculation.""" 49 | scores = [14, 10] 50 | self.assertEqual(seq.mean_qscore(scores), 12) 51 | 52 | def test_alignment_stats(self): 53 | """Test calculation of alignment statistics.""" 54 | seq1 = "ATGCTG-AAAAA" 55 | seq2 = "TTG-TGCAAAAA" 56 | self.assertEqual( 57 | tuple(seq.alignment_stats(seq1, seq2)), (12, 1, 1, 1, 0.75)) 58 | -------------------------------------------------------------------------------- /wub/tests/test_wrappers_dnadiff.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import tempfile 4 | import os 5 | 6 | from wub.wrappers import dnadiff 7 | from wub.util import cmd as cmd_util 8 | from wub.simulate import seq as sim_seq 9 | 10 | error_rate = 0.1 11 | ref_length = 5000 12 | 13 | 14 | class TestWrappersDnadiff(unittest.TestCase): 15 | 16 | """Test dnadiff wrapper.""" 17 | 18 | def _generate_test_data(self): 19 | """Generate test data for dnadiff test.""" 20 | fh_ref = tempfile.NamedTemporaryFile(suffix=".fas", delete=False, mode='w') 21 | self.ref_fasta = fh_ref.name 22 | fh_target = tempfile.NamedTemporaryFile(suffix=".fas", delete=False, mode='w') 23 | self.target_fasta = fh_target.name 24 | 25 | self.ref = sim_seq.simulate_sequence(ref_length) 26 | nr_errors = int(len(self.ref) * error_rate) 27 | self.target = sim_seq.add_errors(self.ref, nr_errors, 'substitution') 28 | 29 | fh_ref.write(">ref\n{}\n".format(self.ref)) 30 | fh_ref.flush() 31 | fh_ref.close() 32 | 33 | 
fh_target.write(">target\n{}\n".format(self.target)) 34 | fh_target.flush() 35 | fh_target.close() 36 | 37 | def _cleanup_test_data(self): 38 | """Cleanup test dataset.""" 39 | os.unlink(self.ref_fasta) 40 | os.unlink(self.target_fasta) 41 | 42 | @unittest.skipIf(not cmd_util.find_executable('dnadiff'), 43 | "Dnadiff binary not found, skipping integration tests.") 44 | def test_dnadiff(self): 45 | """Test dnadiff wrapper.""" 46 | self._generate_test_data() 47 | res, _, _ = dnadiff.dnadiff(self.ref_fasta, self.target_fasta) 48 | self.assertAlmostEqual( 49 | res['Alignments']['1-to-1']['AvgIdentity'].ref, 90.0, places=0) 50 | self._cleanup_test_data() 51 | -------------------------------------------------------------------------------- /wub/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/util/__init__.py -------------------------------------------------------------------------------- /wub/util/cmd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Utilities related to running external commands.""" 3 | 4 | from distutils.spawn import find_executable as exefind_distutils 5 | import sys 6 | 7 | 8 | def find_executable(command): 9 | """Find the executable in the path corresponding to a command. 10 | 11 | :param command: Command. 12 | :returns: Path to the executable, or None if it is not found. 13 | :rtype: str 14 | """ 15 | # In the future we might want to eliminate the dependency on 16 | # distutils. 17 | return exefind_distutils(command) 18 | 19 | 20 | def ensure_executable(command): 21 | """Find the executable in the path corresponding to a command and abort if not found. 22 | 23 | :param command: Command. 24 | :returns: None 25 | :rtype: object 26 | """ 27 | # In the future we might want to eliminate the dependency on 28 | # distutils. 29 | if not find_executable(command): 30 | sys.stderr.write( 31 | "Required command \"{}\" not found in path! Aborting!\n".format(command)) 32 | sys.exit(127) 33 | -------------------------------------------------------------------------------- /wub/util/misc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Yet uncategorised utility functions.""" 3 | 4 | import pickle 5 | import os.path 6 | 7 | 8 | def get_fname(fname): 9 | """ Get the file name without the extension. 10 | 11 | :param fname: File name. 12 | :return: File name without extension. 13 | :rtype: str 14 | 15 | """ 16 | return os.path.splitext(os.path.basename(fname))[0] 17 | 18 | 19 | def get_extension(fname): 20 | """ Get the file extension. 21 | 22 | :param fname: File name. 23 | :return: File extension. 24 | :rtype: str format '.*' 25 | 26 | """ 27 | return os.path.splitext(os.path.basename(fname))[1] 28 | 29 | 30 | def _getextension(fast): 31 | """ Find and check the file extension. If the extension is not recognised, an exception is raised.
32 | 33 | :param fast: FASTQ or FASTA file. 34 | :return: "fastq" or "fasta" 35 | :rtype: str 36 | 37 | """ 38 | 39 | extension = get_extension(fast) 40 | if extension in ('.fa', '.fasta'): 41 | extension = "fasta" 42 | elif extension in ('.fq', '.fastq'): 43 | extension = "fastq" 44 | else: 45 | raise Exception('Incorrect file format') 46 | exit() 47 | # print >> sys.stderr, "Incorrect file format" 48 | return extension 49 | 50 | 51 | def mkdir(path): 52 | """ Create the directory if it does not exist. 53 | 54 | :param path: Directory path. 55 | :return: The path. 56 | :rtype: str 57 | 58 | """ 59 | if not os.path.exists(path): 60 | os.makedirs(path) 61 | return path 62 | 63 | 64 | def pickle_load(fname): 65 | """ Load object from pickle. 66 | 67 | :param fname: Input pickle file name. 68 | :returns: Object loaded from pickle file. 69 | :rtype: object 70 | 71 | """ 72 | fh = open(fname, 'rb') 73 | data = pickle.load(fh) 74 | fh.close() 75 | return data 76 | 77 | 78 | def pickle_dump(obj, fname): 79 | """Pickle object to file. 80 | 81 | :param obj: Object to be pickled. 82 | :param fname: Output file name. 83 | :returns: The name of the output file. 84 | :rtype: str 85 | 86 | """ 87 | fh = open(fname, 'wb') 88 | pickle.dump(obj, fh) 89 | fh.flush() 90 | fh.close() 91 | return fname 92 | -------------------------------------------------------------------------------- /wub/util/parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import six 5 | from collections import OrderedDict 6 | 7 | """Utilities to parse strings into various data structures.""" 8 | 9 | 10 | def separated_list_to_floats(separated_list, separator=","): 11 | """ Convert a separated list into a list of floats. 12 | 13 | :param separated_list: A separated list as a string. 14 | :param separator: List separator. 15 | :returns: List of floats. 16 | :rtype: list 17 | """ 18 | return [float(element) for element in separated_list.split(separator)] 19 | 20 | 21 | def args_string_to_dict(args_string, elements_separator=",", keyvalue_separator=":"): 22 | """ Convert a two-level separated list into a dictionary. 23 | 24 | :param args_string: Two-level separated string. 25 | :param elements_separator: Separator between elements. 26 | :param keyvalue_separator: Separator between key/value pairs. 27 | :returns: dict 28 | :rtype: dict 29 | """ 30 | if len(args_string) == 0: 31 | return {} 32 | pairs = [pair.strip() for pair in args_string.split(elements_separator)] 33 | elements = OrderedDict(pair.split(keyvalue_separator) for pair in pairs) 34 | parsed = OrderedDict((k.strip(), v.strip()) for k, v in six.iteritems(elements)) 35 | return parsed 36 | 37 | 38 | def interval_string_to_tuples(interval_string, elements_separator="|", interval_separator=","): 39 | """ Convert a two-level separated string into a tuple of integer intervals. 40 | 41 | :param interval_string: Two-level separated string. 42 | :param elements_separator: Separator between elements. 43 | :param interval_separator: Separator between interval boundaries. 44 | :returns: Tuple of (start, end) integer pairs. 45 | :rtype: tuple 46 | """ 47 | if len(interval_string) == 0: 48 | return tuple() 49 | pairs = [pair.strip() for pair in interval_string.split(elements_separator)] 50 | elements = OrderedDict(pair.split(interval_separator) for pair in pairs) 51 | parsed = tuple((int(k.strip()), int(v.strip())) for k, v in six.iteritems(elements)) 52 | return parsed 53 | 54 | 55 | def normalise_array(array): 56 | """ Normalise numpy array so the elements sum to 1.0.
57 | 58 | :param array: Input array. 59 | :returns: Normalised array. 60 | :rtype: numpy.array 61 | """ 62 | temporary_array = array.astype(float) 63 | return temporary_array / np.sum(temporary_array) 64 | -------------------------------------------------------------------------------- /wub/vis/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __author__ = 'ONT Applications Group' 4 | __email__ = 'Apps@nanoporetech.com' 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /wub/wrappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/wub/768b38ff145bae1982014e8b80ae234624828fd4/wub/wrappers/__init__.py -------------------------------------------------------------------------------- /wub/wrappers/dnadiff.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ Wrapper for mummer's dnadiff """ 3 | 4 | import six 5 | import os 6 | import re 7 | from collections import defaultdict 8 | from collections import namedtuple 9 | import subprocess 10 | import tempfile 11 | from subprocess import STDOUT 12 | 13 | dnadiff_extensions = ( 14 | '.1coords', '.1delta', '.delta', '.mcoords', '.mdelta', '.qdiff', '.rdiff', '.report', '.snps', '.unref', '.unqry') 15 | 16 | Property = namedtuple('Property', 'ref query') 17 | PropertyWithPerc = namedtuple('PropertyWithPerc', 'ref ref_perc query query_perc') 18 | 19 | 20 | def dnadiff(reference, query, working_directory=None, cleanup=True): 21 | """Run dnadiff on reference and query fasta and parse results. 22 | 23 | :param reference: Reference fasta. 24 | :param query: Query fasta. 25 | :param working_directory: Write output in this directory if specified. 26 | :param cleanup: Delete dnadiff output after parsing if True. 27 | :returns: Parsed results, raw report and log. 28 | :rtype: 3-tuple 29 | """ 30 | reference = os.path.abspath(reference) 31 | query = os.path.abspath(query) 32 | work_dir = working_directory 33 | 34 | if not os.path.exists(reference): 35 | raise Exception("Reference fasta {} does not exists!".format(reference)) 36 | if not os.path.exists(query): 37 | raise Exception("Target fasta {} does not exists!".format(query)) 38 | if work_dir is not None and not os.path.exists(work_dir): 39 | raise Exception("Working directory {} does not exists!".format(work_dir)) 40 | 41 | if work_dir is None: 42 | work_dir = tempfile.mkdtemp(prefix='dnadiff_') 43 | 44 | old_dir = os.getcwd() 45 | os.chdir(work_dir) 46 | 47 | command = ['dnadiff', reference, query] 48 | try: 49 | log = subprocess.check_output(command, stderr=STDOUT) 50 | finally: 51 | os.chdir(old_dir) 52 | 53 | report_file = os.path.join(work_dir, 'out.report') 54 | output = open(report_file, 'r').read() 55 | 56 | results = parse_dnadiff_report(report_file) 57 | 58 | if cleanup: 59 | cleanup_dnadiff_report(work_dir) 60 | if working_directory is None: 61 | os.rmdir(work_dir) 62 | 63 | return results, output, log 64 | 65 | 66 | def cleanup_dnadiff_report(directory, prefix='out'): 67 | """Cleanup dnadiff output files in the specified directory. 68 | 69 | :param directory: Output directory. 70 | :param prefix: Output prefix. 
71 | :returns: None 72 | :rtype: object 73 | """ 74 | for ext in dnadiff_extensions: 75 | name = prefix + ext 76 | path = os.path.join(directory, name) 77 | if os.path.exists(path): 78 | os.unlink(path) 79 | 80 | 81 | def _parse_dnadiff_into_sections(report_file): 82 | """Parse dnadiff output lines into sections.""" 83 | report_fh = open(report_file, 'r') 84 | section = "NO_SECTION" 85 | sections = defaultdict(list) 86 | for line in report_fh: 87 | line = line.strip() 88 | if len(line) == 0: 89 | continue 90 | if line.startswith('/') or line.startswith('NUCMER') or line.startswith('[REF]'): 91 | continue 92 | if line.startswith('['): 93 | section = line 94 | section = section.replace('[', '') 95 | section = section.replace(']', '') 96 | else: 97 | sections[section].append(line) 98 | return sections 99 | 100 | 101 | def _parse_percent_field(field): 102 | """Parse dnadiff field with percent value.""" 103 | tmp = field.split('(') 104 | perc = tmp[1].replace(')', '') 105 | perc = perc.replace('%', '') 106 | return float(tmp[0]), float(perc) 107 | 108 | 109 | def _parse_simple_section(lines): 110 | """Parse a simple dnadiff report section.""" 111 | results = {} 112 | for line in lines: 113 | tmp = re.split("\s+", line) 114 | if '%' not in tmp[1] and '%' not in tmp[2]: 115 | results[tmp[0]] = Property(float(tmp[1]), float(tmp[2])) 116 | else: 117 | ref_prop, ref_prop_perc = _parse_percent_field(tmp[1]) 118 | query_prop, query_prop_perc = _parse_percent_field(tmp[2]) 119 | results[tmp[0]] = PropertyWithPerc(ref_prop, ref_prop_perc, query_prop, query_prop_perc) 120 | return results 121 | 122 | 123 | def _parse_complex_section(lines): 124 | """Parse a complex dnadiff report section.""" 125 | section = "NO_SECTION" 126 | sections = defaultdict(list) 127 | results = defaultdict(dict) 128 | # Parse alignment section into subsections: 129 | for line in lines: 130 | if len(line) == 0: 131 | continue 132 | # FIXME: Very specific to current dnadiff output: 133 | if line.startswith('1-to-1') or line.startswith('M-to-M') or re.match("Total(S|G|I)", line): 134 | tmp = re.split("\s+", line) 135 | section = tmp[0] 136 | results[section]['Number'] = Property(float(tmp[1]), float(tmp[2])) 137 | else: 138 | sections[section].append(line) 139 | 140 | # Parse subsections and update results dictionary: 141 | for section, lines in six.iteritems(sections): 142 | parsed = _parse_simple_section(lines) 143 | for name, prop in six.iteritems(parsed): 144 | results[section][name] = prop 145 | return results 146 | 147 | 148 | def parse_dnadiff_report(report_file): 149 | """Parse dnadiff report file. 150 | 151 | :param report_file: dnadiff report output. 152 | :returns: Data structure with parsed results. 153 | :rtype: dict 154 | """ 155 | sections = _parse_dnadiff_into_sections(report_file) 156 | 157 | results_sequences = _parse_simple_section(sections['Sequences']) 158 | results_bases = _parse_simple_section(sections['Bases']) 159 | results_features = _parse_simple_section(sections['Feature Estimates']) 160 | results_alignments = _parse_complex_section(sections['Alignments']) 161 | results_snps = _parse_complex_section(sections['SNPs']) 162 | 163 | results = { 164 | 'Sequences': results_sequences, 165 | 'Bases': results_bases, 166 | 'Features': results_features, 167 | 'Alignments': results_alignments, 168 | 'SNPs': results_snps, 169 | } 170 | return results 171 | --------------------------------------------------------------------------------
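A minimal usage sketch for the dnadiff wrapper above, mirroring the call pattern in wub/tests/test_wrappers_dnadiff.py; it assumes mummer's dnadiff binary is on the PATH, and reference.fas / assembly.fas are placeholder input files rather than files shipped with the package.

from wub.util import cmd as cmd_util
from wub.wrappers import dnadiff

# Abort with a message if mummer's dnadiff is not installed:
cmd_util.ensure_executable('dnadiff')

# Run dnadiff on a reference and an assembly (placeholder file names) and
# parse the out.report file; returns the parsed results, the raw report text
# and the captured log:
results, report, log = dnadiff.dnadiff('reference.fas', 'assembly.fas')

# Parsed report entries are Property/PropertyWithPerc namedtuples, so individual
# values are reached through their ref and query fields:
avg_identity = results['Alignments']['1-to-1']['AvgIdentity']
print("1-to-1 average identity (ref/query): {}/{}".format(avg_identity.ref, avg_identity.query))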