├── .gitignore ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── Makefile ├── _static │ ├── cartoons │ │ ├── data_hierarchy.png │ │ ├── overlap.png │ │ └── overlap_reads.png │ ├── enrich2_env.yml │ ├── gui_screenshots │ │ ├── complete_example.png │ │ ├── context_menu.png │ │ ├── empty.png │ │ ├── new_child.png │ │ ├── new_root.png │ │ ├── seqlib.png │ │ └── seqlib_choice.png │ ├── iD_icon.png │ ├── notebook_plots │ │ ├── min_count_plot.png │ │ └── unique_barcodes_plot.png │ ├── plots │ │ ├── barcodes_per_variant.png │ │ ├── diversity.png │ │ ├── overlap_mismatches.png │ │ ├── regression_weights.png │ │ ├── se_pctile.png │ │ ├── selection_counts.png │ │ ├── seqlib_counts.png │ │ ├── sfmap.png │ │ ├── volcano.png │ │ └── wt_shape.png │ └── sfmap_aa_files │ │ ├── aagroup_default.txt │ │ └── aagroup_helical_propensity.txt ├── api.rst ├── conf.py ├── exported_notebooks │ ├── README │ ├── min_count.rst │ └── unique_barcodes.rst ├── gui.rst ├── index.rst ├── installation.rst ├── introduction.rst ├── make.bat ├── notebooks.rst ├── notebooks │ ├── min_count.ipynb │ └── unique_barcodes.ipynb ├── output.rst ├── plots.rst └── seqlib_config.rst ├── enrich2 ├── __init__.py ├── aligner.py ├── barcode.py ├── barcodeid.py ├── barcodemap.py ├── barcodevariant.py ├── basic.py ├── condition.py ├── config_check.py ├── constants.py ├── dataframe.py ├── experiment.py ├── fastqheader.py ├── gui │ ├── __init__.py │ ├── configurator.py │ ├── create_root_dialog.py │ ├── create_seqlib_dialog.py │ ├── delete_dialog.py │ ├── dialog_elements.py │ ├── edit_dialog.py │ ├── runner_window.py │ └── seqlib_apply_dialog.py ├── idonly.py ├── main.py ├── overlap.py ├── plots.py ├── random_effects.py ├── selection.py ├── seqlib.py ├── sfmap.py ├── storemanager.py ├── variant.py ├── wildtype.py └── ztest.py ├── pylintrc └── pyproject.toml /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # Unit test output directory 57 | test/test_output/ 58 | 59 | # Jupyter checkpoint files 60 | docs/notebooks/.ipynb_checkpoints/ 61 | 62 | # Visual Studio Code 63 | .vscode/ 64 | 65 | # PyCharm 66 | .idea/ -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Rubin" 5 | given-names: "Alan F" 6 | orcid: "https://orcid.org/0000-0003-1474-605X" 7 | title: "Enrich2" 8 | version: 1.3.1 9 | doi: 10.5281/zenodo.3742545 10 | date-released: 2020-04-07 11 | url: "https://github.com/FowlerLab/Enrich2" 12 | preferred-citation: 13 | type: article 14 | authors: 15 | - family-names: "Rubin" 16 | given-names: "Alan F" 17 | orcid: "https://orcid.org/0000-0003-1474-605X" 18 | - family-names: "Gelman" 19 | given-names: "Hannah" 20 | - family-names: "Lucas" 21 | given-names: "Nathan" 22 | - family-names: "Bajjalieh" 23 | given-names: "Sandra M" 24 | - family-names: "Papenfuss" 25 | given-names: "Anthony T" 26 | orcid: "https://orcid.org/0000-0002-1102-8506" 27 | - family-names: "Speed" 28 | given-names: "Terence P" 29 | orcid: "https://orcid.org/0000-0002-5403-7998" 30 | - family-names: "Fowler" 31 | given-names: "Douglas M" 32 | orcid: "https://orcid.org/0000-0001-7614-1713" 33 | doi: "10.1186/s13059-017-1272-5" 34 | journal: "Genome Biology" 35 | month: 8 36 | start: 150 37 | title: "A statistical framework for analyzing deep mutational scanning data" 38 | issue: 1 39 | volume: 18 40 | year: 2017 41 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | alan.rubin@wehi.edu.au. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 
123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## How to contribute to Enrich2 2 | 3 | **Enrich2 is no longer under active development. For the Python 3-based successor to Enrich2, please see [CountESS](https://github.com/countess-project/countess).** 4 | 5 | All contributors should familiarize themselves with the [Code of Conduct](https://github.com/fowlerlab/enrich2/CODE_OF_CONDUCT.md). 6 | 7 | #### **Reporting a bug** 8 | 9 | * Check and see if the bug has already been reported by searching on GitHub under [Issues](https://github.com/fowlerlab/enrich2/issues). 10 | 11 | * If you're unable to find an open issue addressing the problem, [open a new issue](https://github.com/fowlerlab/enrich2/issues/new). Be sure to include a **title and clear description** with as much relevant information as possible. 12 | 13 | Thank you for reading! 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2016-2020, Alan F Rubin 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14681278.svg)](https://doi.org/10.5281/zenodo.14681278) 2 | [![PyPI version](https://badge.fury.io/py/Enrich2.svg)](https://badge.fury.io/py/Enrich2) 3 | 4 | Enrich2 5 | ======= 6 | 7 | Enrich2 is a general software tool for processing, analyzing, and visualizing data from deep mutational scanning experiments. 8 | For more information or to cite Enrich2, please refer to [A statistical framework for analyzing deep mutational scanning data](https://doi.org/10.1186/s13059-017-1272-5). 9 | 10 | [Enrich2 documentation](https://enrich2.readthedocs.io) is available on [Read the Docs](https://readthedocs.org/). 11 | 12 | An example dataset is available at the [Enrich2-Example GitHub repository](https://github.com/FowlerLab/Enrich2-Example/). 13 | 14 | Thanks to the efforts of [Chris Macdonald](https://github.com/odcambc), Enrich2 is now able to run under modern versions of Python as of v2.0.0! 15 | 16 | Installation and dependencies 17 | ----------------------------- 18 | 19 | Enrich2 runs on Python 3 (v2.0.0 and higher) and requires the following packages: 20 | 21 | * [NumPy](http://www.numpy.org/) 22 | * [SciPy](http://www.scipy.org/) 23 | * [pandas](http://pandas.pydata.org/) 24 | * [PyTables](http://www.pytables.org/) 25 | * [Statsmodels](http://statsmodels.sourceforge.net/) 26 | * [matplotlib](http://matplotlib.org/) 27 | * [fqfa](https://fqfa.readthedocs.io/) 28 | 29 | The configuration GUI requires [Tkinter](https://docs.python.org/2/library/tkinter.html). 30 | Building a local copy of the documentation requires [Sphinx](http://sphinx-doc.org/). 31 | 32 | Enrich2 can be installed in a new virtual environment using pip: 33 | 34 | python3 -m venv e2env 35 | source e2env/bin/activate 36 | pip install enrich2 37 | 38 | You should now be able to launch the Enrich2 graphical user interface by typing `enrich_gui` or the command line interface by typing `enrich_cmd`. 39 | 40 | For additional information consult the [online documentation](https://enrich2.readthedocs.io/). 41 | 42 | Questions? 43 | ---------- 44 | 45 | Please use the [GitHub Issue Tracker](https://github.com/FowlerLab/Enrich2/issues) to file bug reports or request features. 46 | 47 | Enrich2 was written by [Alan F Rubin](mailto:alan.rubin@wehi.edu.au). 48 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Enrich2.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Enrich2.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Enrich2" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Enrich2" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 
154 | -------------------------------------------------------------------------------- /docs/_static/cartoons/data_hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/cartoons/data_hierarchy.png -------------------------------------------------------------------------------- /docs/_static/cartoons/overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/cartoons/overlap.png -------------------------------------------------------------------------------- /docs/_static/cartoons/overlap_reads.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/cartoons/overlap_reads.png -------------------------------------------------------------------------------- /docs/_static/enrich2_env.yml: -------------------------------------------------------------------------------- 1 | name: enrich2 2 | dependencies: 3 | - python=3 4 | - numpy 5 | - scipy 6 | - pandas 7 | - pytables 8 | - statsmodels 9 | - matplotlib 10 | -------------------------------------------------------------------------------- /docs/_static/gui_screenshots/complete_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/gui_screenshots/complete_example.png -------------------------------------------------------------------------------- /docs/_static/gui_screenshots/context_menu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/gui_screenshots/context_menu.png -------------------------------------------------------------------------------- /docs/_static/gui_screenshots/empty.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/gui_screenshots/empty.png -------------------------------------------------------------------------------- /docs/_static/gui_screenshots/new_child.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/gui_screenshots/new_child.png -------------------------------------------------------------------------------- /docs/_static/gui_screenshots/new_root.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/gui_screenshots/new_root.png -------------------------------------------------------------------------------- /docs/_static/gui_screenshots/seqlib.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/gui_screenshots/seqlib.png -------------------------------------------------------------------------------- /docs/_static/gui_screenshots/seqlib_choice.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/gui_screenshots/seqlib_choice.png -------------------------------------------------------------------------------- /docs/_static/iD_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/iD_icon.png -------------------------------------------------------------------------------- /docs/_static/notebook_plots/min_count_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/notebook_plots/min_count_plot.png -------------------------------------------------------------------------------- /docs/_static/notebook_plots/unique_barcodes_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/notebook_plots/unique_barcodes_plot.png -------------------------------------------------------------------------------- /docs/_static/plots/barcodes_per_variant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/barcodes_per_variant.png -------------------------------------------------------------------------------- /docs/_static/plots/diversity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/diversity.png -------------------------------------------------------------------------------- /docs/_static/plots/overlap_mismatches.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/overlap_mismatches.png -------------------------------------------------------------------------------- /docs/_static/plots/regression_weights.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/regression_weights.png -------------------------------------------------------------------------------- /docs/_static/plots/se_pctile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/se_pctile.png -------------------------------------------------------------------------------- /docs/_static/plots/selection_counts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/selection_counts.png -------------------------------------------------------------------------------- /docs/_static/plots/seqlib_counts.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/seqlib_counts.png -------------------------------------------------------------------------------- /docs/_static/plots/sfmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/sfmap.png -------------------------------------------------------------------------------- /docs/_static/plots/volcano.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/volcano.png -------------------------------------------------------------------------------- /docs/_static/plots/wt_shape.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/docs/_static/plots/wt_shape.png -------------------------------------------------------------------------------- /docs/_static/sfmap_aa_files/aagroup_default.txt: -------------------------------------------------------------------------------- 1 | (+) H,K,R 2 | (-) D,E 3 | Polar-neutral C,M,N,Q,S,T 4 | Non-polar A,I,L,V 5 | Aromatic F,W,Y 6 | Unique G,P 7 | * 8 | -------------------------------------------------------------------------------- /docs/_static/sfmap_aa_files/aagroup_helical_propensity.txt: -------------------------------------------------------------------------------- 1 | High helical propensity A,L,R,M,K,Q,E,I,W 2 | Low helical propensity S,Y,F,V,H,N,T,C,D 3 | Disruptive G,P 4 | * 5 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | Appendix: API documentation 2 | ############################################# 3 | 4 | This page contains automatically generated documentation from the Enrich2 codebase. It is intended for developers and advanced users. 5 | 6 | :py:mod:`~enrich2.storemanager` --- Abstract class for Enrich2 data 7 | =================================================================== 8 | 9 | .. py:module:: storemanager 10 | :synopsis: Abstract class for Enrich2 data. 11 | 12 | This module contains the class definition for the :py:class:`~enrich2.storemanager.storemanager.StoreManager` abstract class, the shared base class for most classes in the `Enrich2 `_ project. This class provides general behavior for the GUI and for handling HDF5 data files. 13 | 14 | :py:class:`~enrich2.storemanager.StoreManager` class 15 | ----------------------------------------------------------------- 16 | .. autoclass:: enrich2.storemanager.StoreManager 17 | :members: 18 | 19 | 20 | :py:mod:`~enrich2.seqlib` --- Sequencing library file handling and element counting 21 | =================================================================================== 22 | 23 | .. py:module:: seqlib 24 | :synopsis: Sequencing library file handling and element counting. 25 | 26 | This module provides class definitions for the various types of sequencing library designs usable by `Enrich2 `_. Data for each FASTQ_ file (or pair of overlapping FASTQ_ files for overlapping paired-end data) is read into its own :py:class:`~enrich2.seqlib.SeqLib` object. 
If necessary, FASTQ_ files should be split by index read before being read by a :py:class:`~enrich2.seqlib.SeqLib` object. :py:class:`~enrich2.seqlib.SeqLib` objects are coordinated by :py:mod:`~enrich2.selection.Selection` objects. 27 | 28 | :py:class:`~enrich2.seqlib.SeqLib` and :py:class:`~enrich2.variant.VariantSeqLib` are abstract classes. 29 | 30 | :py:class:`~enrich2.seqlib.SeqLib` class 31 | ------------------------------------------------------- 32 | .. autoclass:: enrich2.seqlib.SeqLib 33 | :members: 34 | 35 | :py:class:`~enrich2.variant.VariantSeqLib` class 36 | ------------------------------------------------------- 37 | .. autoclass:: enrich2.variant.VariantSeqLib 38 | :members: 39 | 40 | :py:class:`~enrich2.barcode.BarcodeSeqLib` class 41 | ------------------------------------------------------- 42 | .. autoclass:: enrich2.barcode.BarcodeSeqLib 43 | :members: 44 | 45 | :py:class:`~enrich2.barcodevariant.BcvSeqLib` class 46 | ------------------------------------------------------------- 47 | .. autoclass:: enrich2.barcodevariant.BcvSeqLib 48 | :members: 49 | 50 | :py:class:`~enrich2.barcodeid.BcidSeqLib` class 51 | ------------------------------------------------------------- 52 | .. autoclass:: enrich2.barcodeid.BcidSeqLib 53 | :members: 54 | 55 | :py:class:`~enrich2.basic.BasicSeqLib` class 56 | ----------------------------------------------------- 57 | .. autoclass:: enrich2.basic.BasicSeqLib 58 | :members: 59 | 60 | :py:class:`~enrich2.overlap.OverlapSeqLib` class 61 | -------------------------------------------------------- 62 | .. autoclass:: enrich2.overlap.OverlapSeqLib 63 | :members: 64 | 65 | :py:class:`~enrich2.idonly.IdOnlySeqLib` class 66 | ------------------------------------------------------------- 67 | .. autoclass:: enrich2.idonly.IdOnlySeqLib 68 | :members: 69 | 70 | :py:class:`~enrich2.seqlib.SeqLib` helper classes 71 | ------------------------------------------------------- 72 | 73 | :py:class:`~enrich2.aligner.Aligner` class 74 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 75 | .. autoclass:: enrich2.aligner.Aligner 76 | :members: 77 | 78 | :py:class:`~enrich2.wildtype.WildTypeSequence` class 79 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 80 | .. autoclass:: enrich2.wildtype.WildTypeSequence 81 | :members: 82 | 83 | :py:class:`~enrich2.barcodemap.BarcodeMap` class 84 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 85 | .. autoclass:: enrich2.barcodemap.BarcodeMap 86 | :members: 87 | 88 | :py:mod:`~enrich2.selection` --- Functional score calculation using SeqLib count data 89 | ===================================================================================== 90 | 91 | .. py:module:: selection 92 | :synopsis: Functional score calculation using :py:class:`~enrich2.seqlib.SeqLib` count data. 93 | 94 | This module provides class definitions for the :py:class:`~enrich2.selection.Selection` class. This is where functional scores are calculated from the :py:class:`~enrich2.seqlib.SeqLib` count data. For time series data, each time point in the selection can have multiple :py:class:`~enrich2.seqlib.SeqLib` assigned to it, in which case the counts for each element will be added together. Each time series selection must have a time point 0 (the "input library"). 95 | 96 | :py:class:`~enrich2.selection.Selection` class 97 | ---------------------------------------------------------- 98 | .. 
autoclass:: enrich2.selection.Selection 99 | :members: 100 | 101 | :py:class:`~enrich2.selection.Selection` helpers 102 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 103 | .. autofunction:: enrich2.selection.regression_apply 104 | 105 | :py:mod:`~enrich2.condition` --- Dummy class for GUI 106 | ======================================================================= 107 | 108 | .. py:module:: condition 109 | :synopsis: Dummy class for GUI. 110 | 111 | This module provides class definitions for the :py:class:`~enrich2.experiment.condition.Condition` classes. This class is required for proper GUI operation. All condition-related behaviors are in the :py:class:`~enrich2.experiment.Experiment` class. 112 | 113 | :py:class:`~enrich2.condition.Condition` class 114 | ----------------------------------------------------------- 115 | .. autoclass:: enrich2.condition.Condition 116 | :members: 117 | 118 | :py:mod:`~enrich2.experiment` --- Aggregation of replicate selections 119 | ======================================================================= 120 | 121 | .. py:module:: experiment 122 | :synopsis: Aggregation of replicate selections. 123 | 124 | This module provides class definitions for the :py:class:`~enrich2.experiment.Experiment`. Functional scores for selections within the same condition are combined to generate a single functional score (and associated error) for each element in each experimental condition. 125 | 126 | :py:class:`~enrich2.experiment.Experiment` class 127 | -------------------------------------------------------------- 128 | .. autoclass:: enrich2.experiment.Experiment 129 | :members: 130 | 131 | Enrich2 plotting 132 | =================================================================== 133 | 134 | .. py:module:: plots 135 | :synopsis: Library for general Enrich2 plotting. 136 | 137 | Text goes here. 138 | 139 | .. automodule:: enrich2.plots 140 | :members: 141 | 142 | Sequence-function map plotting 143 | -------------------------------------------------------------------- 144 | 145 | .. py:module:: sfmap 146 | :synopsis: Library for sequence-function map plotting. 147 | 148 | Text goes here. 149 | 150 | .. automodule:: enrich2.sfmap 151 | :members: 152 | 153 | Utility functions 154 | ==================================================================== 155 | 156 | Configuration object type detection 157 | --------------------------------------------------------------------------- 158 | 159 | .. automodule:: enrich2.config_check 160 | :members: 161 | 162 | Dataframe and index helper functions 163 | ---------------------------------------------------------------------------- 164 | 165 | .. automodule:: enrich2.dataframe 166 | :members: 167 | 168 | .. _api-variant-helper: 169 | 170 | Variant helper functions 171 | ---------------------------------------------------------------------------- 172 | 173 | .. automodule:: enrich2.variant 174 | :members: mutation_count, has_indel, has_unresolvable, protein_variant 175 | 176 | HGVS_ variant regular expressions 177 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 178 | 179 | .. autodata:: enrich2.variant.re_protein 180 | 181 | .. autodata:: enrich2.variant.re_coding 182 | 183 | .. autodata:: enrich2.variant.re_noncoding 184 | 185 | Enrich2 entry points 186 | ==================================================================== 187 | 188 | .. 
automodule:: enrich2.main 189 | :members: 190 | 191 | 192 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Enrich2 documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Dec 23 14:59:50 2013. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.insert(0, os.path.abspath('.')) 20 | sys.path.insert(0, os.path.abspath('..')) 21 | 22 | # -- General configuration ----------------------------------------------------- 23 | 24 | # If your documentation needs a minimal Sphinx version, state it here. 25 | #needs_sphinx = '1.0' 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be extensions 28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 29 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.imgmath', 30 | 'sphinx.ext.inheritance_diagram', 'sphinx.ext.intersphinx', 31 | 'sphinx.ext.todo', 'sphinx.ext.napoleon'] 32 | 33 | # Add any paths that contain templates here, relative to this directory. 34 | templates_path = ['_templates'] 35 | 36 | # The suffix of source filenames. 37 | source_suffix = '.rst' 38 | 39 | # The encoding of source files. 40 | #source_encoding = 'utf-8-sig' 41 | 42 | # The master toctree document. 43 | master_doc = 'index' 44 | 45 | # General information about the project. 46 | project = u'Enrich2' 47 | copyright = u'2016-2024, Alan F Rubin' 48 | 49 | # The version info for the project you're documenting, acts as replacement for 50 | # |version| and |release|, also used in various other places throughout the 51 | # built documents. 52 | # 53 | # The short X.Y version. 54 | version = '2.0' 55 | # The full version, including alpha/beta/rc tags. 56 | release = '2.0.0' 57 | 58 | # The language for content autogenerated by Sphinx. Refer to documentation 59 | # for a list of supported languages. 60 | #language = None 61 | 62 | # There are two options for replacing |today|: either, you set today to some 63 | # non-false value, then it is used: 64 | #today = '' 65 | # Else, today_fmt is used as the format for a strftime call. 66 | #today_fmt = '%B %d, %Y' 67 | 68 | # List of patterns, relative to source directory, that match files and 69 | # directories to ignore when looking for source files. 70 | exclude_patterns = ['_build', 'exported_notebooks'] 71 | 72 | # The reST default role (used for this markup: `text`) to use for all documents. 73 | #default_role = None 74 | 75 | # If true, '()' will be appended to :func: etc. cross-reference text. 76 | #add_function_parentheses = True 77 | 78 | # If true, the current module name will be prepended to all description 79 | # unit titles (such as .. function::). 80 | #add_module_names = True 81 | 82 | # If true, sectionauthor and moduleauthor directives will be shown in the 83 | # output. They are ignored by default. 
84 | #show_authors = False 85 | 86 | # The name of the Pygments (syntax highlighting) style to use. 87 | pygments_style = 'sphinx' 88 | 89 | # A list of ignored prefixes for module index sorting. 90 | #modindex_common_prefix = [] 91 | 92 | 93 | # -- Options for HTML output --------------------------------------------------- 94 | 95 | # The theme to use for HTML and HTML Help pages. See the documentation for 96 | # a list of builtin themes. 97 | #html_theme = 'alabaster' 98 | html_theme = 'sphinx_rtd_theme' 99 | 100 | # Theme options are theme-specific and customize the look and feel of a theme 101 | # further. For a list of options available for each theme, see the 102 | # documentation. 103 | html_theme_options = { 104 | # 'github_user': 'FowlerLab', 105 | # 'github_repo': 'Enrich2', 106 | # 'github_button': 'true', 107 | } 108 | 109 | # Add any paths that contain custom themes here, relative to this directory. 110 | #html_theme_path = [] 111 | 112 | # The name for this set of Sphinx documents. If None, it defaults to 113 | # " v documentation". 114 | #html_title = None 115 | 116 | # A shorter title for the navigation bar. Default is the same as html_title. 117 | #html_short_title = None 118 | 119 | # The name of an image file (relative to this directory) to place at the top 120 | # of the sidebar. 121 | #html_logo = None 122 | 123 | # The name of an image file (within the static path) to use as favicon of the 124 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 125 | # pixels large. 126 | #html_favicon = None 127 | 128 | # Add any paths that contain custom static files (such as style sheets) here, 129 | # relative to this directory. They are copied after the builtin static files, 130 | # so a file named "default.css" will overwrite the builtin "default.css". 131 | html_static_path = ['_static'] 132 | 133 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 134 | # using the given strftime format. 135 | #html_last_updated_fmt = '%b %d, %Y' 136 | 137 | # If true, SmartyPants will be used to convert quotes and dashes to 138 | # typographically correct entities. 139 | #html_use_smartypants = True 140 | 141 | # Custom sidebar templates, maps document names to template names. 142 | #html_sidebars = {} 143 | 144 | # Additional templates that should be rendered to pages, maps page names to 145 | # template names. 146 | #html_additional_pages = {} 147 | 148 | # If false, no module index is generated. 149 | #html_domain_indices = True 150 | 151 | # If false, no index is generated. 152 | #html_use_index = True 153 | 154 | # If true, the index is split into individual pages for each letter. 155 | #html_split_index = False 156 | 157 | # If true, links to the reST sources are added to the pages. 158 | #html_show_sourcelink = True 159 | 160 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 161 | #html_show_sphinx = True 162 | 163 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 164 | #html_show_copyright = True 165 | 166 | # If true, an OpenSearch description file will be output, and all pages will 167 | # contain a tag referring to it. The value of this option must be the 168 | # base URL from which the finished HTML is served. 169 | #html_use_opensearch = '' 170 | 171 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 172 | #html_file_suffix = None 173 | 174 | # Output file base name for HTML help builder. 
175 | htmlhelp_basename = 'Enrich2doc' 176 | 177 | 178 | # -- Options for LaTeX output -------------------------------------------------- 179 | 180 | latex_elements = { 181 | # The paper size ('letterpaper' or 'a4paper'). 182 | #'papersize': 'letterpaper', 183 | 184 | # The font size ('10pt', '11pt' or '12pt'). 185 | #'pointsize': '10pt', 186 | 187 | # Additional stuff for the LaTeX preamble. 188 | #'preamble': '', 189 | } 190 | 191 | # Grouping the document tree into LaTeX files. List of tuples 192 | # (source start file, target name, title, author, documentclass [howto/manual]). 193 | latex_documents = [ 194 | ('index', 'Enrich2.tex', u'Enrich2 Documentation', 195 | u'Alan F Rubin', 'manual'), 196 | ] 197 | 198 | # The name of an image file (relative to this directory) to place at the top of 199 | # the title page. 200 | #latex_logo = None 201 | 202 | # For "manual" documents, if this is true, then toplevel headings are parts, 203 | # not chapters. 204 | #latex_use_parts = False 205 | 206 | # If true, show page references after internal links. 207 | #latex_show_pagerefs = False 208 | 209 | # If true, show URL addresses after external links. 210 | #latex_show_urls = False 211 | 212 | # Documents to append as an appendix to all manuals. 213 | #latex_appendices = [] 214 | 215 | # If false, no module index is generated. 216 | #latex_domain_indices = True 217 | 218 | 219 | # -- Options for manual page output -------------------------------------------- 220 | 221 | # One entry per manual page. List of tuples 222 | # (source start file, name, description, authors, manual section). 223 | man_pages = [ 224 | ('index', 'enrich2', u'Enrich2 Documentation', 225 | [u'Alan F Rubin'], 1) 226 | ] 227 | 228 | # If true, show URL addresses after external links. 229 | #man_show_urls = False 230 | 231 | 232 | # -- Options for Texinfo output ------------------------------------------------ 233 | 234 | # Grouping the document tree into Texinfo files. List of tuples 235 | # (source start file, target name, title, author, 236 | # dir menu entry, description, category) 237 | texinfo_documents = [ 238 | ('index', 'Enrich2', u'Enrich2 Documentation', 239 | u'Alan F Rubin', 'Enrich2', 'One line description of project.', 240 | 'Miscellaneous'), 241 | ] 242 | 243 | # Documents to append as an appendix to all manuals. 244 | #texinfo_appendices = [] 245 | 246 | # If false, no module index is generated. 247 | #texinfo_domain_indices = True 248 | 249 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 250 | #texinfo_show_urls = 'footnote' 251 | 252 | ########## 253 | 254 | intersphinx_mapping = {'python': ('http://docs.python.org/3/', None), 255 | 'pandas': ('http://pandas.pydata.org/pandas-docs/stable', None), 256 | 'matplotlib': ('http://matplotlib.org/', None), 257 | 'numpy': ('http://docs.scipy.org/doc/numpy/', None), 258 | 'scipy': ('http://docs.scipy.org/doc/scipy/reference/', None), 259 | 'statsmodels': ('http://www.statsmodels.org/stable/', None)} 260 | todo_include_todos = True 261 | 262 | rst_epilog = """ 263 | .. Aliases for commonly used web links 264 | 265 | .. _FASTQ: http://en.wikipedia.org/wiki/FASTQ_format 266 | 267 | .. _Araya and Fowler: http://www.pnas.org/content/109/42/16858.abstract 268 | 269 | .. _HGVS: http://www.hgvs.org/mutnomen/recs.html 270 | 271 | .. _matplotlib cmap: http://matplotlib.org/examples/color/colormaps_reference.html 272 | 273 | .. _HDF5: http://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables 274 | 275 | .. 
_Enrich2 manuscript: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-017-1272-5 276 | 277 | .. Replacement aliases for intersphinx library documentation 278 | 279 | .. |mpl_PdfPages| replace:: :py:class:`~matplotlib.backends.backend_pdf.PdfPages` 280 | 281 | .. |mpl_pcolormesh| replace:: :py:func:`~matplotlib.pyplot.pcolormesh` 282 | 283 | .. |pd_DataFrame| replace:: :py:class:`~pandas.DataFrame` 284 | 285 | .. |pd_Series| replace:: :py:class:`~pandas.Series` 286 | """ 287 | -------------------------------------------------------------------------------- /docs/exported_notebooks/README: -------------------------------------------------------------------------------- 1 | This directory contains reST exports of jupyter notebooks, so that the static 2 | notebooks can be included in the documentation. 3 | 4 | reST files in this directory are placed inline using the 'include' directive. 5 | To create these files, download the notebook as reST (requires pandoc). 6 | Image links (for plots) must be hand-edited. Plot images should be renamed and 7 | put in the '_static/notebook_plots/' directory. 8 | 9 | If sphinx generates a lot of errors that look like the following, this may be 10 | caused by C function calls being parsed as reST. Pointer notation looks like 11 | emphasis/strong start-strings. To fix it, change the '.. parsed-literal::' to 12 | the non-parsed version '::'. 13 | WARNING: Inline emphasis start-string without end-string. 14 | WARNING: Inline strong start-string without end-string. 15 | -------------------------------------------------------------------------------- /docs/exported_notebooks/min_count.rst: -------------------------------------------------------------------------------- 1 | 2 | Selecting variants by input library count 3 | ----------------------------------------- 4 | 5 | This notebook gets scores and standard errors for the variants in a 6 | Selection that exceed a minimum count cutoff in the input time point, 7 | and plots the relationship between each variant's score and input count. 8 | 9 | .. code:: python 10 | 11 | % matplotlib inline 12 | 13 | .. code:: python 14 | 15 | from __future__ import print_function 16 | import os.path 17 | import numpy as np 18 | import pandas as pd 19 | import matplotlib.pyplot as plt 20 | from enrich2.variant import WILD_TYPE_VARIANT 21 | import enrich2.plots as enrich_plot 22 | pd.set_option("display.max_rows", 10) # rows shown when pretty-printing 23 | 24 | Modify the ``results_path`` variable in the next cell to match the 25 | output directory of your Enrich2-Example dataset. 26 | 27 | .. code:: python 28 | 29 | results_path = "/path/to/Enrich2-Example/Results/" 30 | 31 | Open the Selection HDF5 file with the variants we are interested in. 32 | 33 | .. code:: python 34 | 35 | my_store = pd.HDFStore(os.path.join(results_path, "Rep1_sel.h5")) 36 | 37 | The ``pd.HDFStore.keys()`` method returns a list of all the tables in 38 | this HDF5 file. 39 | 40 | .. code:: python 41 | 42 | my_store.keys() 43 | 44 | 45 | 46 | 47 | .. 
parsed-literal:: 48 | 49 | ['/main/barcodemap', 50 | '/main/barcodes/counts', 51 | '/main/barcodes/counts_unfiltered', 52 | '/main/barcodes/log_ratios', 53 | '/main/barcodes/scores', 54 | '/main/barcodes/weights', 55 | '/main/synonymous/counts', 56 | '/main/synonymous/counts_unfiltered', 57 | '/main/synonymous/log_ratios', 58 | '/main/synonymous/scores', 59 | '/main/synonymous/weights', 60 | '/main/variants/counts', 61 | '/main/variants/counts_unfiltered', 62 | '/main/variants/log_ratios', 63 | '/main/variants/scores', 64 | '/main/variants/weights'] 65 | 66 | 67 | 68 | We will work with the "/main/variants/counts" table first. Enrich2 69 | names the columns for counts ``c_n`` where ``n`` is the time point, 70 | beginning with ``0`` for the input library. 71 | 72 | We can use a query to extract the subset of variants in the table that 73 | exceed the specified cutoff. Since we're only interested in variants, 74 | we'll explicitly exclude the wild type. We will store the data we 75 | extract in the ``variant_counts`` data frame. 76 | 77 | .. code:: python 78 | 79 | read_cutoff = 10 80 | 81 | .. code:: python 82 | 83 | variant_counts = my_store.select('/main/variants/counts', where='c_0 > read_cutoff and index != WILD_TYPE_VARIANT') 84 | variant_counts 85 | 86 | 87 | 88 | 89 | .. raw:: html 90 | 91 |
    <div>
    <table border="1" class="dataframe">
      <thead>
        <tr><th></th><th>c_0</th><th>c_2</th><th>c_5</th></tr>
      </thead>
      <tbody>
        <tr><th>c.10G>A (p.Ala4Arg), c.11C>G (p.Ala4Arg), c.12T>A (p.Ala4Arg)</th><td>787.0</td><td>106.0</td><td>124.0</td></tr>
        <tr><th>c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn)</th><td>699.0</td><td>80.0</td><td>114.0</td></tr>
        <tr><th>c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn), c.12T>C (p.Ala4Asn)</th><td>94.0</td><td>8.0</td><td>13.0</td></tr>
        <tr><th>c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile)</th><td>1280.0</td><td>137.0</td><td>80.0</td></tr>
        <tr><th>c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile), c.12T>A (p.Ala4Ile)</th><td>717.0</td><td>42.0</td><td>27.0</td></tr>
        <tr><th>...</th><td>...</td><td>...</td><td>...</td></tr>
        <tr><th>c.9T>A (p.=)</th><td>327.0</td><td>217.0</td><td>284.0</td></tr>
        <tr><th>c.9T>C (p.=)</th><td>1947.0</td><td>523.0</td><td>1230.0</td></tr>
        <tr><th>c.9T>C (p.=), c.49A>T (p.Met17Ser), c.50T>C (p.Met17Ser), c.51G>A (p.Met17Ser)</th><td>277.0</td><td>43.0</td><td>5.0</td></tr>
        <tr><th>c.9T>C (p.=), c.62T>C (p.Leu21Ser), c.63A>T (p.Leu21Ser)</th><td>495.0</td><td>138.0</td><td>55.0</td></tr>
        <tr><th>c.9T>G (p.=)</th><td>406.0</td><td>18.0</td><td>20.0</td></tr>
      </tbody>
    </table>
    <p>1440 rows × 3 columns</p>
    </div>
172 | 173 | 174 | 175 | The index of the data frame is the list of variants that exceeded the 176 | cutoff. 177 | 178 | .. code:: python 179 | 180 | variant_counts.index 181 | 182 | 183 | 184 | 185 | .. parsed-literal:: 186 | 187 | Index([u'c.10G>A (p.Ala4Arg), c.11C>G (p.Ala4Arg), c.12T>A (p.Ala4Arg)', 188 | u'c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn)', 189 | u'c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn), c.12T>C (p.Ala4Asn)', 190 | u'c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile)', 191 | u'c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile), c.12T>A (p.Ala4Ile)', 192 | u'c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile), c.12T>C (p.Ala4Ile)', 193 | u'c.10G>A (p.Ala4Lys), c.11C>A (p.Ala4Lys), c.12T>A (p.Ala4Lys)', 194 | u'c.10G>A (p.Ala4Met), c.11C>T (p.Ala4Met), c.12T>G (p.Ala4Met)', 195 | u'c.10G>A (p.Ala4Ser), c.11C>G (p.Ala4Ser)', 196 | u'c.10G>A (p.Ala4Ser), c.11C>G (p.Ala4Ser), c.12T>C (p.Ala4Ser)', 197 | ... 198 | u'c.8C>T (p.Ser3Phe), c.60C>T (p.=)', 199 | u'c.8C>T (p.Ser3Phe), c.9T>C (p.Ser3Phe)', u'c.90C>A (p.=)', 200 | u'c.90C>G (p.Ile30Met)', u'c.90C>T (p.=)', u'c.9T>A (p.=)', 201 | u'c.9T>C (p.=)', 202 | u'c.9T>C (p.=), c.49A>T (p.Met17Ser), c.50T>C (p.Met17Ser), c.51G>A (p.Met17Ser)', 203 | u'c.9T>C (p.=), c.62T>C (p.Leu21Ser), c.63A>T (p.Leu21Ser)', 204 | u'c.9T>G (p.=)'], 205 | dtype='object', length=1440) 206 | 207 | 208 | 209 | We can use this index to get the scores for these variants by querying 210 | the "/main/variants/scores" table. We'll store the result of the query 211 | in a new data frame named ``variant_scores``, and keep only the score 212 | and standard error (SE) columns. 213 | 214 | .. code:: python 215 | 216 | variant_scores = my_store.select('/main/variants/scores', where='index in variant_counts.index') 217 | variant_scores = variant_scores[['score', 'SE']] 218 | variant_scores 219 | 220 | 221 | 222 | 223 | .. raw:: html 224 | 225 |
<table border="1" class="dataframe">
  <thead>
    <tr><th></th><th>score</th><th>SE</th></tr>
  </thead>
  <tbody>
    <tr><th>c.10G>A (p.Ala4Arg), c.11C>G (p.Ala4Arg), c.12T>A (p.Ala4Arg)</th><td>-0.980091</td><td>0.134873</td></tr>
    <tr><th>c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn)</th><td>-0.972035</td><td>0.268962</td></tr>
    <tr><th>c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn), c.12T>C (p.Ala4Asn)</th><td>-1.138667</td><td>0.403767</td></tr>
    <tr><th>c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile)</th><td>-1.875331</td><td>0.014883</td></tr>
    <tr><th>c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile), c.12T>A (p.Ala4Ile)</th><td>-2.552289</td><td>0.421699</td></tr>
    <tr><th>...</th><td>...</td><td>...</td></tr>
    <tr><th>c.9T>A (p.=)</th><td>0.705661</td><td>0.774559</td></tr>
    <tr><th>c.9T>C (p.=)</th><td>0.438654</td><td>0.014857</td></tr>
    <tr><th>c.9T>C (p.=), c.49A>T (p.Met17Ser), c.50T>C (p.Met17Ser), c.51G>A (p.Met17Ser)</th><td>-1.930922</td><td>1.085535</td></tr>
    <tr><th>c.9T>C (p.=), c.62T>C (p.Leu21Ser), c.63A>T (p.Leu21Ser)</th><td>-0.897249</td><td>0.884321</td></tr>
    <tr><th>c.9T>G (p.=)</th><td>-2.314604</td><td>0.671760</td></tr>
  </tbody>
</table>
<p>1440 rows × 2 columns</p>
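As a minor variation on the query above, ``HDFStore.select`` also accepts a ``columns`` argument for table-format stores, so the column selection can be folded into the same call:

.. code:: python

    # select only the score and SE columns directly from the store
    variant_scores = my_store.select('/main/variants/scores',
                                     where='index in variant_counts.index',
                                     columns=['score', 'SE'])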
294 | 295 | 296 | 297 | Now that we're finished getting data out of the HDF5 file, we'll close 298 | it. 299 | 300 | .. code:: python 301 | 302 | my_store.close() 303 | 304 | To more easily explore the relationship between input count and score, 305 | we'll add a column to the ``variant_scores`` data frame that contains 306 | input counts from the ``variant_counts`` data frame. 307 | 308 | .. code:: python 309 | 310 | variant_scores['input_count'] = variant_counts['c_0'] 311 | variant_scores 312 | 313 | 314 | 315 | 316 | .. raw:: html 317 | 318 |
<table border="1" class="dataframe">
  <thead>
    <tr><th></th><th>score</th><th>SE</th><th>input_count</th></tr>
  </thead>
  <tbody>
    <tr><th>c.10G>A (p.Ala4Arg), c.11C>G (p.Ala4Arg), c.12T>A (p.Ala4Arg)</th><td>-0.980091</td><td>0.134873</td><td>787.0</td></tr>
    <tr><th>c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn)</th><td>-0.972035</td><td>0.268962</td><td>699.0</td></tr>
    <tr><th>c.10G>A (p.Ala4Asn), c.11C>A (p.Ala4Asn), c.12T>C (p.Ala4Asn)</th><td>-1.138667</td><td>0.403767</td><td>94.0</td></tr>
    <tr><th>c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile)</th><td>-1.875331</td><td>0.014883</td><td>1280.0</td></tr>
    <tr><th>c.10G>A (p.Ala4Ile), c.11C>T (p.Ala4Ile), c.12T>A (p.Ala4Ile)</th><td>-2.552289</td><td>0.421699</td><td>717.0</td></tr>
    <tr><th>...</th><td>...</td><td>...</td><td>...</td></tr>
    <tr><th>c.9T>A (p.=)</th><td>0.705661</td><td>0.774559</td><td>327.0</td></tr>
    <tr><th>c.9T>C (p.=)</th><td>0.438654</td><td>0.014857</td><td>1947.0</td></tr>
    <tr><th>c.9T>C (p.=), c.49A>T (p.Met17Ser), c.50T>C (p.Met17Ser), c.51G>A (p.Met17Ser)</th><td>-1.930922</td><td>1.085535</td><td>277.0</td></tr>
    <tr><th>c.9T>C (p.=), c.62T>C (p.Leu21Ser), c.63A>T (p.Leu21Ser)</th><td>-0.897249</td><td>0.884321</td><td>495.0</td></tr>
    <tr><th>c.9T>G (p.=)</th><td>-2.314604</td><td>0.671760</td><td>406.0</td></tr>
  </tbody>
</table>
<p>1440 rows × 3 columns</p>
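The combined data frame can also be written to a tab-separated file at this point for use with other tools, such as R or Excel (the file name below is arbitrary):

.. code:: python

    # write the scores and input counts to a TSV file
    variant_scores.to_csv('scores_with_input_counts.tsv', sep='\t')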
399 | 400 | 401 | 402 | Now that all the information is in a single data frame, we can make a 403 | plot of score vs. input count. This example uses functions and colors 404 | from the Enrich2 plotting library. Taking the log10 of the counts makes 405 | the data easier to visualize. 406 | 407 | .. code:: python 408 | 409 | fig, ax = plt.subplots() 410 | enrich_plot.configure_axes(ax, xgrid=True) 411 | ax.plot(np.log10(variant_scores['input_count']), 412 | variant_scores['score'], 413 | linestyle='none', marker='.', alpha=0.6, 414 | color=enrich_plot.plot_colors['bright4']) 415 | ax.set_xlabel("log10(Input Count)") 416 | ax.set_ylabel("Variant Score") 417 | 418 | 419 | 420 | 421 | .. parsed-literal:: 422 | 423 | 424 | 425 | 426 | 427 | 428 | .. image:: _static/notebook_plots/min_count_plot.png 429 | 430 | 431 | -------------------------------------------------------------------------------- /docs/gui.rst: -------------------------------------------------------------------------------- 1 | .. _gui-documentation: 2 | 3 | Using the GUI 4 | ====================== 5 | 6 | The graphical user interface makes it easy to specify an experimental design that Enrich2 can understand. For more information about how these are organized, see :ref:`experimental-designs`. 7 | 8 | Configuring your analysis 9 | ------------------------------------ 10 | 11 | The Enrich2 installer places the graphical user interface (GUI) entry point in your path. Type ``enrich_gui`` from the command line to launch the program. 12 | 13 | .. error:: Mac OS X users running the Enrich2 GUI in a virtualenv may encounter the following error:: 14 | 15 | 2016-10-10 12:34:56.789 python[12345:12345678] -[NSApplication _setup:]: unrecognized selector sent to instance 0x12345abcd 16 | 17 | This is caused by an interaction between Tkinter and the `matplotlib backend `_. To fix the issue, edit (or create) the "~/.matplotlib/matplotlibrc" file and add the line:: 18 | 19 | backend: TkAgg 20 | 21 | .. note:: Once you have created your configuration file, you can also run the program in command line mode. Type ``enrich_cmd --help`` for usage and a list of command line options. 22 | 23 | .. image:: _static/gui_screenshots/empty.png 24 | :alt: The Enrich2 GUI window upon launch 25 | 26 | Click "New..." to create the root object. 27 | 28 | .. image:: _static/gui_screenshots/new_root.png 29 | :alt: The new root object window 30 | 31 | Enter a short but descriptive object name that will not conflict with other objects in the analysis. 32 | 33 | Choose the output directory for the HDF5_, plot, and tab-separated files generated by the analysis. 34 | 35 | Select the appropriate object type: Experiment, Selection if there are no replicates, or SeqLib if you only want to count a single sequencing library. 36 | 37 | If you created a Selection or Experiment root object, select it and click "New..." to add a child object. 38 | 39 | .. image:: _static/gui_screenshots/new_child.png 40 | :alt: Creating a child object 41 | 42 | Conditions and Selections do not have any parameters beyond their names. 43 | 44 | Continue adding child objects until the entire experimental design is represented. 45 | When creating a new SeqLib, choose the appropriate type depending on how the 46 | experiment was performed (see :ref:`intro-seqlibs`). 47 | 48 | .. note:: To avoid re-counting the reads when multiple Selections share the same input library, use the same object name for the input library in each Selection. 
49 | 50 | Most parameters are specified in SeqLib objects, such as the wild type sequence, filtering options, and the location of the sequencing files or counts files (see :ref:`seqlib-configuration`). 51 | 52 | .. note:: Time points can have multiple sequencing libraries, which are added together before scores are calculated. 53 | 54 | .. image:: _static/gui_screenshots/seqlib.png 55 | :alt: Editing a SeqLib object 56 | 57 | Clicking "New..." with a SeqLib object selected will add a sibling SeqLib to the Selection that shares the same FASTQ_ filtering and other options. 58 | 59 | Saving and loading 60 | --------------------------- 61 | 62 | After you have configured the analysis, you can save a configuration file by selecting "Save" or "Save As..." from the File menu. You can also use the File menu to load an existing configuration file by selecting "Open." 63 | 64 | .. note:: If you encounter an error when loading a configuration file, try using a validator such as `JSONLint `_ to identify any issues. 65 | 66 | Context menus 67 | --------------------------- 68 | 69 | Right-clicking on an object will open a context menu with additional actions not covered by the New/Edit/Delete buttons. 70 | 71 | .. image:: _static/gui_screenshots/context_menu.png 72 | :alt: Right-click context menu for a SeqLib 73 | 74 | * Apply FASTQ... 75 | 76 | Copy the FASTQ_ filtering options from the chosen SeqLib to every highlighted SeqLib of the same type. 77 | 78 | .. _analysis-options: 79 | 80 | Analysis options 81 | --------------------- 82 | 83 | These choices are not saved in the configuration file and should be reviewed before running each analysis. For further information about the scoring and normalization methods below, see the `Enrich2 manuscript`_ (a simplified sketch of the log ratio calculation follows these lists). 84 | 85 | Scoring method 86 | +++++++++++++++++++++++ 87 | 88 | * Weighted Least Squares 89 | 90 | Recommended for selections with at least three time points (including the input). 91 | 92 | * Log Ratios (Enrich2) 93 | 94 | Recommended for selections with two time points (input and selected). For selections with more than two time points, the last time point is used as the selected time point. Intermediate time points are not used. 95 | 96 | * Counts Only 97 | 98 | No element scores are calculated. The output contains only element counts. 99 | 100 | * Ordinary Least Squares 101 | 102 | Provided for comparison and legacy support. 103 | 104 | * Log Ratios (Old Enrich) 105 | 106 | Provided for comparison and legacy support. This method is a re-implementation of the previously published `Enrich software `_. Standard errors are not calculated. For selections with more than two time points, the last time point is used as the selected time point. Intermediate time points are not used. 107 | 108 | Normalization method 109 | +++++++++++++++++++++++ 110 | 111 | * Wild Type 112 | 113 | Recommended if your selection has a wild type sequence. Normalizes counts by the wild type count as described in the `Enrich2 manuscript`_. For designs with identifiers instead of variants, the special wild type identifier "_wt" can be used. 114 | 115 | * Library Size (Complete Cases) 116 | 117 | Normalizes counts by the library size. Only elements present in all time points within a selection contribute to the library size. 118 | 119 | * Library Size (All Reads) 120 | 121 | Normalizes counts by the library size. All elements contribute to the library size.
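To make the "Log Ratios (Enrich2)" and "Wild Type" options above concrete, the sketch below computes a two-time-point, wild-type-normalized log ratio with 0.5 added to each count. This is an illustration written for this guide rather than the code Enrich2 runs; the authoritative definitions, including standard errors and the regression-based methods, are in the `Enrich2 manuscript`_.

.. code:: python

    import numpy as np

    def simple_log_ratio_score(c_in, c_sel, wt_in, wt_sel):
        """Illustrative score: change in an element's log ratio relative to
        wild type between the input (c_in, wt_in) and selected (c_sel, wt_sel)
        time points. A sketch only; see the Enrich2 manuscript for the
        published formulas."""
        return (np.log((c_sel + 0.5) / (wt_sel + 0.5))
                - np.log((c_in + 0.5) / (wt_in + 0.5)))

    # an element depleted relative to wild type gets a negative score
    print(simple_log_ratio_score(c_in=787, c_sel=124, wt_in=50000, wt_sel=60000))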
122 | 123 | Other options 124 | +++++++++++++++++++++++ 125 | 126 | * Force Recalculation 127 | 128 | Discards all data that are not raw counts before performing the analysis. See :ref:`output-table-organization` for more about raw counts. 129 | 130 | * Component Outlier Statistics 131 | 132 | Tests whether the score of each barcode differs significantly from that of its assigned variant or identifier. Performs an analogous calculation for variant and synonymous scores. 133 | 134 | .. warning:: Testing for outliers is experimental and very computationally inefficient. 135 | 136 | * Make Plots 137 | 138 | Creates plots for this analysis. 139 | 140 | * Write TSV Files 141 | 142 | Outputs tab-separated files for this analysis. 143 | 144 | Once you've finished selecting your options, click Run Analysis! 145 | 146 | The output directory will contain :ref:`hdf5-files`, :ref:`plots`, and tab-separated files. 147 | 148 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Enrich2: deep mutational scanning data analysis 2 | ============================================================================ 3 | 4 | Enrich2 is a general software tool for processing, analyzing, and visualizing data from deep mutational scanning experiments. 5 | 6 | The software is freely available from https://github.com/FowlerLab/Enrich2/ under the BSD 3-clause license. 7 | 8 | For an example dataset, visit https://github.com/FowlerLab/Enrich2-Example/. 9 | 10 | To cite Enrich2, please reference `A statistical framework for analyzing deep mutational scanning data `_. 11 | 12 | Enrich2 was written by `Alan F Rubin `_ |ORCID_icon| http://orcid.org/0000-0003-1474-605X 13 | with contributions from Chris Macdonald |ORCID_icon| https://orcid.org/0000-0002-0201-8832 for the Python 3-compatible version. 14 | 15 | .. error:: Important notice for users of Enrich2 v1.0 or v1.1 16 | 17 | Enrich2 v1.2.0 corrected an error in the software that, for most datasets, resulted in the standard errors for combined scores being over-estimated. 18 | The counts, scores, and replicate-wise standard errors are unaffected. 19 | 20 | If you have analyzed datasets that contain replicates with a previous version of Enrich2, the easiest way to get the correct standard error values is to delete the experiment HDF5_ file (the file name ends with ``'_exp.h5'``) and re-run the program. 21 | This will recalculate combined scores and standard errors without redoing other parts of the analysis. 22 | 23 | .. |ORCID_icon| image:: _static/iD_icon.png 24 | :target: http://orcid.org 25 | 26 | 27 | .. toctree:: 28 | :hidden: 29 | :maxdepth: 0 30 | 31 | installation 32 | introduction 33 | gui 34 | seqlib_config 35 | output 36 | plots 37 | notebooks 38 | api 39 | 40 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Getting started 2 | ======================================================= 3 | 4 | .. _required packages: 5 | 6 | Required packages 7 | ------------------------------------------------------- 8 | 9 | Enrich2 runs on Python 3 and has the following dependencies: 10 | 11 | * `NumPy `_ 12 | * `SciPy `_ 13 | * `pandas `_ 14 | * `PyTables `_ 15 | * `Statsmodels `_ 16 | * `matplotlib `_ 17 | * `fqfa `_ 18 | 19 | The configuration GUI requires `Tkinter `_. 
Building a local copy of the documentation requires `Sphinx `_. 20 | 21 | .. note:: PyTables may not be installed by your distribution. If you encounter errors, check that the ``tables`` module is present. 22 | 23 | .. note:: Tkinter may not be installed by your distribution. If you encounter errors, try installing ``python3-tk`` or similar using your system package manager. 24 | 25 | Installation and example dataset 26 | ------------------------------------------------------- 27 | 28 | You can install Enrich2 in a new `virtual environment `_ using `pip `_:: 29 | 30 | python3 -m venv e2env 31 | source e2env/bin/activate 32 | pip install enrich2 33 | 34 | To download the example dataset, visit the `Enrich2-Example GitHub repository `_. 35 | Running this preconfigured analysis will create several :ref:`plots`. The :ref:`example-notebooks` demonstrate how to explore the :ref:`hdf5-files`. 36 | 37 | Enrich2 executables 38 | ------------------------------------------------------- 39 | 40 | The Enrich2 installer places two executable scripts into the user's path. Both executables run the same analysis, but through different interfaces. 41 | 42 | * ``enrich_gui`` launches the Enrich2 graphical user interface. This is the recommended way to create a configuration file for Enrich2. See :ref:`gui-documentation` for a step-by-step guide. 43 | 44 | * ``enrich_cmd`` launches the program from the command line. This is recommended for users performing analyses on a remote server who have already created configuration files. For a detailed list of command line options, type ``enrich_cmd --help`` 45 | 46 | 47 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | Defining experiments 2 | ===================================================== 3 | 4 | .. _experimental-designs: 5 | 6 | Experimental designs 7 | ------------------------------------------------------- 8 | 9 | Enrich2 represents deep mutational scanning experimental designs as a tree of objects. The hierarchy of object types is defined below: 10 | 11 | * Experiment 12 | 13 | The root object for most experimental designs. Parent of at least one experimental condition. 14 | 15 | * Condition 16 | 17 | A single experimental condition. Parent of at least one replicate selection performed under the condition. 18 | 19 | * Selection 20 | 21 | A single deep mutational scanning replicate. Parent of at least two sequencing libraries, one or more for each time point/round/bin of the selection. 22 | 23 | * Sequencing library (SeqLib) 24 | 25 | FASTQ_ output or count data from a deep mutational scanning time point/round/bin. Has no children. 26 | 27 | Each experimental design has a single root object, which can be an Experiment, Selection, or SeqLib. With the exception of Conditions, each experimental design object has its own HDF5 file containing its data. 28 | 29 | .. note:: Conditions do not have their own HDF5 file. If there is only one condition, use an Experiment as the root. 30 | 31 | .. image:: _static/cartoons/data_hierarchy.png 32 | :alt: Hierarchy of objects in an experimental design 33 | 34 | The above diagram illustrates an experimental design with two conditions, each with three replicates sampled at three time points (including the input). 35 | 36 | .. 
_intro-elements: 37 | 38 | Elements 39 | ------------------------------------------------------- 40 | 41 | Enrich2 counts elements to quantify their enrichment or depletion in a complex population. The four element types are defined below: 42 | 43 | * Barcode 44 | 45 | A short DNA barcode sequence often used for tagging variants. Stored as the barcode DNA sequence. Barcodes are counted directly from sequencing data. 46 | 47 | * Variant 48 | 49 | A DNA-level variant of the wild type sequence, which can be coding or non-coding. Stored as an HGVS_ string describing the nucleotide and any amino acid differences from the wild type sequence. Variants can be counted either directly from sequencing data or as the sum of counts for linked barcodes as defined by a barcode-variant map. 50 | 51 | * Synonymous 52 | 53 | A protein-level variant of the wild type sequence. Stored as an HGVS_ string describing the amino acid differences from the wild type sequence. Synonymous elements are counted as the sum of counts for variant elements with the same amino acid sequence. Variant elements with the wild type amino acid sequence but a non-wild type DNA sequence are assigned to a special variant. 54 | 55 | * Identifier 56 | 57 | An arbitrary label (such as a target gene name) for barcode assignment. Stored as the label string. Identifiers are counted as the sum of counts for associated barcodes as defined by a barcode-identifier map or specified as counts. 58 | 59 | .. _intro-seqlibs: 60 | 61 | SeqLibs 62 | ------------------------------------------------------- 63 | 64 | Enrich2 implements six types of SeqLib, each supporting different element types and/or methods of sequencing deep mutational scanning populations. 65 | 66 | .. note:: Synonymous elements are only present if the wild type sequence is protein coding. 67 | 68 | * Barcoded Variant 69 | 70 | Contains barcode, variant, and synonymous elements. Each DNA variant in the experiment is linked to one or more DNA barcode sequences. A barcode-variant map describes which barcodes map to each variant. The FASTQ_ file contains only barcode sequences. 71 | 72 | * Barcoded Identifier 73 | 74 | Contains barcode and identifier elements. Each identifier in the experiment is associated with one or more DNA barcode sequences. A barcode-identifier map describes which barcodes map to each identifier. The FASTQ_ file contains only barcode sequences. 75 | 76 | * Overlap 77 | 78 | Contains variant and synonymous elements. DNA variants are sequenced directly using overlapping paired-end reads. Requires FASTQ_ files for both forward and reverse reads. 79 | 80 | * Basic 81 | 82 | Contains variant and synonymous elements. DNA variants are sequenced directly using single-end reads. 83 | 84 | * Barcodes Only 85 | 86 | Contains barcode elements. The FASTQ_ file contains only barcode sequences. 87 | 88 | * Identifiers Only 89 | 90 | Contains identifier elements. No FASTQ_ file is processed, so the counts must be provided by the user. 91 | 92 | For more information, see :ref:`seqlib-configuration`. 93 | 94 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Enrich2.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Enrich2.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 
113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /docs/notebooks.rst: -------------------------------------------------------------------------------- 1 | .. _example-notebooks: 2 | 3 | Example notebooks 4 | ==================================== 5 | 6 | Begin exploring Enrich2 datasets with the following notebooks. They rely on the `Enrich2 example dataset `_, so please perform that analysis before running any of these notebooks locally. 7 | 8 | The notebooks can be run interactively by using the command line to navigate to the "Enrich2/docs/notebooks" directory and enter ``jupyter notebook `` where ```` is the notebook file name. 9 | 10 | The first two notebooks demonstrate using pandas to open an HDF5 file, extract its contents into a data frame, and perform queries on tables in the HDF5 file. For more information, see the `pandas HDF5 documentation `_. 11 | 12 | .. include:: exported_notebooks/min_count.rst 13 | 14 | .. include:: exported_notebooks/unique_barcodes.rst 15 | 16 | For more information on Enrich2 data tables, see :ref:`hdf5-files`. 17 | -------------------------------------------------------------------------------- /docs/output.rst: -------------------------------------------------------------------------------- 1 | .. 
_hdf5-files: 2 | 3 | Output HDF5 files 4 | ======================================= 5 | 6 | Enrich2 stores data in an HDF5 file for each Experiment, Selection, and SeqLib analysis object. The name of the HDF5 file is the object's name plus the suffix "_<type>.h5", where <type> is the object type ("exp", "sel", or "lib"). 7 | Each file has multiple tables that can be queried and retrieved as pandas data frames (see :ref:`example-notebooks`). 8 | 9 | Each Experiment, Selection, and SeqLib has its own directory inside "Results/tsv/" containing tab-separated value files for users who want to work with other tools, such as R or Excel. 10 | 11 | .. _output-table-organization: 12 | 13 | Table organization 14 | --------------------------------------------------- 15 | 16 | HDF5 files organize tables into groups like directories in a file system. Enrich2 has two top-level groups, "/main" (used for most tables) and "/raw" (used exclusively in SeqLibs to store raw counts). The first subgroup is typically the element type (variant, barcode, etc.), followed by the kind of data (counts, scores, etc.). 17 | 18 | .. note:: When the "Force recalculation" analysis option is chosen, the "/main" tables are deleted from all HDF5 files in this analysis, and regenerated based on the "/raw" count data. 19 | 20 | Enrich2 uses NaN (Not a Number) values to represent missing data, such as zero counts or scores that could not be calculated. 21 | 22 | List of tables by object type 23 | ------------------------------------------------------- 24 | 25 | Experiment 26 | +++++++++++++++++++++++ 27 | 28 | Most experiment tables use a pandas MultiIndex for column names. The MultiIndex levels are: condition, selection (if applicable), and data value. See the `pandas advanced indexing documentation `_ for more information on how to work with these objects. 29 | 30 | * "/main/<element>/counts" 31 | 32 | Counts of elements that appear in at least one time point in the experiment. 33 | 34 | * "/main/<element>/scores" 35 | 36 | Condition-level scores, standard errors, and epsilon (change in the standard error after the last iteration of the random-effects model) for all elements scored in all selections of at least one condition. 37 | 38 | * "/main/<element>/scores_shared" 39 | 40 | Selection-level scores and standard errors for each element with at least one condition-level score. 41 | 42 | * "/main/<element>/scores_shared_full" 43 | 44 | Selection-level scores and standard errors for each element scored in at least one selection. 45 | 46 | * "/main/<element>/scores_pvalues_wt" 47 | 48 | z-scores and p-values for each variant or synonymous element with a condition-level score. The null hypothesis is that the element's score is equal to wild type. 49 | 50 | * "/main/barcodemap" 51 | 52 | Barcode-variant or barcode-identifier map for all barcodes that appear in the Experiment. Only present for Barcoded Variant or Barcoded Identifier SeqLibs. 53 | 54 | Selection 55 | +++++++++++++++++++++++ 56 | 57 | * "/main/<element>/counts" 58 | 59 | Counts of elements that appear in all time points in the selection. 60 | 61 | * "/main/<element>/counts_unfiltered" 62 | 63 | Counts of elements that appear in at least one time point in the selection. 64 | 65 | * "/main/<element>/scores" 66 | 67 | Scores, standard errors, standard error percentiles, and method-specific values (e.g. regression slope and intercept) for all elements counted in all time points in the selection. 68 | 69 | * "/main/<element>/weights" 70 | 71 | Regression weights for each element at each time point in weighted least squares regression.
72 | 73 | * "/main//log_ratios" 74 | 75 | Y-values for each element at each time point in weighted and ordinary least squares regression. 76 | 77 | * "/main/barcodemap" 78 | 79 | Barcode-variant or barcode-identifier map for all barcodes that appear in the Selection. Only present for Barcoded Variant or Barcoded Identifier SeqLibs. 80 | 81 | SeqLib 82 | +++++++++++++++++++++++ 83 | 84 | * "/main//counts" 85 | 86 | Counts of elements after minimum count filtering and barcode mapping. 87 | 88 | * "/raw//counts" 89 | 90 | Counts of elements taken directly from the FASTQ_ data. 91 | 92 | * "/raw/filter" 93 | 94 | Number of reads removed for each FASTQ_ filtering option. 95 | 96 | * "/raw/barcodemap" 97 | 98 | Barcode-variant or barcode-identifier map for barcodes that appear in this SeqLib. Only present for Barcoded Variant or Barcoded Identifier SeqLibs. 99 | -------------------------------------------------------------------------------- /docs/plots.rst: -------------------------------------------------------------------------------- 1 | .. _plots: 2 | 3 | Automatically generated plots 4 | ================================================= 5 | 6 | In addition to providing structured output to allow users to create their own plots, Enrich2 produces default visualizations for each analysis. Experiment, Selection, and SeqLib objects each have their own directory inside "Results/plots/". Plots are saved in PDF format, and many of the files contain multiple pages. 7 | 8 | Experiment plots 9 | ------------------------------------------- 10 | 11 | * Sequence-function map 12 | 13 | .. image:: _static/plots/sfmap.png 14 | 15 | Visualization of scores and standard errors for single changes from wild type. Separate protein- and nucleotide-level sequence-function maps are generated. 16 | 17 | Cell color indicates the score for the single change (row) at the given position (column). Positive scores (in red) indicate better performance in the assay, and negative scores (in blue) indicate worse performance. Grey squares denote changes that were not measured. Diagonal lines in each cell represent the standard error for the score, and are scaled such that the highest standard error on the plot covers the entire diagonal. Standard errors that are less than 2% of this maximum value are not plotted. Cells containing circles have the wild type residue at that position. 18 | 19 | .. _sfmap_aa_file: 20 | 21 | Custom amino acid ordering and groups can be specified by running Enrich2 in command line mode and using the ``--sfmap-aa-file`` option. Each line of the file begins with an optional label followed by a single tab character and then a comma-separated list of single-letter amino acid codes. All amino acid codes must be present exactly once. 22 | 23 | The following amino acid grouping files are provided: 24 | 25 | Default (:download:`click to download <_static/sfmap_aa_files/aagroup_default.txt>`) 26 | 27 | This grouping is used when no file is specified. `Reference `__ 28 | 29 | .. literalinclude:: _static/sfmap_aa_files/aagroup_default.txt 30 | 31 | Helical Propensity (:download:`click to download <_static/sfmap_aa_files/aagroup_helical_propensity.txt>`) 32 | 33 | `Reference `__ 34 | 35 | .. literalinclude:: _static/sfmap_aa_files/aagroup_helical_propensity.txt 36 | 37 | Selection plots 38 | -------------------------------------------- 39 | 40 | * Sequence-function map 41 | 42 | As above. 43 | 44 | * Diversity map 45 | 46 | .. 
image:: _static/plots/diversity.png 47 | 48 | Variant frequencies are visualized in the style of a sequence-function map. Separate protein- and nucleotide-level diversity maps for each time point are generated. 49 | 50 | Custom amino acid ordering and groups can be specified by running Enrich2 in command line mode and using the ``--sfmap-aa-file`` option. :ref:`See above ` for more details. 51 | 52 | * Counts per time point 53 | 54 | .. image:: _static/plots/selection_counts.png 55 | 56 | Bar plots showing the total element count in each time point. One plot for each element type. 57 | 58 | * Representative regression fits 59 | 60 | .. image:: _static/plots/se_pctile.png 61 | 62 | Present for linear regression scoring methods only. Linear fits for the element closest to each 5th percentile (0, 5, 10, ..., 95, 100). Used for diagnostic purposes and setting standard error filtering cutoffs. One plot for each element type. 63 | 64 | * Regression weights 65 | 66 | .. image:: _static/plots/regression_weights.png 67 | 68 | Present for weighted linear regression scoring method only. Boxplot of regression weights for each time point. Dashed line indicates uniform weight. One plot for each element type. 69 | 70 | * Volcano plot 71 | 72 | .. image:: _static/plots/volcano.png 73 | 74 | Present for linear regression scoring methods with variants only. Volcano plot of the raw p-value from a z-test under the null hypothesis that the element behaves the same as wild type vs. the element's score. One plot for each element type. 75 | 76 | * Wild type shape 77 | 78 | .. image:: _static/plots/wt_shape.png 79 | 80 | Present for linear regression scoring methods with variants only. Plot of the non-normalized linear fit of the wild type. Used to assess the effect of wild type correction. 81 | 82 | SeqLib plots 83 | ---------------------------------------------- 84 | 85 | * Counts per element 86 | 87 | .. image:: _static/plots/seqlib_counts.png 88 | 89 | Histogram of element counts. Two plots for each element type, one with log-transformed x-axis and one without. 90 | 91 | * Unique barcodes per element 92 | 93 | .. image:: _static/plots/barcodes_per_variant.png 94 | 95 | Present for Barcoded Variant and Barcoded Identifier SeqLibs only. Histogram of unique barcodes per variant or identifier. 96 | 97 | * Mismatches in overlapping reads 98 | 99 | .. image:: _static/plots/overlap_mismatches.png 100 | 101 | Present for Overlap SeqLibs only. Barplot of the number of resolved and unresolved mismatches at each position in the overlap region, and the number of times the first mismatch in a read pair occured at each position. Used for diagnosing misalignment of overlapping reads. 102 | -------------------------------------------------------------------------------- /docs/seqlib_config.rst: -------------------------------------------------------------------------------- 1 | .. _seqlib-configuration: 2 | 3 | SeqLib configuration details 4 | ================================ 5 | 6 | Most parameters are specified within SeqLib objects. Experiment, Condition, and Selection objects have only a name (and output directory if at the root). :ref:`analysis-options`, such as scoring method, are chosen at run time. 
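The GUI saves these settings as a JSON configuration file. As a rough illustration of how the parameters described in this section are grouped, the sketch below builds the kind of nested dictionary that a Barcoded Identifier SeqLib's ``configure()`` method reads. The ``fastq``, ``barcodes``, and ``identifiers`` key names are taken from ``enrich2/barcode.py`` and ``enrich2/barcodeid.py``; the ``name`` and ``timepoint`` keys, the filter contents, and all values are illustrative assumptions.

.. code:: python

    import json

    # hypothetical Barcoded Identifier SeqLib configuration fragment
    seqlib_cfg = {
        "name": "input_rep1",                          # assumed top-level key
        "timepoint": 0,                                # assumed top-level key
        "fastq": {
            "reads": "input_rep1_barcodes.fq.gz",
            "reverse": False,
            "filters": {},                             # read filtering options
        },
        "barcodes": {
            "min count": 3,
            "map file": "barcode_identifier_map.txt.bz2",
        },
        "identifiers": {
            "min count": 5,
        },
    }

    print(json.dumps(seqlib_cfg, indent=2))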
7 | 8 | Sequencing libraries have :ref:`general-seqlib-parameters`, :ref:`sequence-file-seqlib-parameters`, and other parameter groups depending on the type: 9 | 10 | +----------------------+---------+---------+------------+---------+ 11 | | SeqLib type | Barcode | Variant | Identifier | Overlap | 12 | +======================+=========+=========+============+=========+ 13 | | Barcoded Variant | X | X | | | 14 | +----------------------+---------+---------+------------+---------+ 15 | | Barcoded Identifier | X | | X | | 16 | +----------------------+---------+---------+------------+---------+ 17 | | Overlap | | X | | X | 18 | +----------------------+---------+---------+------------+---------+ 19 | | Basic | | X | | | 20 | +----------------------+---------+---------+------------+---------+ 21 | | Barcodes Only | X | | | | 22 | +----------------------+---------+---------+------------+---------+ 23 | | Identifiers Only | | | X | | 24 | +----------------------+---------+---------+------------+---------+ 25 | 26 | See :ref:`intro-seqlibs` for descriptions of each type. 27 | 28 | .. _general-seqlib-parameters: 29 | 30 | General parameters 31 | -------------------------------- 32 | 33 | * Name 34 | 35 | The object name should be short, descriptive, and not conflict with other object names in the analysis. 36 | 37 | * Output Directory 38 | 39 | Path to the output directory. This field only appears for the root object. 40 | 41 | * Time Point 42 | 43 | The time point must be an integer. All Selections require an input library as time point 0. Time point values may refer to the round of selection or hour of sampling. 44 | 45 | * Counts File 46 | 47 | Required for Counts File Mode. Path to an HDF5 file or tab-separated value file that contains counts for this time point. Raw counts from that file will be used for this SeqLib. If an HDF5 file is provided, all tables in the "raw/" group are copied. Sequence file parameters will be ignored. The file must have the suffix ".h5" for HDF5 or one of ".txt" ".tsv" or ".csv" for tab-separated value files. 48 | 49 | .. note:: Tab-separated value files must have exactly two columns separated by a tab. The first line of the file must have the column heading "counts" preceded by a single tab character. The first column contains the barcode, identifier, or HGVS variant string depending on the type of raw counts required by the SeqLib type. The second column contains the count for that element. 50 | 51 | .. _sequence-file-seqlib-parameters: 52 | 53 | Sequence file parameters 54 | -------------------------------- 55 | 56 | Enrich2 accepts sequence files in FASTQ_ format. These files may be processed while compressed with gzip or bzip2. The file must have the suffix ".fq" or ".fastq" before compression. 57 | 58 | * Reads 59 | 60 | Required for FASTQ_ File Mode. Path to a FASTQ_ file containing the sequencing reads. For overlap SeqLibs, there are fields for Forward Reads and Reverse Reads. 61 | 62 | * Reverse 63 | 64 | Checking this box will reverse-complement reads before analysis. Not present for Overlap SeqLibs. 65 | 66 | Read filtering parameters 67 | ++++++++++++++++++++++++++++++++++ 68 | 69 | Filters are applied after read trimming and any read merging. 70 | 71 | * Minimum Quality 72 | 73 | Minimum single-base quality. If a single base in the read has a quality score below this value, the read will be discarded. 74 | 75 | * Average Quality 76 | 77 | Average read quality. 
If the mean quality score of all bases in the read is below this value, the read will be discarded. 78 | 79 | * Maximum N's 80 | 81 | Maximum number of N nucleotides. If the read contains more than this number of bases called as N, the read will be discarded. This should be set to 0 in most cases. 82 | 83 | * Remove Unresolvable Overlaps 84 | 85 | Present for Overlap SeqLibs only. Checking this box discards merged reads with unresolvable discrepant bases (see :ref:`overlap-seqlib-parameters`). 86 | 87 | * Maximum Mutations 88 | 89 | Present for SeqLibs with variants only. Maximum number of mutations. If the variant contains more than this number of differences from wild type, the variant is discarded (or aligned if that option is enabled under :ref:`variant-seqlib-parameters`). 90 | 91 | .. _barcode-seqlib-parameters: 92 | 93 | Barcode parameters 94 | -------------------------------- 95 | 96 | * Barcode-variant File 97 | 98 | Not present for barcode-only SeqLibs. Path to a tab-separated file in which each line contains a barcode followed by its identifier or linked variant DNA sequence. This file may be processed while compressed with gzip or bzip2. 99 | 100 | * Minimum Count 101 | 102 | Minimum barcode count. If the barcode has fewer counts than this value, it will not be scored and will not contribute to counts of its variant or identifier. 103 | 104 | * Trim Start 105 | 106 | Position of the first base to keep when trimming barcodes. All subsequent bases are kept if Trim Length is not specified. Reverse-complementing occurs before trimming. Bases are numbered starting at 1. 107 | 108 | * Trim Length 109 | 110 | Number of bases to keep when trimming barcodes. Starts at the first base if Trim Start is not specified. Reverse-complementing occurs before trimming. 111 | 112 | .. _variant-seqlib-parameters: 113 | 114 | Variant parameters 115 | -------------------------------- 116 | 117 | * Wild Type Sequence 118 | 119 | The wild type DNA sequence. This sequence will be compared to reads or the barcode-variant map when calling variants. All sequences must have the same length and starting position. 120 | 121 | * Wild Type Offset 122 | 123 | Integer added to every variant nucleotide position. Used to place variants in the context of a larger sequence. 124 | 125 | * Protein Coding 126 | 127 | Checking this box will interpret the wild type sequence as protein coding. The wild type sequence must be in frame. 128 | 129 | * Use Aligner 130 | 131 | Checking this box will enable Needleman-Wunsch alignment. Insertion and deletion events will be called. 132 | 133 | .. warning:: Using the aligner will dramatically increase run time, and is not recommended for most users. 134 | 135 | * Minimum Count 136 | 137 | Minimum variant count. If the variant has fewer counts than this value, it will not be scored and will not contribute to counts of any synonymous elements. 138 | 139 | .. _identifier-seqlib-parameters: 140 | 141 | Identifier parameters 142 | -------------------------------- 143 | 144 | * Minimum Count 145 | 146 | Minimum identifier count. If the identifier has fewer counts than this value, it will not be scored. 147 | 148 | .. _overlap-seqlib-parameters: 149 | 150 | Overlap parameters 151 | -------------------------------- 152 | 153 | Overlapping read pairs reduce the likelihood of calling sequencing errors as variants. Paired-end Illumina reads are generated such that they overlap in the target region. 
154 | 155 | When Enrich2 combines forward and reverse reads into merged reads, base quality values in the overlapping region are defined as the higher quality value at each position. Mismatches are resolved by assuming the base with the higher quality value is correct. If mismatched bases have the same quality value, the position is considered unresolvable and replaced by an 'X' base. 156 | 157 | * Forward Start 158 | 159 | Position of the first overlapping base in the forward read. Bases are numbered starting at 1. 160 | 161 | * Reverse Start 162 | 163 | Position of the first overlapping base in the reverse read before reverse complementing. Bases are numbered starting at 1. 164 | 165 | * Overlap Length 166 | 167 | Number of bases in the overlapping region. 168 | 169 | * Maximum Mismatches 170 | 171 | Maximum number of mismatches in the overlapping region. If a merged read has more than this number of mismatches, the read pair will be discarded. 172 | 173 | * Overlap Only 174 | 175 | Checking this box will trim the merged reads to the overlapping region. 176 | 177 | -------------------------------------------------------------------------------- /enrich2/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "2.0.2" 2 | -------------------------------------------------------------------------------- /enrich2/aligner.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for alignment of variants to the wild type sequence. 3 | 4 | This module is optional, and using it will dramatically increase runtime when 5 | counting variants. It is only recommended for users who need to count 6 | insertion and deletion variants (i.e. not coding sequences). 7 | """ 8 | 9 | import numpy as np 10 | 11 | #: Default similarity matrix used by the aligner. 12 | #: User-defined matrices must have this format. 13 | _simple_similarity = { 14 | "A": {"A": 1, "C": -1, "G": -1, "T": -1, "N": 0, "X": 0}, 15 | "C": {"A": -1, "C": 1, "G": -1, "T": -1, "N": 0, "X": 0}, 16 | "G": {"A": -1, "C": -1, "G": 1, "T": -1, "N": 0, "X": 0}, 17 | "T": {"A": -1, "C": -1, "G": -1, "T": 1, "N": 0, "X": 0}, 18 | "N": {"A": 0, "C": 0, "G": 0, "T": 0, "N": 0, "X": 0}, 19 | "X": {"A": 0, "C": 0, "G": 0, "T": 0, "N": 0, "X": 0}, 20 | "gap": -1, 21 | } 22 | 23 | 24 | class Aligner(object): 25 | """ 26 | Class for performing local alignment of two DNA sequences. 27 | 28 | This class implements `Needleman-Wunsch `_ local alignment. 30 | 31 | The :py:class:`~aligner.Aligner` requires a scoring matrix when 32 | created. The format is a nested dictionary, with a special ``'gap'`` entry 33 | for the gap penalty (this value is used for both gap opening and gap 34 | extension). 35 | 36 | The ``'X'`` nucleotide is a special case for unresolvable mismatches in 37 | :py:class:`~overlap.OverlapSeqLib` variant data. 
38 | """ 39 | 40 | _MAT = 1 # match 41 | _INS = 2 # insertion (with respect to wild type) 42 | _DEL = 3 # deletion (with respect to wild type) 43 | _END = 4 # end of traceback 44 | 45 | def __init__(self, similarity=_simple_similarity): 46 | similarity_keys = list(similarity.keys()) 47 | if "gap" in similarity_keys: 48 | similarity_keys.remove("gap") 49 | for key in similarity_keys: 50 | if not all(x in similarity[key] for x in similarity_keys) or len( 51 | similarity[key] 52 | ) != len(similarity_keys): 53 | raise ValueError("Asymmetrical alignment scoring matrix") 54 | 55 | self.similarity = similarity 56 | if "gap" not in self.similarity: 57 | raise ValueError("No gap penalty in alignment scoring matrix.") 58 | 59 | self.matrix = None 60 | self.seq1 = None 61 | self.seq2 = None 62 | self.calls = 0 63 | 64 | def align(self, seq1, seq2): 65 | """ 66 | Aligns the two sequences, *seq1* and *seq2* and returns a list of 67 | tuples describing the differences between the sequences. 68 | 69 | The tuple format is ``(i, j, type, length)``, where ``i`` and ``j`` 70 | are the positions in *seq1* and *seq2*, respectively, and type is one 71 | of ``"match"``, ``"mismatch"``, ``"insertion"``, or ``"deletion"``. 72 | For indels, the ``length`` value is the number of bases inserted or 73 | deleted with respect to *seq1* starting at ``i``. 74 | """ 75 | self.matrix = np.ndarray( 76 | shape=(len(seq1) + 1, len(seq2) + 1), 77 | dtype=np.dtype([("score", int), ("trace", np.byte)]), 78 | ) 79 | seq1 = seq1.upper() 80 | seq2 = seq2.upper() 81 | 82 | # build matrix of scores/traceback information 83 | for i in range(len(seq1) + 1): 84 | self.matrix[i, 0] = (self.similarity["gap"] * i, Aligner._DEL) 85 | for j in range(len(seq2) + 1): 86 | self.matrix[0, j] = (self.similarity["gap"] * j, Aligner._INS) 87 | for i in range(1, len(seq1) + 1): 88 | for j in range(1, len(seq2) + 1): 89 | match = ( 90 | self.matrix[i - 1, j - 1]["score"] 91 | + self.similarity[seq1[i - 1]][seq2[j - 1]], 92 | Aligner._MAT, 93 | ) 94 | delete = ( 95 | self.matrix[i - 1, j]["score"] + self.similarity["gap"], 96 | Aligner._DEL, 97 | ) 98 | insert = ( 99 | self.matrix[i, j - 1]["score"] + self.similarity["gap"], 100 | Aligner._INS, 101 | ) 102 | self.matrix[i, j] = max(delete, insert, match, key=lambda x: x[0]) 103 | self.matrix[0, 0] = (0, Aligner._END) 104 | 105 | # calculate alignment from the traceback 106 | i = len(seq1) 107 | j = len(seq2) 108 | traceback = list() 109 | while i > 0 or j > 0: 110 | if self.matrix[i, j]["trace"] == Aligner._MAT: 111 | if seq1[i - 1] == seq2[j - 1]: 112 | traceback.append((i - 1, j - 1, "match", None)) 113 | else: 114 | traceback.append((i - 1, j - 1, "mismatch", None)) 115 | i -= 1 116 | j -= 1 117 | elif self.matrix[i, j]["trace"] == Aligner._INS: 118 | traceback.append((i - 1, j - 1, "insertion", 1)) 119 | j -= 1 120 | elif self.matrix[i, j]["trace"] == Aligner._DEL: 121 | traceback.append((i - 1, j - 1, "deletion", 1)) 122 | i -= 1 123 | elif self.matrix[i, j]["trace"] == Aligner._END: 124 | pass 125 | else: 126 | raise RuntimeError("Invalid value in alignment traceback.") 127 | traceback.reverse() 128 | 129 | # combine indels 130 | indel = None 131 | traceback_combined = list() 132 | for t in traceback: 133 | if t[2] == "insertion" or t[2] == "deletion": 134 | if indel is not None: 135 | if t[2] == indel[2]: 136 | indel[3] += t[3] 137 | else: 138 | raise RuntimeError( 139 | "Aligner failed to combine indels. " "Check gap penalty." 
140 | ) 141 | else: 142 | indel = list(t) 143 | else: 144 | if indel is not None: 145 | traceback_combined.append(tuple(indel)) 146 | indel = None 147 | traceback_combined.append(t) 148 | if indel is not None: 149 | traceback_combined.append(tuple(indel)) 150 | 151 | self.calls += 1 152 | return traceback_combined 153 | -------------------------------------------------------------------------------- /enrich2/barcode.py: -------------------------------------------------------------------------------- 1 | 2 | import logging 3 | import sys 4 | from .seqlib import SeqLib 5 | from fqfa import open_compressed, parse_fastq_reads, has_fastq_ext 6 | 7 | 8 | class BarcodeSeqLib(SeqLib): 9 | """ 10 | Class for count data from barcoded sequencing libraries. Designed for 11 | barcode-only scoring or as a parent class for 12 | :py:class:`~seqlib.barcodevariant.BcvSeqLib` and 13 | :py:class:`~seqlib.barcodeid.BcidSeqLib`. 14 | """ 15 | 16 | treeview_class_name = "Barcode SeqLib" 17 | 18 | def __init__(self): 19 | # Init step handled by VariantSeqLib's init for Barcode-variant 20 | if type(self).__name__ != "BcvSeqLib": 21 | SeqLib.__init__(self) 22 | self.reads = None 23 | self.reverse_complement_reads = False 24 | self.trim_start = 1 25 | self.trim_length = sys.maxsize 26 | self.barcode_min_count = 0 27 | self.add_label("barcodes") 28 | self.logger = logging.getLogger("{}.{}".format(__name__, self.__class__)) 29 | 30 | def configure(self, cfg): 31 | """ 32 | Set up the object using the config object *cfg*, usually derived from 33 | a ``.json`` file. 34 | """ 35 | SeqLib.configure(self, cfg) 36 | self.logger = logging.getLogger( 37 | "{}.{} - {}".format(__name__, self.__class__.__name__, self.name) 38 | ) 39 | 40 | # handle non-FASTQ config options 41 | try: 42 | if "min count" in cfg["barcodes"]: 43 | self.barcode_min_count = int(cfg["barcodes"]["min count"]) 44 | except KeyError as key: 45 | raise KeyError("Missing required config value {}".format(key), self.name) 46 | 47 | # if counts are specified, copy them later 48 | # else handle the FASTQ config options and check the files 49 | if self.counts_file is None: 50 | self.configure_fastq(cfg) 51 | try: 52 | if not has_fastq_ext(self.reads): 53 | raise ValueError( 54 | "FASTQ file error: unrecognized file extension", self.name 55 | ) 56 | except IOError as fqerr: 57 | raise IOError("FASTQ file error: {}".format(fqerr), self.name) 58 | 59 | def serialize(self): 60 | """ 61 | Format this object (and its children) as a config object suitable for 62 | dumping to a config file. 63 | """ 64 | cfg = SeqLib.serialize(self) 65 | 66 | cfg["barcodes"] = dict() 67 | if self.barcode_min_count > 0: 68 | cfg["barcodes"]["min count"] = self.barcode_min_count 69 | 70 | cfg["fastq"] = self.serialize_fastq() 71 | 72 | return cfg 73 | 74 | def configure_fastq(self, cfg): 75 | """ 76 | Set up the object's FASTQ_ file handling and filtering options. 77 | """ 78 | try: 79 | self.reads = cfg["fastq"]["reads"] 80 | self.reverse_complement_reads = cfg["fastq"]["reverse"] 81 | 82 | if "start" in cfg["fastq"]: 83 | self.trim_start = cfg["fastq"]["start"] 84 | 85 | if "length" in cfg["fastq"]: 86 | self.trim_length = cfg["fastq"]["length"] 87 | 88 | self.filters = cfg["fastq"]["filters"] 89 | except KeyError as key: 90 | raise KeyError("Missing required config value {}".format(key), self.name) 91 | 92 | def serialize_fastq(self): 93 | """ 94 | Serialize this object's FASTQ_ file handling and filtering options. 
95 | """ 96 | fastq = { 97 | "reads": self.reads, 98 | "reverse": self.reverse_complement_reads, 99 | "filters": self.serialize_filters(), 100 | } 101 | if self.trim_start > 1: 102 | fastq["start"] = self.trim_start 103 | 104 | if self.trim_length < sys.maxsize: 105 | fastq["length"] = self.trim_length 106 | 107 | return fastq 108 | 109 | def counts_from_reads(self): 110 | """ 111 | Reads the forward or reverse FASTQ_ file (reverse reads are 112 | reverse-complemented), performs quality-based filtering, and counts 113 | the barcodes. 114 | 115 | Barcode counts after read-level filtering are stored under 116 | ``"/raw/barcodes/counts"``. 117 | """ 118 | df_dict = dict() 119 | 120 | filter_flags = dict() 121 | for key in self.filters: 122 | filter_flags[key] = False 123 | 124 | # count all the barcodes 125 | self.logger.info("Counting barcodes") 126 | with open_compressed(self.reads) as handle: 127 | for fq in parse_fastq_reads(handle): 128 | fq.trim(start=self.trim_start, end=self.trim_start + self.trim_length -1) 129 | if self.reverse_complement_reads: 130 | fq.reverse_complement() 131 | 132 | if self.read_quality_filter(fq): # passed filtering 133 | try: 134 | df_dict[fq.sequence.upper()] += 1 135 | except KeyError: 136 | df_dict[fq.sequence.upper()] = 1 137 | 138 | self.save_counts("barcodes", df_dict, raw=True) 139 | del df_dict 140 | 141 | def calculate(self): 142 | """ 143 | Counts the barcodes from the FASTQ file or from the provided counts 144 | file depending on the config. 145 | 146 | Barcodes that pass the minimum count 147 | filtering are stored under ``"/main/barcodes/counts"``. 148 | 149 | If ``"/main/barcodes/counts"`` already exists, those will be used 150 | instead of re-counting. 151 | """ 152 | if self.check_store("/main/barcodes/counts"): 153 | return 154 | 155 | # no raw counts present 156 | if not self.check_store("/raw/barcodes/counts"): 157 | if self.counts_file is not None: 158 | self.counts_from_file(self.counts_file) 159 | else: 160 | self.counts_from_reads() 161 | 162 | if len(self.labels) == 1: # only barcodes 163 | self.save_filtered_counts("barcodes", "count >= self.barcode_min_count") 164 | self.save_filter_stats() 165 | -------------------------------------------------------------------------------- /enrich2/barcodeid.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .seqlib import SeqLib 3 | from .barcode import BarcodeSeqLib 4 | from .barcodemap import BarcodeMap 5 | import pandas as pd 6 | from .plots import barcodemap_plot 7 | from matplotlib.backends.backend_pdf import PdfPages 8 | import os.path 9 | 10 | 11 | class BcidSeqLib(BarcodeSeqLib): 12 | """ 13 | Class for counting data from barcoded sequencing libraries with non-variant 14 | identifiers. 15 | Creating a :py:class:`BcidSeqLib` requires a valid *config* 16 | object with an ``'barcodes'`` entry and information. 17 | 18 | The ``barcode_map`` keyword argument can be used to pass an existing 19 | :py:class:`~seqlib.barcodemap.BarcodeMap`. Ensuring this is the 20 | right :py:class:`~seqlib.barcodemap.BarcodeMap` is the responsibility 21 | of the caller. 
22 | """ 23 | 24 | treeview_class_name = "Barcoded ID SeqLib" 25 | 26 | def __init__(self): 27 | BarcodeSeqLib.__init__(self) 28 | self.barcode_map = None 29 | self.identifier_min_count = 0 30 | self.add_label("identifiers") 31 | self.logger = logging.getLogger("{}.{}".format(__name__, self.__class__)) 32 | 33 | def configure(self, cfg, barcode_map=None): 34 | """ 35 | Set up the object using the config object *cfg*, usually derived from 36 | a ``.json`` file. 37 | """ 38 | BarcodeSeqLib.configure(self, cfg) 39 | self.logger = logging.getLogger( 40 | "{}.{} - {}".format(__name__, self.__class__.__name__, self.name) 41 | ) 42 | try: 43 | if "min count" in cfg["identifiers"]: 44 | self.identifier_min_count = int(cfg["identifiers"]["min count"]) 45 | 46 | if barcode_map is not None: 47 | if barcode_map.filename == cfg["barcodes"]["map file"]: 48 | self.barcode_map = barcode_map 49 | else: 50 | raise ValueError( 51 | "Attempted to assign non-matching barcode map [{}]".format( 52 | self.name 53 | ) 54 | ) 55 | else: 56 | self.barcode_map = BarcodeMap( 57 | cfg["barcodes"]["map file"], is_variant=False 58 | ) 59 | except KeyError as key: 60 | raise KeyError( 61 | "Missing required config value {key} [{name}]".format( 62 | key=key, name=self.name 63 | ) 64 | ) 65 | 66 | def serialize(self): 67 | """ 68 | Format this object (and its children) as a config object suitable for dumping to a config file. 69 | """ 70 | cfg = BarcodeSeqLib.serialize(self) 71 | 72 | cfg["identifiers"] = dict() 73 | if self.identifier_min_count > 0: 74 | cfg["identifiers"]["min count"] = self.identifier_min_count 75 | 76 | if self.barcode_map is not None: # required for creating new objects in GUI 77 | cfg["barcodes"]["map file"] = self.barcode_map.filename 78 | 79 | return cfg 80 | 81 | def calculate(self): 82 | """ 83 | Counts the barcodes using :py:meth:`BarcodeSeqLib.count` and combines them into 84 | identifier counts using the :py:class:`BarcodeMap`. 
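        For example, if two barcodes map to the same identifier and have
        counts of 10 and 5 after barcode-level filtering, that identifier's
        count is 15; identifiers whose totals fall below the configured
        ``min count`` are dropped before the counts are stored (numbers are
        illustrative).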
85 | """ 86 | if not self.check_store("/main/identifiers/counts"): 87 | BarcodeSeqLib.calculate(self) # count the barcodes 88 | df_dict = dict() 89 | barcode_identifiers = dict() 90 | 91 | self.logger.info("Converting barcodes to identifiers") 92 | # store mapped barcodes 93 | self.save_filtered_counts( 94 | "barcodes", 95 | "index in self.barcode_map.keys() & count >= self.barcode_min_count", 96 | ) 97 | 98 | # count identifiers associated with the barcodes 99 | for bc, count in self.store["/main/barcodes/counts"].iterrows(): 100 | count = count["count"] 101 | identifier = self.barcode_map[bc] 102 | try: 103 | df_dict[identifier] += count 104 | except KeyError: 105 | df_dict[identifier] = count 106 | barcode_identifiers[bc] = identifier 107 | 108 | # save counts, filtering based on the min count 109 | self.save_counts( 110 | "identifiers", 111 | { 112 | k: v 113 | for k, v in df_dict.items() 114 | if v >= self.identifier_min_count 115 | }, 116 | raw=False, 117 | ) 118 | del df_dict 119 | 120 | # write the active subset of the BarcodeMap to the store 121 | barcodes = list(barcode_identifiers.keys()) 122 | barcode_identifiers = pd.DataFrame( 123 | {"value": [barcode_identifiers[bc] for bc in barcodes]}, index=barcodes 124 | ) 125 | del barcodes 126 | barcode_identifiers.sort_values("value", inplace=True) 127 | self.store.put( 128 | "/raw/barcodemap", 129 | barcode_identifiers, 130 | data_columns=barcode_identifiers.columns, 131 | format="table", 132 | ) 133 | del barcode_identifiers 134 | 135 | # self.report_filter_stats() 136 | self.save_filter_stats() 137 | 138 | def make_plots(self): 139 | """ 140 | Make plots for :py:class:`~seqlib.seqlib.BcidSeqLib` objects. 141 | 142 | Creates plot of the number of barcodes mapping to each identifier. 143 | """ 144 | if self.plots_requested: 145 | SeqLib.make_plots(self) 146 | # open the PDF file 147 | pdf = PdfPages(os.path.join(self.plot_dir, "barcodes_per_identifier.pdf")) 148 | barcodemap_plot(self, pdf) 149 | pdf.close() 150 | -------------------------------------------------------------------------------- /enrich2/barcodemap.py: -------------------------------------------------------------------------------- 1 | import re 2 | import gzip 3 | import bz2 4 | import os.path 5 | 6 | re_barcode = re.compile("^[ACGT]+$") 7 | re_variant_dna = re.compile("^[ACGTN]+$") 8 | re_identifier = re.compile("^.+$") 9 | 10 | 11 | class BarcodeMap(dict): 12 | """ 13 | Dictionary-derived class for storing the relationship between barcodes 14 | (keys) and variants (values). Requires the path to a *mapfile*, containing 15 | lines in the format ``'barcodevariant'`` for each barcode 16 | expected in the library. This file can be plain text or compressed 17 | (``.bz2`` or ``.gz``). 18 | 19 | Barcodes must only contain the characters ``ACGT`` and variants must only 20 | contain the characters ``ACGTN`` (lowercase characters are converted to 21 | uppercase). 22 | 23 | Blank lines and lines that begin with ``#`` (comments) are ignored. 24 | 25 | *is_variant* is a boolean that is ``True`` if the barcodes are assigned to 26 | variant DNA sequences, or ``False`` if the barcodes are assigned to 27 | arbitrary identifiers. If this is ``True``, additional error checking 28 | is performed on the variant DNA sequences. 
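    For example, a small map file might contain (sequences are illustrative)::

        # comment lines like this one are ignored
        ACGTACGTACGTACGT    GATGCGTAAGCTCGTA
        TTTTACGTACGTACGT    GATGCGTAAGCTCGTC

    where the first column is the barcode and the second column is the variant
    DNA sequence (or an arbitrary identifier when *is_variant* is ``False``).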
29 | 30 | """ 31 | 32 | def __init__(self, mapfile, is_variant=False): 33 | super(BarcodeMap, self).__init__() 34 | self.name = "barcodemap_{}".format(os.path.basename(mapfile)) 35 | self.filename = mapfile 36 | self.is_variant = is_variant 37 | 38 | # open the file 39 | try: 40 | ext = os.path.splitext(mapfile)[-1].lower() 41 | if ext in (".bz2",): 42 | handle = bz2.BZ2File(mapfile, "r") 43 | elif ext in (".gz",): 44 | handle = gzip.GzipFile(mapfile, "r") 45 | else: 46 | handle = open(mapfile, "r") 47 | except IOError: 48 | raise IOError( 49 | "Could not open barcode map file '{}' [{}]".format(mapfile, self.name) 50 | ) 51 | 52 | # handle each line 53 | for line in handle: 54 | line = line.decode("utf-8") 55 | # skip comments and whitespace-only lines 56 | if len(line.strip()) == 0 or line[0] == "#": 57 | continue 58 | 59 | try: 60 | barcode, value = line.strip().split() 61 | except ValueError: 62 | raise ValueError( 63 | "Unexpected barcode map line format " "[{}]".format(self.name) 64 | ) 65 | 66 | barcode = barcode.upper() 67 | if not re_barcode.match(barcode): 68 | raise ValueError( 69 | "Barcode DNA sequence contains unexpected " 70 | "characters [{}]".format(self.name) 71 | ) 72 | if self.is_variant: 73 | value = value.upper() 74 | if not re_variant_dna.match(value): 75 | raise ValueError( 76 | "Variant DNA sequence contains unexpected" 77 | " characters [{}]".format(self.name) 78 | ) 79 | else: 80 | if not re_identifier.match(value): 81 | raise ValueError( 82 | "Identifier contains unexpected " 83 | "characters [{}]".format(self.name) 84 | ) 85 | 86 | if barcode in self: 87 | if self[barcode] != value: 88 | raise ValueError( 89 | "Barcode '{}' assigned to multiple " 90 | "unique values: {}".format(barcode, self.name) 91 | ) 92 | else: 93 | self[barcode] = value 94 | 95 | handle.close() 96 | -------------------------------------------------------------------------------- /enrich2/barcodevariant.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .seqlib import SeqLib 3 | from .variant import VariantSeqLib 4 | from .barcode import BarcodeSeqLib 5 | from .barcodemap import BarcodeMap 6 | import pandas as pd 7 | from .plots import barcodemap_plot 8 | from matplotlib.backends.backend_pdf import PdfPages 9 | import os.path 10 | 11 | 12 | class BcvSeqLib(VariantSeqLib, BarcodeSeqLib): 13 | """ 14 | Class for counting variant data from barcoded sequencing libraries. 15 | Creating a :py:class:`BcvSeqLib` requires a valid *config* 16 | object with an ``'barcodes'`` entry and information about the wild type 17 | sequence. 18 | 19 | The ``barcode_map`` keyword argument can be used to pass an existing 20 | :py:class:`~seqlib.barcodemap.BarcodeMap`. Ensuring this is the 21 | right :py:class:`~seqlib.barcodemap.BarcodeMap` is the responsibility 22 | of the caller. 23 | """ 24 | 25 | treeview_class_name = "Barcoded Variant SeqLib" 26 | 27 | def __init__(self): 28 | VariantSeqLib.__init__(self) 29 | BarcodeSeqLib.__init__(self) 30 | self.barcode_map = None 31 | self.logger = logging.getLogger("{}.{}".format(__name__, self.__class__)) 32 | 33 | def configure(self, cfg, barcode_map=None): 34 | """ 35 | Set up the object using the config object *cfg*, usually derived from 36 | a ``.json`` file. 
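        If *barcode_map* is supplied, its filename must match
        ``cfg["barcodes"]["map file"]`` (a mismatch raises ``ValueError``);
        if it is not supplied, a new
        :py:class:`~seqlib.barcodemap.BarcodeMap` is built from that file.
        A minimal ``"barcodes"`` section might look like (path is
        illustrative)::

            "barcodes": {"map file": "barcode_variant_map.txt.gz"}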
37 | """ 38 | VariantSeqLib.configure(self, cfg) 39 | BarcodeSeqLib.configure(self, cfg) 40 | self.logger = logging.getLogger( 41 | "{}.{} - {}".format(__name__, self.__class__.__name__, self.name) 42 | ) 43 | try: 44 | if barcode_map is not None: 45 | if barcode_map.filename == cfg["barcodes"]["map file"]: 46 | self.barcode_map = barcode_map 47 | else: 48 | raise ValueError( 49 | "Attempted to assign non-matching barcode map [{}]".format( 50 | self.name 51 | ) 52 | ) 53 | else: 54 | self.barcode_map = BarcodeMap( 55 | cfg["barcodes"]["map file"], is_variant=True 56 | ) 57 | except KeyError as key: 58 | raise KeyError( 59 | "Missing required config value {key} [{name}]".format( 60 | key=key, name=self.name 61 | ) 62 | ) 63 | 64 | def serialize(self): 65 | """ 66 | Format this object (and its children) as a config object suitable for dumping to a config file. 67 | """ 68 | cfg = VariantSeqLib.serialize(self) 69 | cfg.update(BarcodeSeqLib.serialize(self)) 70 | 71 | if self.barcode_map is not None: # required for creating new objects in GUI 72 | cfg["barcodes"]["map file"] = self.barcode_map.filename 73 | 74 | return cfg 75 | 76 | def calculate(self): 77 | """ 78 | Counts the barcodes using :py:meth:`BarcodeSeqLib.count` and combines them into 79 | variant counts using the :py:class:`BarcodeMap`. 80 | """ 81 | if not self.check_store("/main/variants/counts"): 82 | BarcodeSeqLib.calculate(self) # count the barcodes 83 | df_dict = dict() 84 | barcode_variants = dict() 85 | 86 | self.logger.info("Converting barcodes to variants") 87 | # store mapped barcodes 88 | self.save_filtered_counts( 89 | "barcodes", 90 | "index in self.barcode_map.keys() & count >= self.barcode_min_count", 91 | ) 92 | 93 | # count variants associated with the barcodes 94 | max_mut_barcodes = 0 95 | max_mut_variants = 0 96 | for bc, count in self.store["/main/barcodes/counts"].iterrows(): 97 | count = count["count"] 98 | variant = self.barcode_map[bc] 99 | mutations = self.count_variant(variant) 100 | if mutations is None: # variant has too many mutations 101 | max_mut_barcodes += 1 102 | max_mut_variants += count 103 | if self.report_filtered: 104 | self.report_filtered_variant(variant, count) 105 | else: 106 | try: 107 | df_dict[mutations] += count 108 | except KeyError: 109 | df_dict[mutations] = count 110 | barcode_variants[bc] = mutations 111 | 112 | # save counts, filtering based on the min count 113 | self.save_counts( 114 | "variants", 115 | {k: v for k, v in df_dict.items() if v >= self.variant_min_count}, 116 | raw=False, 117 | ) 118 | del df_dict 119 | 120 | # write the active subset of the BarcodeMap to the store 121 | barcodes = list(barcode_variants.keys()) 122 | barcode_variants = pd.DataFrame( 123 | {"value": [barcode_variants[bc] for bc in barcodes]}, index=barcodes 124 | ) 125 | del barcodes 126 | barcode_variants.sort_values("value", inplace=True) 127 | self.store.put( 128 | "/raw/barcodemap", 129 | barcode_variants, 130 | data_columns=barcode_variants.columns, 131 | format="table", 132 | ) 133 | del barcode_variants 134 | 135 | if self.aligner is not None: 136 | self.logger.info("Aligned {} variants".format(self.aligner.calls)) 137 | self.aligner_cache = None 138 | # self.report_filter_stats() 139 | self.logger.info( 140 | "Removed {} unique barcodes ({} total variants) " 141 | "with excess mutations".format(max_mut_barcodes, max_mut_variants) 142 | ) 143 | self.save_filter_stats() 144 | 145 | self.count_synonymous() 146 | 147 | def make_plots(self): 148 | """ 149 | Make plots for 
:py:class:`~seqlib.seqlib.BcvSeqLib` objects. 150 | 151 | Creates plot of the number of barcodes mapping to each variant. 152 | """ 153 | if self.plots_requested: 154 | SeqLib.make_plots(self) 155 | # open the PDF file 156 | pdf = PdfPages(os.path.join(self.plot_dir, "barcodes_per_variant.pdf")) 157 | barcodemap_plot(self, pdf) 158 | pdf.close() 159 | -------------------------------------------------------------------------------- /enrich2/basic.py: -------------------------------------------------------------------------------- 1 | from .variant import VariantSeqLib 2 | from fqfa import open_compressed, parse_fastq_reads, has_fastq_ext 3 | import logging 4 | import sys 5 | 6 | 7 | class BasicSeqLib(VariantSeqLib): 8 | """ 9 | Class for count data from sequencing libraries with a single read for 10 | each variant. Creating a :py:class:`BasicSeqLib` requires a valid 11 | *config* object, usually from a ``.json`` configuration file. 12 | """ 13 | 14 | treeview_class_name = "Basic SeqLib" 15 | 16 | def __init__(self): 17 | VariantSeqLib.__init__(self) 18 | self.reads = None 19 | self.reverse_complement_reads = False 20 | self.trim_start = 1 21 | self.trim_length = sys.maxsize 22 | self.logger = logging.getLogger("{}.{}".format(__name__, self.__class__)) 23 | 24 | def configure(self, cfg): 25 | """ 26 | Set up the object using the config object *cfg*, usually derived from 27 | a ``.json`` file. 28 | """ 29 | VariantSeqLib.configure(self, cfg) 30 | self.logger = logging.getLogger( 31 | "{}.{} - {}".format(__name__, self.__class__.__name__, self.name) 32 | ) 33 | 34 | # if counts are specified, copy them later 35 | # else handle the FASTQ config options and check the files 36 | if self.counts_file is None: 37 | self.configure_fastq(cfg) 38 | try: 39 | if not has_fastq_ext(self.reads): 40 | raise IOError( 41 | "FASTQ file error: unrecognized extension " 42 | "[{}]".format(self.name) 43 | ) 44 | except IOError as fqerr: 45 | raise IOError("FASTQ file error [{}]: {}".format(self.name, fqerr)) 46 | 47 | def serialize(self): 48 | """ 49 | Format this object (and its children) as a config object suitable for 50 | dumping to a config file. 51 | """ 52 | cfg = VariantSeqLib.serialize(self) 53 | 54 | cfg["fastq"] = self.serialize_fastq() 55 | 56 | return cfg 57 | 58 | def configure_fastq(self, cfg): 59 | """ 60 | Set up the object's FASTQ_ file handling and filtering options. 61 | """ 62 | try: 63 | self.reads = cfg["fastq"]["reads"] 64 | 65 | if "reverse" in cfg["fastq"]: 66 | self.reverse_complement_reads = cfg["fastq"]["reverse"] 67 | 68 | if "start" in cfg["fastq"]: 69 | self.trim_start = cfg["fastq"]["start"] 70 | 71 | if "length" in cfg["fastq"]: 72 | self.trim_length = cfg["fastq"]["length"] 73 | 74 | self.filters = cfg["fastq"]["filters"] 75 | except KeyError as key: 76 | raise KeyError( 77 | "Missing required config value {key} [{name}]" 78 | "".format(key=key, name=self.name) 79 | ) 80 | 81 | def serialize_fastq(self): 82 | """ 83 | Serialize this object's FASTQ_ file handling and filtering options. 
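        When the FASTQ_ file is read, ``"start"`` and ``"length"`` are applied
        as a trim; for example, ``"start": 3`` with ``"length": 10`` keeps
        bases 3 through 12 (1-based, inclusive) of each read (illustrative
        values).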
84 | """ 85 | fastq = {"filters": self.serialize_filters()} 86 | fastq["reads"] = self.reads 87 | 88 | if self.reverse_complement_reads: 89 | fastq["reverse"] = True 90 | else: 91 | fastq["reverse"] = False 92 | 93 | if self.trim_start > 1: 94 | fastq["start"] = self.trim_start 95 | 96 | if self.trim_length < sys.maxsize: 97 | fastq["length"] = self.trim_length 98 | 99 | return fastq 100 | 101 | def counts_from_reads(self): 102 | """ 103 | Reads the forward or reverse FASTQ_ file (reverse reads are 104 | reverse-complemented), performs quality-based filtering, and counts 105 | the variants. 106 | """ 107 | df_dict = dict() 108 | 109 | self.logger.info("Counting variants") 110 | max_mut_variants = 0 111 | with open_compressed(self.reads) as handle: 112 | for fq in parse_fastq_reads(handle): 113 | fq.trim(start=self.trim_start, end=self.trim_start + self.trim_length -1) 114 | if self.reverse_complement_reads: 115 | fq.reverse_complement() 116 | 117 | if self.read_quality_filter(fq): 118 | mutations = self.count_variant(fq.sequence) 119 | if mutations is None: # too many mutations 120 | max_mut_variants += 1 121 | if self.report_filtered: 122 | self.report_filtered_variant(fq.sequence, 1) 123 | else: 124 | try: 125 | df_dict[mutations] += 1 126 | except KeyError: 127 | df_dict[mutations] = 1 128 | 129 | self.save_counts("variants", df_dict, raw=True) 130 | del df_dict 131 | 132 | if self.aligner is not None: 133 | self.logger.info("Aligned {} variants".format(self.aligner.calls)) 134 | self.aligner_cache = None 135 | self.logger.info( 136 | "Removed {} total variants with excess mutations" 137 | "".format(max_mut_variants) 138 | ) 139 | self.save_filter_stats() 140 | 141 | def calculate(self): 142 | """ 143 | Counts variants from counts file or FASTQ. 144 | """ 145 | if not self.check_store("/main/variants/counts"): 146 | if not self.check_store("/raw/variants/counts"): 147 | if self.counts_file is not None: 148 | self.counts_from_file(self.counts_file) 149 | else: 150 | self.counts_from_reads() 151 | self.save_filtered_counts("variants", "count >= self.variant_min_count") 152 | self.count_synonymous() 153 | -------------------------------------------------------------------------------- /enrich2/condition.py: -------------------------------------------------------------------------------- 1 | from .storemanager import StoreManager 2 | from .selection import Selection 3 | 4 | 5 | class Condition(StoreManager): 6 | """ 7 | Dummy class for experimental conditions within an 8 | :py:class:`~experiment.Experiment`. Required for proper GUI behavior. 9 | """ 10 | 11 | has_store = False # don't create an HDF5 for Conditions 12 | treeview_class_name = "Condition" 13 | 14 | def __init__(self): 15 | StoreManager.__init__(self) 16 | self.selections = list() 17 | 18 | def configure(self, cfg, configure_children=True): 19 | StoreManager.configure(self, cfg) 20 | if configure_children: 21 | if "selections" not in cfg: 22 | raise KeyError( 23 | "Missing required config value {} [{}]".format( 24 | "selections", self.name 25 | ) 26 | ) 27 | 28 | for sel_cfg in cfg["selections"]: 29 | sel = Selection() 30 | sel.configure(sel_cfg) 31 | self.add_child(sel) 32 | 33 | def serialize(self): 34 | """ 35 | Format this object (and its children) as a config object suitable for dumping to a config file. 36 | """ 37 | cfg = StoreManager.serialize(self) 38 | cfg["selections"] = [child.serialize() for child in self.children] 39 | return cfg 40 | 41 | def validate(self): 42 | """ 43 | Calls validate on all child Selections. 
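        Any exception raised by a child's ``validate`` propagates to the
        caller.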
44 | """ 45 | for child in self.children: 46 | child.validate() 47 | 48 | def _children(self): 49 | """ 50 | Method bound to the ``children`` property. Returns a list of all 51 | :py:class:`~selection.Selection` objects belonging to this object, 52 | sorted by name. 53 | """ 54 | return sorted(self.selections, key=lambda x: x.name) 55 | 56 | def add_child(self, child): 57 | """ 58 | Add a :py:class:`~selection.Selection`. 59 | """ 60 | if child.name in self.child_names(): 61 | raise ValueError( 62 | "Non-unique selection name '{}' [{}]".format(child.name, self.name) 63 | ) 64 | child.parent = self 65 | self.selections.append(child) 66 | 67 | def remove_child_id(self, tree_id): 68 | """ 69 | Remove the reference to a :py:class:`~selection.Selection` with 70 | Treeview id *tree_id*. 71 | """ 72 | self.selections = [x for x in self.selections if x.treeview_id != tree_id] 73 | -------------------------------------------------------------------------------- /enrich2/config_check.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for identifying the type of 3 | :py:class:`~enrich2.storemanager.StoreManager` derived object associated with a 4 | given configuration object (decoded from a JSON file as described `here 5 | `_). 6 | 7 | """ 8 | 9 | 10 | def is_experiment(cfg): 11 | """ 12 | Check if the given configuration object specifies an 13 | :py:class:`~enrich2.experiment.Experiment`. 14 | 15 | Args: 16 | cfg (dict): decoded JSON object 17 | 18 | Returns: 19 | bool: True if `cfg` if specifies an 20 | :py:class:`~enrich2.experiment.Experiment`, else False. 21 | 22 | """ 23 | if "conditions" in list(cfg.keys()): 24 | return True 25 | else: 26 | return False 27 | 28 | 29 | def is_condition(cfg): 30 | """ 31 | Check if the given configuration object specifies a 32 | :py:class:`~enrich2.condition.Condition`. 33 | 34 | Args: 35 | cfg (dict): decoded JSON object 36 | 37 | Returns: 38 | bool: True if `cfg` if specifies a 39 | :py:class:`~enrich2.condition.Condition`, else False. 40 | 41 | """ 42 | if "selections" in list(cfg.keys()): 43 | return True 44 | else: 45 | return False 46 | 47 | 48 | def is_selection(cfg): 49 | """ 50 | Check if the given configuration object specifies a 51 | :py:class:`~enrich2.selection.Selection`. 52 | 53 | Args: 54 | cfg (dict): decoded JSON object 55 | 56 | Returns: 57 | bool: True if `cfg` if specifies a 58 | :py:class:`~enrich2.selection.Selection`, else False. 59 | 60 | """ 61 | if "libraries" in list(cfg.keys()): 62 | return True 63 | else: 64 | return False 65 | 66 | 67 | def is_seqlib(cfg): 68 | """ 69 | Check if the given configuration object specifies a 70 | :py:class:`~enrich2.seqlib.SeqLib` derived object. 71 | 72 | Args: 73 | cfg (dict): decoded JSON object 74 | 75 | Returns: 76 | bool: True if `cfg` if specifies a :py:class:`~enrich2.seqlib.SeqLib` 77 | derived object, else False. 78 | 79 | """ 80 | if "fastq" in list(cfg.keys()) or "identifiers" in list(cfg.keys()): 81 | return True 82 | else: 83 | return False 84 | 85 | 86 | def seqlib_type(cfg): 87 | """ 88 | Get the type of :py:class:`~enrich2.seqlib.SeqLib` derived object 89 | specified by the configuration object. 90 | 91 | Args: 92 | cfg (dict): decoded JSON object 93 | 94 | Returns: 95 | str: The class name of the :py:class:`~seqlib.seqlib.SeqLib` derived 96 | object specified by `cfg`. 97 | 98 | Raises: 99 | ValueError: If the class name cannot be determined. 
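    Examples (key values elided; the mapping follows the checks below)::

        {"barcodes": {"map file": ...}, "variants": {...}}  ->  "BcvSeqLib"
        {"fastq": {...}, "variants": {...}}                 ->  "BasicSeqLib"
        {"fastq": {...}, "identifiers": {...}}              ->  "IdOnlySeqLib"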
100 | 101 | """ 102 | if "barcodes" in cfg: 103 | if "map file" in cfg["barcodes"]: 104 | if "variants" in cfg and "identifiers" in cfg: 105 | raise ValueError("Unable to determine SeqLib type.") 106 | elif "variants" in cfg: 107 | return "BcvSeqLib" 108 | elif "identifiers" in cfg: 109 | return "BcidSeqLib" 110 | else: 111 | raise ValueError("Unable to determine SeqLib type.") 112 | else: 113 | return "BarcodeSeqLib" 114 | elif "overlap" in cfg and "variants" in cfg: 115 | return "OverlapSeqLib" 116 | elif "variants" in cfg: 117 | return "BasicSeqLib" 118 | elif "identifiers" in cfg: 119 | return "IdOnlySeqLib" 120 | else: 121 | raise ValueError("Unable to determine SeqLib type for configuration " "object.") 122 | 123 | 124 | def element_type(cfg): 125 | """ 126 | Get the type of :py:class:`~enrich2.storemanager.StoreManager` derived 127 | object specified by the configuration object. 128 | 129 | Args: 130 | cfg (dict): decoded JSON object 131 | 132 | Returns: 133 | str: The class name of the 134 | :py:class:`~enrich2.storemanager.StoreManager` derived object specified 135 | by `cfg`. 136 | 137 | Raises: 138 | ValueError: If the class name cannot be determined. 139 | 140 | """ 141 | if is_experiment(cfg): 142 | return "Experiment" 143 | elif is_condition(cfg): 144 | return "Condition" 145 | elif is_selection(cfg): 146 | return "Selection" 147 | elif is_seqlib(cfg): 148 | return seqlib_type(cfg) 149 | else: 150 | raise ValueError("Unable to determine type for configuration object.") 151 | -------------------------------------------------------------------------------- /enrich2/constants.py: -------------------------------------------------------------------------------- 1 | #: Variant string for counting wild type sequences 2 | WILD_TYPE_VARIANT = "_wt" 3 | 4 | 5 | #: Variant string for synonymous variants in 'synonymous' DataFrame 6 | SYNONYMOUS_VARIANT = "_sy" 7 | 8 | 9 | #: Standard codon table for translating wild type and variant DNA sequences 10 | CODON_TABLE = { 11 | "TTT": "F", 12 | "TCT": "S", 13 | "TAT": "Y", 14 | "TGT": "C", 15 | "TTC": "F", 16 | "TCC": "S", 17 | "TAC": "Y", 18 | "TGC": "C", 19 | "TTA": "L", 20 | "TCA": "S", 21 | "TAA": "*", 22 | "TGA": "*", 23 | "TTG": "L", 24 | "TCG": "S", 25 | "TAG": "*", 26 | "TGG": "W", 27 | "CTT": "L", 28 | "CCT": "P", 29 | "CAT": "H", 30 | "CGT": "R", 31 | "CTC": "L", 32 | "CCC": "P", 33 | "CAC": "H", 34 | "CGC": "R", 35 | "CTA": "L", 36 | "CCA": "P", 37 | "CAA": "Q", 38 | "CGA": "R", 39 | "CTG": "L", 40 | "CCG": "P", 41 | "CAG": "Q", 42 | "CGG": "R", 43 | "ATT": "I", 44 | "ACT": "T", 45 | "AAT": "N", 46 | "AGT": "S", 47 | "ATC": "I", 48 | "ACC": "T", 49 | "AAC": "N", 50 | "AGC": "S", 51 | "ATA": "I", 52 | "ACA": "T", 53 | "AAA": "K", 54 | "AGA": "R", 55 | "ATG": "M", 56 | "ACG": "T", 57 | "AAG": "K", 58 | "AGG": "R", 59 | "GTT": "V", 60 | "GCT": "A", 61 | "GAT": "D", 62 | "GGT": "G", 63 | "GTC": "V", 64 | "GCC": "A", 65 | "GAC": "D", 66 | "GGC": "G", 67 | "GTA": "V", 68 | "GCA": "A", 69 | "GAA": "E", 70 | "GGA": "G", 71 | "GTG": "V", 72 | "GCG": "A", 73 | "GAG": "E", 74 | "GGG": "G", 75 | } 76 | 77 | 78 | #: Conversions between single- and three-letter amino acid codes 79 | AA_CODES = { 80 | "Ala": "A", 81 | "A": "Ala", 82 | "Arg": "R", 83 | "R": "Arg", 84 | "Asn": "N", 85 | "N": "Asn", 86 | "Asp": "D", 87 | "D": "Asp", 88 | "Cys": "C", 89 | "C": "Cys", 90 | "Glu": "E", 91 | "E": "Glu", 92 | "Gln": "Q", 93 | "Q": "Gln", 94 | "Gly": "G", 95 | "G": "Gly", 96 | "His": "H", 97 | "H": "His", 98 | "Ile": "I", 99 | "I": "Ile", 100 | "Leu": "L", 
101 | "L": "Leu", 102 | "Lys": "K", 103 | "K": "Lys", 104 | "Met": "M", 105 | "M": "Met", 106 | "Phe": "F", 107 | "F": "Phe", 108 | "Pro": "P", 109 | "P": "Pro", 110 | "Ser": "S", 111 | "S": "Ser", 112 | "Thr": "T", 113 | "T": "Thr", 114 | "Trp": "W", 115 | "W": "Trp", 116 | "Tyr": "Y", 117 | "Y": "Tyr", 118 | "Val": "V", 119 | "V": "Val", 120 | "Ter": "*", 121 | "*": "Ter", 122 | "???": "?", 123 | "?": "???", 124 | } 125 | -------------------------------------------------------------------------------- /enrich2/dataframe.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import logging 4 | from .constants import WILD_TYPE_VARIANT 5 | import collections 6 | from .variant import mutation_count, re_protein, re_coding, re_noncoding 7 | from .barcodemap import re_barcode, re_identifier 8 | from .constants import AA_CODES 9 | from .storemanager import ELEMENT_LABELS 10 | from .sfmap import AA_LIST, NT_LIST 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | SingleMut = collections.namedtuple("SingleMut", ["pre", "post", "pos", "key"]) 15 | 16 | 17 | def validate_index(index, element): 18 | """ 19 | Return a boolean list for which index values are valid for the given 20 | element type. 21 | """ 22 | if element not in ELEMENT_LABELS: 23 | raise ValueError("Invalid element label '{}'".format(element)) 24 | 25 | if element == "barcodes": 26 | retval = [re_barcode.match(x) is not None for x in index] 27 | elif element == "identifiers": 28 | retval = [re_identifier.match(x) is not None for x in index] 29 | elif element == "variants": 30 | pass 31 | elif element == "synonymous": 32 | pass 33 | else: 34 | raise NotImplementedError("Unimplemented element type '{}'" "".format(element)) 35 | 36 | 37 | def single_mutation_index(index): 38 | """ 39 | Return a filtered pandas Index containing only single mutations. Filtering 40 | also removes unrecognized amino acids (denoted by ``"???"``) caused by 41 | some indels. 42 | 43 | *index* the index to be filtered for single mutations. 44 | """ 45 | return pd.Index(x for x in index if mutation_count(x) == 1) 46 | 47 | 48 | def filter_coding_index(index): 49 | """ 50 | Return a filtered pandas Index with any unrecognized amino acids (denoted 51 | by ``"???"``) removed. These are caused by some frame shift mutations. 52 | 53 | *index* the index to be filtered. 54 | """ 55 | return pd.Index(x for x in index if "???" not in x) 56 | 57 | 58 | def single_mutations_to_tuples(index): 59 | """ 60 | Return a list of SingleMut namedtuples for each single mutation in the 61 | *index*. The type of index (noncoding DNA, coding DNA, or protein) is 62 | automatically detected. 63 | 64 | Position value in the tuple is stored as an integer. 65 | 66 | If the *index* is a protein index, the amino acids are referred to by 67 | single-letter codes not three-letter codes. 68 | 69 | *index* is the index to convert to SingleMut tuples. 70 | 71 | Raises a ValueError if non-single mutations are included in *index*. 72 | 73 | Raises a ValueError if one of the *index* entries cannot be parsed. 74 | 75 | Raises an IndexError if the *index* is empty. 76 | """ 77 | if any(mutation_count(x) != 1 for x in index): 78 | raise ValueError( 79 | "Non-single mutations cannot be converted into " "SingleMut tuples." 
80 | ) 81 | 82 | # identify the type of index 83 | try: 84 | if re_noncoding.match(index[0]): 85 | is_protein = False 86 | expression = re_noncoding 87 | elif re_coding.match(index[0]): 88 | is_protein = False 89 | expression = re_coding 90 | elif re_protein.match(index[0]): 91 | is_protein = True 92 | expression = re_protein 93 | else: 94 | raise ValueError("Unrecognized HGVS string.") 95 | except IndexError: 96 | raise IndexError("Cannot convert empty index to tuples.") 97 | 98 | # perform the regular expression matches and create the SingleMut tuples 99 | tuples = list() 100 | for x in index: 101 | m = expression.match(x) 102 | if m is None: 103 | raise ValueError("Unrecognized HGVS string.") 104 | else: 105 | if is_protein: # convert to single-letter amino acid code 106 | tuples.append( 107 | SingleMut( 108 | AA_CODES[m.group("pre")], 109 | AA_CODES[m.group("post")], 110 | int(m.group("pos")), 111 | m.group("match"), 112 | ) 113 | ) 114 | else: 115 | tuples.append( 116 | SingleMut( 117 | m.group("pre"), 118 | m.group("post"), 119 | int(m.group("pos")), 120 | m.group("match"), 121 | ) 122 | ) 123 | 124 | return tuples 125 | 126 | 127 | def fill_position_gaps(positions, gap_size): 128 | """ 129 | Create a list of integer positions with gaps filled in. Used by 130 | :py:func:`singleton_dataframe`. 131 | 132 | Args: 133 | positions (list): integer positions 134 | gap_size (int): maximum length of gap that will be filled 135 | 136 | Returns: 137 | list: sorted list of unique integer positions with gaps filled 138 | """ 139 | if len(positions) == 0: 140 | raise ValueError("Empty positions list.") 141 | 142 | # uniqify and sort 143 | positions = sorted(list(set(positions))) 144 | 145 | # fill in short gaps 146 | fill = set() 147 | for i in range(len(positions) - 1): 148 | delta = positions[i + 1] - positions[i] 149 | if delta > 1 and delta <= gap_size: 150 | fill.update(positions[i] + n + 1 for n in range(delta)) 151 | fill.update(positions) 152 | 153 | return sorted(list(fill)) 154 | 155 | 156 | def singleton_dataframe( 157 | values, wt, gap_size=5, coding=True, plot_wt_score=True, aa_list=AA_LIST 158 | ): 159 | """ 160 | Prepare data for plotting as a sequence-function map. Returns a data frame 161 | suitable for plotting as heat map data and a wild type sequence extracted 162 | from the variant information. 163 | 164 | The type of variants stored is automatically detected, and the index will 165 | be filtered for single mutations. 166 | 167 | The data frame has amino acids or nucleotides as columns and positions with 168 | rows. If there are no mutations at a given position, it will not appear in 169 | the data frame unless this gap is filled with rows containing no data. The 170 | wild type sequence entry for these rows will be blank. 
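    A minimal sketch of typical use, assuming ``scores`` is a |pd_Series| of
    variant scores and ``wt`` is the corresponding wild type object::

        frame, wt_seq = singleton_dataframe(scores, wt, coding=True)
        # frame: rows are positions, columns are amino acids
        # wt_seq: single-letter wild type sequence for those positions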
171 | 172 | Args: 173 | values (|pd_Series|): data values (typically scores or counts) 174 | 175 | wt (WildTypeSequence): wild type for the data 176 | 177 | gap_size (int): maximum length of missing data gap that will be filled 178 | 179 | coding (bool): True for amino acid data, False for nucleotide 180 | 181 | plot_wt_score (bool): True if the wild type positions should have the 182 | wild type score, False if they should be missing 183 | 184 | Returns: 185 | tuple: two-element tuple containing a |pd_DataFrame| filled with the 186 | data values and a list of single-character wild type values 187 | """ 188 | if len(values.index) == 0: 189 | raise ValueError( 190 | "Cannot process an empty data frame [{}]".format(wt.parent_name) 191 | ) 192 | 193 | # save the wild type score for later 194 | if plot_wt_score: 195 | try: 196 | wt_score = values[WILD_TYPE_VARIANT] 197 | except KeyError: 198 | logger.warning("Wild type score not measured, will be missing in " "plots") 199 | wt_score = np.nan 200 | 201 | # select only rows with singleton mutations 202 | values = values[filter_coding_index(single_mutation_index(values.index))] 203 | 204 | # parse out the information from the index 205 | index_tuples = single_mutations_to_tuples(values.index) 206 | 207 | # create and populate the DataFrame 208 | # get sorted, unique list of positions that have a mutation 209 | positions = fill_position_gaps([x.pos for x in index_tuples], gap_size=gap_size) 210 | # initialize the DataFrame 211 | if coding: 212 | columns = aa_list 213 | else: 214 | columns = NT_LIST 215 | frame = pd.DataFrame(np.nan, columns=columns, index=positions) 216 | # populate the DataFrame 217 | for x in index_tuples: 218 | frame.loc[x.pos, x.post] = values.loc[x.key] 219 | 220 | # create a dictionary of position->nucleotide/amino acid 221 | wt_dict = dict(wt.position_tuples(protein=coding)) 222 | 223 | # convert subset of the wild type dictionary into sequence 224 | try: 225 | wt_sequence = "".join(wt_dict[x] for x in positions) 226 | except KeyError: 227 | raise ValueError("Inconsistent wild type positions [{}]".format(wt.parent_name)) 228 | 229 | # double-check that the wild type is consistent with the data frame 230 | for x in index_tuples: 231 | if x.pos in wt_dict: 232 | if x.pre != wt_dict[x.pos]: 233 | raise ValueError( 234 | "Inconsistent wild type sequence [{}]".format(wt.parent_name) 235 | ) 236 | 237 | # add wild type scores if desired 238 | if plot_wt_score: 239 | for p in positions: 240 | frame.loc[p, wt_dict[p]] = wt_score 241 | 242 | return (frame, wt_sequence) 243 | -------------------------------------------------------------------------------- /enrich2/fastqheader.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # Matches FASTQ headers based on the following pattern (modify as needed): 4 | # @:::::: ::: 5 | 6 | # Example: @M02564:876:000000000-L3775:1:1101:16862:1800 1:N:0:TCACTCGA+TAACGGTT 7 | # Sample number contains indexes if they are present. 
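# In this format, the Y/N field after the read number is the filter (chastity) flag:
# 'Y' means the read was filtered out, 'N' means it passed filtering.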
8 | 9 | # See: https://help.basespace.illumina.com/files-used-by-basespace/fastq-files 10 | 11 | # Note: this regex is currently unused since it is not needed to support the legacy chastity filtering feature 12 | new_header_pattern = re.compile( 13 | r""" 14 | @(?P[^:]+): 15 | (?P\d+): 16 | (?P[^:]+): 17 | (?P\d+): 18 | (?P\d+): 19 | (?P\d+): 20 | (?P\d+) 21 | \s 22 | (?P\d+): 23 | (?P[YN]): 24 | (?P[^:]+): 25 | (?P[^:]+) 26 | """, 27 | re.VERBOSE, 28 | ) 29 | 30 | # Matches FASTQ headers based on the following pattern (modify as needed): 31 | # @:::::#/ 32 | old_header_pattern = re.compile( 33 | r""" 34 | @(?P.+): 35 | (?P\d+): 36 | (?P\d+): 37 | (?P\d+): 38 | (?P\d+): 39 | (?P[01])# 40 | (?P\d)/ 41 | (?P\d) 42 | """, 43 | re.VERBOSE, 44 | ) 45 | 46 | def parse_fastq_header(fq, pattern=old_header_pattern): 47 | """Parse the read's FASTQ_ header and return key-value pairs. 48 | 49 | Parses the first FASTQ_ header (@ header) and returns a dictionary. 50 | Dictionary keys are the named groups in the regular expression 51 | *pattern*. Unnamed matches are ignored. Integer values are converted 52 | from strings to integers. 53 | 54 | The default pattern matches a header in the format:: 55 | 56 | @:::::#/ 57 | 58 | """ 59 | match = pattern.match(fq.header) 60 | if match is None: 61 | return None 62 | else: 63 | header_dict = match.groupdict() 64 | for key in header_dict: 65 | if header_dict[key].isdigit(): 66 | header_dict[key] = int(header_dict[key]) 67 | return header_dict 68 | 69 | 70 | def fastq_read_is_chaste(self, raises=True): 71 | """ 72 | Returns ``True`` if the chastity bit is set in the header. The 73 | regular experession used by :py:meth:`header_information` must 74 | include a ``'Chastity'`` match that equals ``1`` if the read is 75 | chaste. 76 | 77 | If ``raises`` is ``True``, raises an informative error if the 78 | chastity information in the header is not found. Otherwise, a 79 | read without chastity information is treated as unchaste. 
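    For example, an old-style header such as
    ``@HWUSI-EAS100R:6:73:941:1973:1#0/1`` (illustrative) carries a chastity
    field of ``1``, so the read is considered chaste; a value of ``0`` would
    mark it as unchaste.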
80 | """ 81 | try: 82 | if self.header_information()["Chastity"] == 1: 83 | return True 84 | else: 85 | return False 86 | except KeyError: # no 'Chastity' in pattern 87 | if raises: 88 | raise KeyError("No chastity bit in FASTQ header pattern") 89 | else: 90 | return False 91 | except TypeError: # no header match (unexpected format) 92 | if raises: 93 | raise ValueError("Unexpected FASTQ header format") 94 | else: 95 | return False 96 | -------------------------------------------------------------------------------- /enrich2/gui/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FowlerLab/Enrich2/bb31cfd60d0128b75f7dd028ee0e5f1f90a05996/enrich2/gui/__init__.py -------------------------------------------------------------------------------- /enrich2/gui/create_root_dialog.py: -------------------------------------------------------------------------------- 1 | 2 | import tkinter as tk 3 | import tkinter.ttk 4 | import tkinter.simpledialog 5 | from .dialog_elements import FileEntry, StringEntry, DEFAULT_COLUMNS 6 | from .create_seqlib_dialog import SEQLIB_LABEL_TEXT 7 | from ..barcode import BarcodeSeqLib 8 | from ..barcodevariant import BcvSeqLib 9 | from ..barcodeid import BcidSeqLib 10 | from ..basic import BasicSeqLib 11 | from ..idonly import IdOnlySeqLib 12 | from ..overlap import OverlapSeqLib 13 | from ..selection import Selection 14 | from ..experiment import Experiment 15 | 16 | 17 | #: map class names to class definitions to avoid use of globals() 18 | ELEMENT_CLASSES = { 19 | "BarcodeSeqLib": BarcodeSeqLib, 20 | "BcvSeqLib": BcvSeqLib, 21 | "BcidSeqLib": BcidSeqLib, 22 | "BasicSeqLib": BasicSeqLib, 23 | "IdOnlySeqLib": IdOnlySeqLib, 24 | "OverlapSeqLib": OverlapSeqLib, 25 | "Selection": Selection, 26 | "Experiment": Experiment, 27 | } 28 | 29 | 30 | class CreateRootDialog(tkinter.simpledialog.Dialog): 31 | """ 32 | Dialog box for creating a new root element. 
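    The user chooses Experiment, Selection, or one of the SeqLib types; the
    corresponding class is looked up in ``ELEMENT_CLASSES`` when the dialog is
    applied, and the new object's name and output directory are taken from the
    dialog fields.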
33 | """ 34 | 35 | def __init__(self, parent_window, title="Create Root Object"): 36 | self.element_tkstring = tk.StringVar() 37 | self.cfg_dict = dict() 38 | self.output_directory_tk = FileEntry( 39 | "Output Directory", 40 | self.cfg_dict, 41 | "output directory", 42 | optional=False, 43 | directory=True, 44 | ) 45 | self.name_tk = StringEntry("Name", self.cfg_dict, "name", optional=False) 46 | self.element = None 47 | tkinter.simpledialog.Dialog.__init__(self, parent_window, title) 48 | 49 | def body(self, master): 50 | row_no = self.name_tk.body(master, 0) 51 | row_no += self.output_directory_tk.body(master, row_no) 52 | 53 | element_types = tkinter.ttk.Frame(master, padding=(3, 3, 12, 12)) 54 | element_types.grid( 55 | column=0, row=row_no, sticky="nsew", columnspan=DEFAULT_COLUMNS 56 | ) 57 | 58 | message = tkinter.ttk.Label(element_types, text="Root object type:") 59 | message.grid(column=0, row=0) 60 | 61 | label = tkinter.ttk.Label(element_types, text="Experiment") 62 | label.grid(column=0, row=1, sticky="w") 63 | rb = tkinter.ttk.Radiobutton( 64 | element_types, 65 | text="Experiment", 66 | variable=self.element_tkstring, 67 | value="Experiment", 68 | ) 69 | rb.grid(column=0, row=2, sticky="w") 70 | rb.invoke() 71 | 72 | label = tkinter.ttk.Label(element_types, text="Selection") 73 | label.grid(column=0, row=3, sticky="w") 74 | rb = tkinter.ttk.Radiobutton( 75 | element_types, 76 | text="Selection", 77 | variable=self.element_tkstring, 78 | value="Selection", 79 | ) 80 | rb.grid(column=0, row=4, sticky="w") 81 | 82 | label = tkinter.ttk.Label(element_types, text="SeqLib") 83 | label.grid(column=0, row=5, sticky="w") 84 | for i, k in enumerate(SEQLIB_LABEL_TEXT.keys()): 85 | rb = tkinter.ttk.Radiobutton( 86 | element_types, 87 | text=SEQLIB_LABEL_TEXT[k], 88 | variable=self.element_tkstring, 89 | value=k, 90 | ) 91 | rb.grid(column=0, row=(i + 6), sticky="w") 92 | 93 | def buttonbox(self): 94 | """ 95 | Display only one button. 
96 | """ 97 | box = tk.Frame(self) 98 | 99 | w = tk.Button(box, text="OK", width=10, command=self.ok, default="active") 100 | w.pack(side="left", padx=5, pady=5) 101 | 102 | self.bind("", self.ok) 103 | 104 | box.pack() 105 | 106 | def validate(self): 107 | # check the fields 108 | return self.output_directory_tk.validate() and self.name_tk.validate() 109 | 110 | def apply(self): 111 | # apply the fields 112 | self.output_directory_tk.apply() 113 | self.name_tk.apply() 114 | 115 | # create the object 116 | try: 117 | self.element = ELEMENT_CLASSES[self.element_tkstring.get()]() 118 | except KeyError: 119 | raise KeyError( 120 | "Unrecognized element type '{}'".format(self.element_tkstring.get()) 121 | ) 122 | 123 | # set the properties from this dialog 124 | self.element.output_dir_override = False 125 | self.element.output_dir = self.cfg_dict["output directory"] 126 | self.element.name = self.cfg_dict["name"] 127 | -------------------------------------------------------------------------------- /enrich2/gui/create_seqlib_dialog.py: -------------------------------------------------------------------------------- 1 | 2 | import tkinter as tk 3 | import tkinter.ttk 4 | import tkinter.simpledialog 5 | from collections import OrderedDict 6 | from ..barcode import BarcodeSeqLib 7 | from ..barcodevariant import BcvSeqLib 8 | from ..barcodeid import BcidSeqLib 9 | from ..basic import BasicSeqLib 10 | from ..idonly import IdOnlySeqLib 11 | from ..overlap import OverlapSeqLib 12 | 13 | 14 | SEQLIB_LABEL_TEXT = OrderedDict( 15 | [ 16 | ("BcvSeqLib", "Barcoded Variant"), 17 | ("BcidSeqLib", "Barcoded Identifier"), 18 | ("OverlapSeqLib", "Overlap"), 19 | ("BasicSeqLib", "Basic"), 20 | ("BarcodeSeqLib", "Barcodes Only"), 21 | ("IdOnlySeqLib", "Identifiers Only"), 22 | ] 23 | ) 24 | 25 | #: map class names to class definitions to avoid use of globals() 26 | SEQLIB_CLASSES = { 27 | "BarcodeSeqLib": BarcodeSeqLib, 28 | "BcvSeqLib": BcvSeqLib, 29 | "BcidSeqLib": BcidSeqLib, 30 | "BasicSeqLib": BasicSeqLib, 31 | "IdOnlySeqLib": IdOnlySeqLib, 32 | "OverlapSeqLib": OverlapSeqLib, 33 | } 34 | 35 | 36 | class CreateSeqLibDialog(tkinter.simpledialog.Dialog): 37 | """ 38 | Dialog box for creating a new SeqLib. 39 | """ 40 | 41 | def __init__(self, parent_window, title="New SeqLib"): 42 | self.element_tkstring = tk.StringVar() 43 | self.element_type = None 44 | tkinter.simpledialog.Dialog.__init__(self, parent_window, title) 45 | 46 | def body(self, master): 47 | message = tkinter.ttk.Label(master, text="SeqLib type:") 48 | message.grid(column=0, row=0) 49 | 50 | for i, k in enumerate(SEQLIB_LABEL_TEXT.keys()): 51 | rb = tkinter.ttk.Radiobutton( 52 | master, 53 | text=SEQLIB_LABEL_TEXT[k], 54 | variable=self.element_tkstring, 55 | value=k, 56 | ) 57 | rb.grid(column=0, row=(i + 1), sticky="w") 58 | if i == 0: 59 | rb.invoke() 60 | 61 | def buttonbox(self): 62 | """ 63 | Display only one button. 
64 | """ 65 | box = tk.Frame(self) 66 | 67 | w = tk.Button(box, text="OK", width=10, command=self.ok, default="active") 68 | w.pack(side="left", padx=5, pady=5) 69 | 70 | self.bind("", self.ok) 71 | 72 | box.pack() 73 | 74 | def apply(self): 75 | try: 76 | self.element_type = SEQLIB_CLASSES[self.element_tkstring.get()] 77 | except KeyError: 78 | raise KeyError("Unrecognized element type.") 79 | -------------------------------------------------------------------------------- /enrich2/gui/delete_dialog.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | import tkinter.ttk 3 | import tkinter.simpledialog 4 | 5 | 6 | def subtree_ids(treeview, x, level=0): 7 | """ 8 | Return a list of tuples containing the ids and levels for *x* and every element below it in the Treeview *treeview*. 9 | 10 | The level of *x* is 0, children of *x* are 1, and so forth. 11 | """ 12 | id_list = list() 13 | id_list.append((x, level)) 14 | for y in treeview.get_children(x): 15 | id_list.extend(subtree_ids(treeview, y, level + 1)) 16 | return id_list 17 | 18 | 19 | class DeleteDialog(tkinter.simpledialog.Dialog): 20 | """ 21 | Confirmation dialog box for deleting the selected items from the Treeview. 22 | """ 23 | 24 | def __init__(self, parent_window, tree, title="Confirm Deletion"): 25 | self.tree = tree 26 | self.id_tuples = list() 27 | for x in self.tree.treeview.selection(): 28 | if x not in [y[0] for y in self.id_tuples]: 29 | self.id_tuples.extend(subtree_ids(self.tree.treeview, x)) 30 | tkinter.simpledialog.Dialog.__init__(self, parent_window, title) 31 | 32 | def body(self, master): 33 | """ 34 | Generates the required text listing all elements that will be deleted. 35 | 36 | Displays the "OK" and "Cancel" buttons. 37 | """ 38 | if len(self.id_tuples) == 0: 39 | message_string = "No elements selected." 40 | elif len(self.id_tuples) == 1: 41 | message_string = 'Delete "{}"?'.format( 42 | self.tree.get_element(self.id_tuples[0][0]).name 43 | ) 44 | else: 45 | message_string = "Delete the following items?\n" 46 | for x, level in self.id_tuples: 47 | if level == 0: 48 | bullet = " " + "\u25C6" 49 | else: 50 | bullet = " " * (level + 1) + "\u25C7" 51 | message_string += "{bullet} {name}\n".format( 52 | bullet=bullet, name=self.tree.get_element(x).name 53 | ) 54 | message = tkinter.ttk.Label(master, text=message_string, justify="left") 55 | message.grid(row=0, sticky="w") 56 | 57 | def buttonbox(self): 58 | """ 59 | Display only one button if there's no selection. Otherwise, use the default method to display two buttons. 60 | """ 61 | if len(self.id_tuples) == 0: 62 | box = tk.Frame(self) 63 | 64 | w = tk.Button( 65 | box, text="OK", width=10, command=self.cancel, default="active" 66 | ) 67 | w.pack(side="left", padx=5, pady=5) 68 | 69 | self.bind("", self.cancel) 70 | 71 | box.pack() 72 | else: 73 | tkinter.simpledialog.Dialog.buttonbox(self) 74 | 75 | def apply(self): 76 | """ 77 | Called when the user chooses "OK". Performs the deletion. 
78 | """ 79 | for tree_id, _ in self.id_tuples: 80 | self.tree.delete_element(tree_id) 81 | self.tree.refresh_treeview() 82 | -------------------------------------------------------------------------------- /enrich2/gui/dialog_elements.py: -------------------------------------------------------------------------------- 1 | 2 | import tkinter as tk 3 | import tkinter.ttk 4 | import tkinter.messagebox 5 | import tkinter.filedialog 6 | import os.path 7 | 8 | DEFAULT_COLUMNS = 3 9 | 10 | 11 | class SectionLabel(object): 12 | def __init__(self, text): 13 | self.text = text 14 | 15 | def body(self, master, row, columns=DEFAULT_COLUMNS, **kwargs): 16 | label = tkinter.ttk.Label(master, text=self.text) 17 | label.grid(row=row, column=0, columnspan=columns, sticky="w") 18 | return 1 19 | 20 | def validate(self): 21 | return True 22 | 23 | def apply(self): 24 | return None 25 | 26 | def enable(self): 27 | pass 28 | 29 | def disable(self): 30 | pass 31 | 32 | 33 | class Checkbox(object): 34 | def __init__(self, text, cfg, key): 35 | self.checkbox = None 36 | self.enabled = True 37 | 38 | self.value = tk.BooleanVar() 39 | self.text = text 40 | self.cfg = cfg 41 | self.key = key 42 | try: 43 | if self.cfg[self.key] not in (True, False): 44 | self.value.set(False) 45 | else: 46 | self.value.set(self.cfg[self.key]) 47 | except KeyError: 48 | self.value.set(False) # default to False 49 | 50 | def body(self, master, row, columns=DEFAULT_COLUMNS, **kwargs): 51 | """ 52 | Place the required elements using the grid layout method. 53 | 54 | Returns the number of rows taken by this element. 55 | """ 56 | self.checkbox = tkinter.ttk.Checkbutton(master, text=self.text, variable=self.value) 57 | self.checkbox.grid(row=row, column=0, columnspan=columns, sticky="w") 58 | return 1 59 | 60 | def validate(self): 61 | return True 62 | 63 | def apply(self): 64 | if self.enabled: 65 | self.cfg[self.key] = self.value.get() 66 | else: 67 | self.cfg[self.key] = None 68 | 69 | def enable(self): 70 | self.enabled = True 71 | self.checkbox.state(["!disabled"]) 72 | 73 | def disable(self): 74 | self.enabled = False 75 | self.checkbox.state(["disabled"]) 76 | 77 | 78 | class MyEntry(object): 79 | """ 80 | Base class for labeled Entry fields. 81 | 82 | *text* is the Label/error box text. 83 | """ 84 | 85 | def __init__(self, text, cfg, key, optional=False): 86 | self.entry = None 87 | self.enabled = True 88 | 89 | self.value = tk.StringVar() 90 | self.text = text 91 | self.cfg = cfg 92 | self.key = key 93 | self.optional = optional 94 | try: 95 | if self.cfg[self.key] is None: 96 | self.value.set("") 97 | else: 98 | self.value.set(self.cfg[self.key]) 99 | except KeyError: 100 | self.value.set("") 101 | 102 | def body(self, master, row, columns=DEFAULT_COLUMNS, **kwargs): 103 | """ 104 | Place the required elements using the grid layout method. 105 | 106 | Returns the number of rows taken by this element. 107 | """ 108 | label = tkinter.ttk.Label(master, text=self.text) 109 | label.grid(row=row, column=0, columnspan=1, sticky="e") 110 | self.entry = tkinter.ttk.Entry(master, textvariable=self.value) 111 | self.entry.grid(row=row, column=1, columnspan=columns - 1, sticky="ew") 112 | return 1 113 | 114 | def validate(self): 115 | """ 116 | Validates the input. Returns ``True`` unless the field is blank and 117 | *optional* is ``False``. 
118 | """ 119 | if not self.enabled: 120 | return True 121 | elif not self.optional and len(self.value.get()) == 0: 122 | tkinter.messagebox.showwarning("", "{} not specified.".format(self.text)) 123 | return False 124 | else: 125 | return True 126 | 127 | def apply(self): 128 | if self.enabled and len(self.value.get()) > 0: 129 | self.cfg[self.key] = self.value.get() 130 | else: 131 | self.cfg[self.key] = None 132 | 133 | def enable(self): 134 | self.enabled = True 135 | self.entry.state(["!disabled"]) 136 | 137 | def disable(self): 138 | self.enabled = False 139 | self.entry.state(["disabled"]) 140 | 141 | 142 | class FileEntry(MyEntry): 143 | """ 144 | Creates a labeled Entry field for a file or directory. 145 | 146 | *text* is the Label/error box text. 147 | *directory* is ``True`` if selecting a directory (instead of a file). 148 | *extensions* is a list of valid file endings 149 | 150 | """ 151 | 152 | def __init__( 153 | self, text, cfg, key, optional=False, directory=False, extensions=None 154 | ): 155 | MyEntry.__init__(self, text, cfg, key, optional) 156 | self.choose = None 157 | self.clear = None 158 | 159 | self.directory = directory 160 | if extensions is not None: 161 | self.extensions = [x.lower() for x in extensions] 162 | else: 163 | self.extensions = None 164 | 165 | def body(self, master, row, columns=DEFAULT_COLUMNS, **kwargs): 166 | """ 167 | Place the required elements using the grid layout method. 168 | 169 | Returns the number of rows taken by this element. 170 | """ 171 | label = tkinter.ttk.Label(master, text=self.text) 172 | label.grid(row=row, column=0, columnspan=1, sticky="e") 173 | self.entry = tkinter.ttk.Entry(master, textvariable=self.value) 174 | self.entry.grid(row=row, column=1, columnspan=columns - 1, sticky="ew") 175 | if self.directory: 176 | self.choose = tkinter.ttk.Button( 177 | master, 178 | text="Choose...", 179 | command=lambda: self.value.set(tkinter.filedialog.askdirectory()), 180 | ) 181 | else: 182 | self.choose = tkinter.ttk.Button( 183 | master, 184 | text="Choose...", 185 | command=lambda: self.value.set(tkinter.filedialog.askopenfilename()), 186 | ) 187 | self.choose.grid(row=row + 1, column=1, sticky="w") 188 | if self.optional: 189 | self.clear = tkinter.ttk.Button( 190 | master, text="Clear", command=lambda: self.value.set("") 191 | ) 192 | self.clear.grid(row=row + 1, column=2, sticky="e") 193 | return 2 194 | 195 | def validate(self): 196 | if not self.enabled: 197 | return True 198 | elif len(self.value.get()) == 0: 199 | if not self.optional: 200 | tkinter.messagebox.showwarning("", "{} not specified.".format(self.text)) 201 | return False 202 | else: 203 | return True 204 | else: 205 | if os.path.exists(self.value.get()): 206 | if self.extensions is not None: 207 | if any( 208 | self.value.get().lower().endswith(x) for x in self.extensions 209 | ): 210 | return True 211 | else: 212 | tkinter.messagebox.showwarning( 213 | "", "Invalid file extension " "for {}.".format(self.text) 214 | ) 215 | return False 216 | else: # no extension restriction 217 | return True 218 | else: 219 | tkinter.messagebox.showwarning( 220 | "", "{} file does not exist." 
"".format(self.text) 221 | ) 222 | return False 223 | 224 | def enable(self): 225 | self.enabled = True 226 | self.entry.state(["!disabled"]) 227 | self.choose.state(["!disabled"]) 228 | if self.optional: 229 | self.clear.state(["!disabled"]) 230 | 231 | def disable(self): 232 | self.enabled = False 233 | self.entry.state(["disabled"]) 234 | self.choose.state(["disabled"]) 235 | if self.optional: 236 | self.clear.state(["disabled"]) 237 | 238 | 239 | class StringEntry(MyEntry): 240 | """ 241 | Creates a labeled Entry field for a string. 242 | 243 | *text* is the Label/error box text. 244 | """ 245 | 246 | def __init__(self, text, cfg, key, optional=False): 247 | MyEntry.__init__(self, text, cfg, key, optional) 248 | 249 | def body(self, master, row, columns=DEFAULT_COLUMNS, **kwargs): 250 | """ 251 | Place the required elements using the grid layout method. 252 | 253 | Returns the number of rows taken by this element. 254 | """ 255 | label = tkinter.ttk.Label(master, text=self.text) 256 | label.grid(row=row, column=0, columnspan=1, sticky="e") 257 | self.entry = tkinter.ttk.Entry(master, textvariable=self.value) 258 | self.entry.grid(row=row, column=1, columnspan=columns - 1, sticky="ew") 259 | return 1 260 | 261 | 262 | class IntegerEntry(MyEntry): 263 | """ 264 | Creates a labeled Entry field for an integer. 265 | 266 | *text* is the Label/error box text. 267 | """ 268 | 269 | def __init__(self, text, cfg, key, optional=False, minvalue=0): 270 | MyEntry.__init__(self, text, cfg, key, optional) 271 | self.minvalue = minvalue 272 | 273 | def body(self, master, row, columns=DEFAULT_COLUMNS, width=4, left=False, **kwargs): 274 | """ 275 | Add the labeled entry to the Frame *master* using grid at *row*. 276 | 277 | *width* controls the width of the Entry. 278 | *left* is ``True`` if the Entry is to the left of the Label. 279 | *columns* is the number of columns in *master*. 280 | 281 | Returns the number of rows taken by this element. 282 | """ 283 | if left: 284 | entry_column = 0 285 | entry_sticky = "e" 286 | entry_width = 1 287 | label_column = 1 288 | label_sticky = "w" 289 | label_width = columns - 1 290 | else: 291 | entry_column = 1 292 | entry_sticky = "w" 293 | entry_width = columns - 1 294 | label_column = 0 295 | label_sticky = "e" 296 | label_width = 1 297 | 298 | label = tkinter.ttk.Label(master, text=self.text) 299 | label.grid( 300 | row=row, column=label_column, columnspan=label_width, sticky=label_sticky 301 | ) 302 | self.entry = tkinter.ttk.Entry(master, textvariable=self.value, width=width) 303 | self.entry.grid( 304 | row=row, column=entry_column, columnspan=entry_width, sticky=entry_sticky 305 | ) 306 | return 1 307 | 308 | def validate(self): 309 | """ 310 | Returns ``True`` if the value entered validates; else ``False``. 311 | 312 | If *self.optional* is ``True``, the field can be empty. 313 | Checks the *self.minvalue* that was passed on creation. 314 | """ 315 | if not self.enabled: 316 | return True 317 | else: 318 | try: 319 | intvalue = int(self.value.get()) 320 | except ValueError: 321 | if len(self.value.get()) == 0: 322 | if not self.optional: 323 | tkinter.messagebox.showwarning( 324 | "", "{} not specified." "".format(self.text) 325 | ) 326 | return False 327 | else: 328 | return True 329 | else: 330 | tkinter.messagebox.showwarning( 331 | "", "{} is not an integer." 
"".format(self.text) 332 | ) 333 | return False 334 | else: 335 | if intvalue < self.minvalue: 336 | tkinter.messagebox.showwarning( 337 | "", 338 | "{} lower than minimum value " 339 | "({}).".format(self.text, self.minvalue), 340 | ) 341 | return False 342 | else: 343 | return True 344 | 345 | def apply(self): 346 | if self.enabled and len(self.value.get()) > 0: 347 | self.cfg[self.key] = int(self.value.get()) 348 | else: 349 | self.cfg[self.key] = None 350 | -------------------------------------------------------------------------------- /enrich2/gui/runner_window.py: -------------------------------------------------------------------------------- 1 | 2 | import tkinter as tk 3 | import tkinter.ttk 4 | import tkinter.simpledialog 5 | import tkinter.messagebox 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class RunnerSavePrompt(tkinter.simpledialog.Dialog): 12 | """ 13 | Dialog box for prompting the user to save before running. 14 | """ 15 | 16 | def __init__(self, parent_window, title="Enrich2"): 17 | self.pw = parent_window 18 | 19 | self.dialog_text = tk.StringVar() 20 | self.dialog_text.set("Would you like to save your config changes?") 21 | 22 | tkinter.simpledialog.Dialog.__init__(self, parent_window, title) 23 | 24 | def body(self, master): 25 | frame = tkinter.ttk.Frame(master, padding=(12, 6, 12, 6)) 26 | frame.pack() 27 | 28 | dialog_text_label = tkinter.ttk.Label(frame, textvariable=self.dialog_text) 29 | dialog_text_label.grid(column=0, row=0, sticky="nsew") 30 | 31 | def apply(self): 32 | self.pw.menu_save() 33 | 34 | 35 | class RunnerWindow(tkinter.simpledialog.Dialog): 36 | """ 37 | Dialog box for blocking input while running the analysis. 38 | """ 39 | 40 | def __init__(self, parent_window, title="Enrich2"): 41 | self.pw = parent_window 42 | self.run_button = None 43 | 44 | self.dialog_text = tk.StringVar() 45 | self.dialog_text.set("Ready to start analysis...") 46 | 47 | tkinter.simpledialog.Dialog.__init__(self, parent_window, title) 48 | 49 | def body(self, master): 50 | frame = tkinter.ttk.Frame(master, padding=(12, 6, 12, 6)) 51 | frame.pack() 52 | 53 | dialog_text_label = tkinter.ttk.Label(frame, textvariable=self.dialog_text) 54 | dialog_text_label.grid(column=0, row=0, sticky="nsew") 55 | 56 | self.run_button = tk.Button( 57 | frame, text="Begin", width=10, command=self.runner, default="active" 58 | ) 59 | self.run_button.grid(column=0, row=1, sticky="nsew") 60 | 61 | def buttonbox(self): 62 | """ 63 | Display no buttons. 
64 | """ 65 | pass 66 | 67 | def runner(self): 68 | # gray out the "Run" button 69 | self.run_button.config(state="disabled") 70 | self.update_idletasks() 71 | 72 | # set the analysis options 73 | self.pw.root_element.force_recalculate = self.pw.force_recalculate.get() 74 | self.pw.root_element.component_outliers = self.pw.component_outliers.get() 75 | self.pw.root_element.scoring_method = self.pw.scoring_method.get() 76 | self.pw.root_element.logr_method = self.pw.logr_method.get() 77 | self.pw.root_element.plots_requested = self.pw.plots_requested.get() 78 | self.pw.root_element.tsv_requested = self.pw.tsv_requested.get() 79 | 80 | # run the analysis, catching any errors to display in a dialog box 81 | try: 82 | # ensure that all objects are valid 83 | self.pw.root_element.validate() 84 | 85 | # open HDF5 files for the root and all child objects 86 | self.pw.root_element.store_open(children=True) 87 | 88 | # perform the analysis 89 | self.pw.root_element.calculate() 90 | 91 | except Exception as e: 92 | # display error 93 | logger.error(e) 94 | tkinter.messagebox.showerror( 95 | "Enrich2 Error", "Enrich2 encountered an error:\n{}".format(e) 96 | ) 97 | 98 | else: 99 | # no exception occurred during calculation and setup 100 | # generate desired output 101 | if self.pw.plots_requested.get(): 102 | try: 103 | self.pw.root_element.make_plots() 104 | except Exception as e: 105 | tkinter.messagebox.showwarning( 106 | None, 107 | "Calculations completed, but plotting failed:\n{}".format(e), 108 | ) 109 | if self.pw.tsv_requested.get(): 110 | try: 111 | self.pw.root_element.write_tsv() 112 | except Exception as e: 113 | tkinter.messagebox.showwarning( 114 | None, 115 | "Calculations completed, but tsv output failed:\n{}".format(e), 116 | ) 117 | 118 | # show the dialog box 119 | tkinter.messagebox.showinfo("", "Analysis completed.") 120 | 121 | finally: 122 | # close the HDF5 files 123 | self.pw.root_element.store_close(children=True) 124 | 125 | # close this window 126 | self.destroy() 127 | -------------------------------------------------------------------------------- /enrich2/gui/seqlib_apply_dialog.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | import tkinter.ttk 3 | import tkinter.simpledialog 4 | 5 | 6 | class SeqLibApplyDialog(tkinter.simpledialog.Dialog): 7 | """ 8 | Confirmation dialog box for applying FASTQ filtering options to selected SeqLibs from the Treeview. 9 | """ 10 | 11 | def __init__( 12 | self, parent_window, tree, source_id, title="Confirm Filtering Changes" 13 | ): 14 | self.tree = tree 15 | self.source_id = source_id 16 | self.target_ids = [ 17 | x 18 | for x in self.tree.treeview.selection() 19 | if x != source_id 20 | and type(self.tree.get_element(self.source_id)) 21 | == type(self.tree.get_element(x)) 22 | ] 23 | tkinter.simpledialog.Dialog.__init__(self, parent_window, title) 24 | 25 | def body(self, master): 26 | """ 27 | Generates the required text listing all SeqLibs that will have their FASTQ options updated. 28 | 29 | Displays the "OK" and "Cancel" buttons. 30 | """ 31 | if len(self.target_ids) == 0: 32 | message_string = "No elegible SeqLibs selected." 
33 | elif len(self.target_ids) == 1: 34 | message_string = 'Apply FASTQ filtering options from "{}" to "{}"?'.format( 35 | self.tree.get_element(self.source_id).name, 36 | self.tree.get_element(self.target_ids[0]).name, 37 | ) 38 | else: 39 | bullet = " " + "\u25C6" 40 | message_string = 'Apply FASTQ filtering options from "{}" to the following?\n'.format( 41 | self.tree.get_element(self.source_id).name 42 | ) 43 | for x in self.target_ids: 44 | message_string += "{bullet} {name}\n".format( 45 | bullet=bullet, name=self.tree.get_element(x).name 46 | ) 47 | message = tkinter.ttk.Label(master, text=message_string, justify="left") 48 | message.grid(row=0, sticky="w") 49 | 50 | def buttonbox(self): 51 | """ 52 | Display only one button if there's no selection. Otherwise, use the default method to display two buttons. 53 | """ 54 | if len(self.target_ids) == 0: 55 | box = tk.Frame(self) 56 | 57 | w = tk.Button( 58 | box, text="OK", width=10, command=self.cancel, default="active" 59 | ) 60 | w.pack(side="left", padx=5, pady=5) 61 | 62 | self.bind("<Return>", self.cancel) 63 | 64 | box.pack() 65 | else: 66 | tkinter.simpledialog.Dialog.buttonbox(self) 67 | 68 | def apply(self): 69 | """ 70 | Called when the user chooses "OK". Performs the FASTQ filtering update. 71 | """ 72 | filter_cfg = self.tree.get_element(self.source_id).serialize_filters() 73 | for x in self.target_ids: 74 | self.tree.get_element(x).filters = filter_cfg 75 | self.tree.refresh_treeview() 76 | -------------------------------------------------------------------------------- /enrich2/idonly.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .seqlib import SeqLib 3 | 4 | 5 | class IdOnlySeqLib(SeqLib): 6 | """ 7 | Class for counting data with non-variant identifiers and no associated 8 | FASTQ_ data. 9 | """ 10 | 11 | treeview_class_name = "ID-only SeqLib" 12 | 13 | def __init__(self): 14 | SeqLib.__init__(self) 15 | self.identifier_min_count = 0 16 | self.add_label("identifiers") 17 | self.logger = logging.getLogger("{}.{}".format(__name__, self.__class__)) 18 | 19 | def configure(self, cfg): 20 | """ 21 | Set up the object using the config object *cfg*, usually derived from 22 | a ``.json`` file. 23 | """ 24 | SeqLib.configure(self, cfg) 25 | self.logger = logging.getLogger( 26 | "{}.{} - {}".format(__name__, self.__class__.__name__, self.name) 27 | ) 28 | try: 29 | if "min count" in cfg["identifiers"]: 30 | self.identifier_min_count = int(cfg["identifiers"]["min count"]) 31 | except KeyError as key: 32 | raise KeyError( 33 | "Missing required config value {key} [{name}]" 34 | "".format(key=key, name=self.name) 35 | ) 36 | 37 | def serialize(self): 38 | """ 39 | Format this object (and its children) as a config object suitable for 40 | dumping to a config file. 41 | """ 42 | cfg = SeqLib.serialize(self) 43 | 44 | cfg["identifiers"] = dict() 45 | if self.identifier_min_count > 0: 46 | cfg["identifiers"]["min count"] = self.identifier_min_count 47 | 48 | return cfg 49 | 50 | def calculate(self): 51 | """ 52 | Get the identifier counts from the counts file.
53 | """ 54 | if not self.check_store("/main/identifiers/counts"): 55 | if self.counts_file is not None: 56 | self.counts_from_file(self.counts_file) 57 | else: 58 | raise ValueError("Missing counts file [{}]".format(self.name)) 59 | self.save_filtered_counts( 60 | "identifiers", "count >= self.identifier_min_count" 61 | ) 62 | -------------------------------------------------------------------------------- /enrich2/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | 4 | from argparse import ArgumentParser, RawDescriptionHelpFormatter 5 | import logging 6 | import json 7 | import sys 8 | import platform 9 | import os.path 10 | 11 | if platform.system() == "Darwin": 12 | # Explicitly set the backend to avoid the NSInvalidArgumentException when 13 | # running in GUI mode. Advanced users who want to use another matplotlib 14 | # backend when running in MacOS on the command line can modify this section 15 | # accordingly. 16 | import matplotlib 17 | 18 | matplotlib.use("TkAgg") 19 | elif os.path.exists("/.dockerenv"): 20 | # Explicitly set the backend for running inside Docker. This may fail for 21 | # older versions of docker or alternative containerization tools such as 22 | # Singularity. 23 | import matplotlib 24 | 25 | matplotlib.use("Agg") 26 | import enrich2.config_check as config_check 27 | from enrich2.experiment import Experiment 28 | from enrich2.selection import Selection 29 | from enrich2.barcode import BarcodeSeqLib 30 | from enrich2.barcodeid import BcidSeqLib 31 | from enrich2.barcodevariant import BcvSeqLib 32 | from enrich2.basic import BasicSeqLib 33 | from enrich2.overlap import OverlapSeqLib 34 | from enrich2.idonly import IdOnlySeqLib 35 | from enrich2.storemanager import SCORING_METHODS, LOGR_METHODS 36 | from enrich2.gui.configurator import Configurator 37 | from enrich2.sfmap import parse_aa_list 38 | from enrich2 import __version__ 39 | 40 | 41 | #: Name of the driver script. Used for logging output. 42 | DRIVER_NAME = os.path.basename(sys.argv[0]) 43 | 44 | 45 | #: Format string for log entries (console or file). 46 | LOG_FORMAT = "%(asctime)-15s [%(name)s] %(message)s" 47 | 48 | #: Default log level 49 | LOG_LEVEL = logging.INFO 50 | 51 | 52 | def main_gui(): 53 | """ 54 | Entry point for GUI. 55 | 56 | """ 57 | logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT) 58 | app = Configurator(__version__) 59 | app.mainloop() 60 | 61 | 62 | def main_cmd(): 63 | """ 64 | Entry point for command line. 
65 | 66 | """ 67 | # build description string based on available methods 68 | desc_string = ( 69 | "Command-line driver for Enrich2 v{}".format(__version__) 70 | + "\n\nscoring methods:\n" 71 | + "\n".join([" {:22}{}".format(k, v) for k, v in list(SCORING_METHODS.items())]) 72 | + "\n\nlog ratio methods:\n" 73 | + "\n".join([" {:22}{}".format(k, v) for k, v in list(LOGR_METHODS.items())]) 74 | ) 75 | 76 | # create parser and add description 77 | parser = ArgumentParser( 78 | prog="Enrich2", 79 | description=desc_string, 80 | formatter_class=RawDescriptionHelpFormatter, 81 | ) 82 | 83 | # add command line arguments 84 | parser.add_argument("config", help="JSON configuration file") 85 | parser.add_argument( 86 | "scoring_method", help="scoring method", choices=list(SCORING_METHODS.keys()) 87 | ) 88 | parser.add_argument( 89 | "logr_method", help="log ratio method", choices=list(LOGR_METHODS.keys()) 90 | ) 91 | 92 | # add support for semantic version checking 93 | parser.add_argument( 94 | "--version", action="version", version="{}".format(__version__) 95 | ) 96 | 97 | # add analysis options 98 | parser.add_argument( 99 | "--log", metavar="FILE", dest="log_file", help="path to log file" 100 | ) 101 | parser.add_argument( 102 | "--no-plots", 103 | dest="plots_requested", 104 | action="store_false", 105 | default=True, 106 | help="don't make plots", 107 | ) 108 | parser.add_argument( 109 | "--no-tsv", 110 | dest="tsv_requested", 111 | action="store_false", 112 | default=True, 113 | help="don't generate tsv files", 114 | ) 115 | parser.add_argument( 116 | "--recalculate", 117 | dest="force_recalculate", 118 | action="store_true", 119 | default=False, 120 | help="force recalculation", 121 | ) 122 | parser.add_argument( 123 | "--component-outliers", 124 | dest="component_outliers", 125 | action="store_true", 126 | default=False, 127 | help="calculate component outlier stats", 128 | ) 129 | parser.add_argument( 130 | "--output-dir", 131 | metavar="DIR", 132 | dest="output_dir_override", 133 | help="override the config file's output directory", 134 | ) 135 | parser.add_argument( 136 | "--sfmap-aa-file", 137 | metavar="FILE", 138 | dest="sfmap_aa_file", 139 | help="amino acid groups for sequence-function maps", 140 | ) 141 | 142 | args = parser.parse_args() 143 | 144 | # start the logs 145 | if args.log_file is not None: 146 | # Create directory if it doesn't exist 147 | log_dir = os.path.dirname(args.log_file) 148 | if not os.path.exists(log_dir): 149 | os.makedirs(log_dir) 150 | logging.basicConfig(filename=args.log_file, encoding='utf-8', level=LOG_LEVEL, format=LOG_FORMAT) 151 | else: 152 | logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT) 153 | logger = logging.getLogger(__name__) 154 | 155 | # read the JSON file 156 | try: 157 | cfg = json.load(open(args.config, "r")) 158 | except IOError: 159 | raise IOError("Failed to open '{}' [{}]".format(args.config, DRIVER_NAME)) 160 | except ValueError: 161 | raise ValueError("Improperly formatted .json file [{}]".format(DRIVER_NAME)) 162 | 163 | # identify config file type and create the object 164 | if config_check.is_experiment(cfg): 165 | logger.info("Detected an Experiment config file") 166 | obj = Experiment() 167 | elif config_check.is_selection(cfg): 168 | logger.info("Detected a Selection config file") 169 | obj = Selection() 170 | elif config_check.is_seqlib(cfg): 171 | seqlib_type = config_check.seqlib_type(cfg) 172 | logger.info("Detected a %s config file", seqlib_type) 173 | if seqlib_type == "BarcodeSeqLib": 174 | obj = 
BarcodeSeqLib() 175 | elif seqlib_type == "BcidSeqLib": 176 | obj = BcidSeqLib() 177 | elif seqlib_type == "BcvSeqLib": 178 | obj = BcvSeqLib() 179 | elif seqlib_type == "BasicSeqLib": 180 | obj = BasicSeqLib() 181 | elif seqlib_type == "OverlapSeqLib": 182 | obj = OverlapSeqLib() 183 | elif seqlib_type == "IdOnlySeqLib": 184 | obj = IdOnlySeqLib() 185 | else: 186 | raise ValueError( 187 | "Unrecognized SeqLib type '{}' [{}]".format(seqlib_type, DRIVER_NAME) 188 | ) 189 | else: 190 | raise ValueError("Unrecognized .json config [{}]".format(DRIVER_NAME)) 191 | 192 | # set analysis options 193 | obj.force_recalculate = args.force_recalculate 194 | obj.component_outliers = args.component_outliers 195 | obj.scoring_method = args.scoring_method 196 | obj.logr_method = args.logr_method 197 | obj.plots_requested = args.plots_requested 198 | obj.tsv_requested = args.tsv_requested 199 | 200 | if args.output_dir_override is not None: 201 | obj.output_dir_override = True 202 | obj.output_dir = args.output_dir_override 203 | else: 204 | obj.output_dir_override = False 205 | 206 | if args.sfmap_aa_file is not None: 207 | obj.plot_options = dict() 208 | obj.plot_options["aa_list"], obj.plot_options[ 209 | "aa_label_groups" 210 | ] = parse_aa_list(args.sfmap_aa_file) 211 | 212 | # configure the object 213 | obj.configure(cfg) 214 | 215 | # make sure objects are valid 216 | try: 217 | obj.validate() 218 | except ValueError: 219 | logger.exception("Invalid configuration") 220 | else: 221 | # open HDF5 files for the object and all child objects 222 | obj.store_open(children=True) 223 | 224 | # perform the analysis 225 | obj.calculate() 226 | 227 | # generate desired output 228 | 229 | try: 230 | obj.make_plots() 231 | except Exception: 232 | logger.exception("Calculations completed, but plotting failed.") 233 | try: 234 | obj.write_tsv() 235 | except Exception: 236 | logger.exception("Calculations completed, but TSV output failed.") 237 | 238 | # clean up 239 | obj.store_close(children=True) 240 | 241 | 242 | if __name__ == "__main__": 243 | gui_mode = False 244 | 245 | try: 246 | if sys.argv[1] == "gui": 247 | gui_mode = True 248 | except IndexError: 249 | pass 250 | 251 | if gui_mode: 252 | main_gui() 253 | else: 254 | main_cmd() 255 | -------------------------------------------------------------------------------- /enrich2/overlap.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import logging 4 | 5 | from matplotlib.backends.backend_pdf import PdfPages 6 | import os.path 7 | from .plots import overlap_merge_plot 8 | from .seqlib import SeqLib 9 | from .variant import VariantSeqLib 10 | from fqfa import open_compressed, parse_fastq_pe_reads, has_fastq_ext 11 | from fqfa.fastq.fastqread import FastqRead 12 | from .fastqheader import fastq_read_is_chaste 13 | 14 | 15 | class OverlapSeqLib(VariantSeqLib): 16 | """ 17 | Class for count data from sequencing libraries with overlapping paired-end 18 | reads for each variant. Creating a 19 | :py:class:`~seqlib.overlap.OverlapSeqLib` requires a valid *config* object 20 | with an ``'overlap'`` entry. 21 | 22 | The ``"fastq"`` config entry must contain two read files, with the keys 23 | ``"forward reads"`` and ``"reverse reads"``. Information about how to combine these 24 | reads is in the ``"overlap"`` config entry.
25 | 26 | The ``"overlap"`` config entry contains the following keys: 27 | 28 | * ``"forward start"`` --- position in the forward read where the \ 29 | overlapping region begins 30 | * ``"reverse start"`` --- position in the reverse read where the \ 31 | overlapping region begins (before being reverse-complemented) 32 | * ``"length"`` --- number of bases in the overlapping region 33 | * ``"max mismatches"`` --- maximum number of mismatches tolerated in the \ 34 | overlapping region before discarding the read 35 | * ``"overlap only"`` --- whether to trim the merged read to contain only \ 36 | the overlapping region (optional, defaults to ``False``) 37 | 38 | Here is a schematic of the case in the above JSON example:: 39 | 40 | forward ---> 1 41 | CGACGCAAGGA 42 | ||||||||| 43 | ACTCCTTGCGTCG 44 | 1 <--- reverse 45 | 46 | Note that the merged sequence is identical to the wild type sequence given 47 | in the JSON file. 48 | """ 49 | 50 | treeview_class_name = "Overlap SeqLib" 51 | 52 | def __init__(self): 53 | VariantSeqLib.__init__(self) 54 | self.forward = None 55 | self.reverse = None 56 | self.fwd_start = None 57 | self.rev_start = None 58 | self.overlap_length = None 59 | self.trim = None 60 | self.max_overlap_mismatches = None 61 | self.merge_mismatches = None 62 | self.default_filters.update({"merge failure": True}) 63 | self.default_filters.update({"remove unresolvable": False}) 64 | self.logger = logging.getLogger("{}.{}".format(__name__, self.__class__)) 65 | 66 | def configure(self, cfg): 67 | """ 68 | Set up the object using the config object *cfg*, usually derived from 69 | a ``.json`` file. 70 | """ 71 | VariantSeqLib.configure(self, cfg) 72 | self.logger = logging.getLogger( 73 | "{}.{} - {}".format(__name__, self.__class__.__name__, self.name) 74 | ) 75 | 76 | # if counts are specified, copy them later 77 | # else handle the FASTQ config options and check the files 78 | if self.counts_file is None: 79 | self.configure_fastq(cfg) 80 | try: 81 | self.fwd_start = int(cfg["overlap"]["forward start"]) 82 | self.rev_start = int(cfg["overlap"]["reverse start"]) 83 | self.overlap_length = int(cfg["overlap"]["length"]) 84 | self.trim = cfg["overlap"]["trim"] 85 | self.max_overlap_mismatches = int(cfg["overlap"]["max mismatches"]) 86 | 87 | forward_error = False 88 | reverse_error = False 89 | if not has_fastq_ext(self.forward): 90 | forward_error = True 91 | if not has_fastq_ext(self.reverse): 92 | reverse_error = True 93 | if forward_error and reverse_error: 94 | raise IOError( 95 | "FASTQ file error: unrecognized extension (forward and reverse) [{}]".format( 96 | self.name 97 | ) 98 | ) 99 | elif forward_error: 100 | raise IOError( 101 | "FASTQ file error: unrecognized extension (forward) [{}]".format( 102 | self.name 103 | ) 104 | ) 105 | elif reverse_error: 106 | raise IOError( 107 | "FASTQ file error: unrecognized extension (reverse) [{}]".format( 108 | self.name 109 | ) 110 | ) 111 | except IOError as fqerr: 112 | raise IOError("FASTQ file error [{}]: {}".format(self.name, fqerr)) 113 | except KeyError as key: 114 | raise KeyError( 115 | "Missing required config value {key} [{name}]".format( 116 | key=key, name=self.name 117 | ) 118 | ) 119 | except ValueError as value: 120 | raise ValueError( 121 | "Invalid parameter value {value} [{name}]".format( 122 | value=value, name=self.name 123 | ) 124 | ) 125 | 126 | def serialize(self): 127 | """ 128 | Format this object (and its children) as a config object suitable for dumping to a config file. 
129 | """ 130 | cfg = VariantSeqLib.serialize(self) 131 | 132 | cfg["fastq"] = self.serialize_fastq() 133 | cfg["overlap"] = { 134 | "forward start": self.fwd_start, 135 | "reverse start": self.rev_start, 136 | "length": self.overlap_length, 137 | "trim": self.trim, 138 | "max mismatches": self.max_overlap_mismatches, 139 | } 140 | 141 | return cfg 142 | 143 | def configure_fastq(self, cfg): 144 | """ 145 | Set up the object's FASTQ_ file handling and filtering options. 146 | """ 147 | try: 148 | self.forward = cfg["fastq"]["forward reads"] 149 | self.reverse = cfg["fastq"]["reverse reads"] 150 | 151 | if "merge failure" in cfg["fastq"]["filters"]: 152 | raise ValueError( 153 | "'merge failure' is not user-configurable [{}]".format(self.name) 154 | ) 155 | self.filters = cfg["fastq"]["filters"] 156 | except KeyError as key: 157 | raise KeyError( 158 | "Missing required config value {key} [{name}]".format( 159 | key=key, name=self.name 160 | ) 161 | ) 162 | 163 | def serialize_fastq(self): 164 | """ 165 | Serialize this object's FASTQ_ file handling and filtering options. 166 | """ 167 | fastq = { 168 | "forward reads": self.forward, 169 | "reverse reads": self.reverse, 170 | "filters": self.serialize_filters(), 171 | } 172 | 173 | return fastq 174 | 175 | def merge_reads(self, fwd, rev): 176 | """ 177 | Combines the *fwd* and *rev* FASTQ read objects into a 178 | single FASTQ read with the same header information as 179 | *fwd*. Mismatches are resolved by taking the highest quality base. If 180 | discrepant bases have the same quality value, this position is 181 | unresolvable and an ``'X'`` is inserted. Quality values in the 182 | resulting FASTQ read are the maximum quality for the 183 | given base at that position. Returns ``None`` if the maximum number of 184 | mismatches in the overlap region is exceded. 
185 | """ 186 | rev.reverse_complement() 187 | 188 | # print(fwd.sequence, "-" * (self.rev_start - 1), sep="") 189 | # print("-" * (self.fwd_start - 1), rev.sequence, sep="") 190 | rev_extra_start = len(rev) - self.rev_start + 1 191 | fwd_end = self.fwd_start + self.overlap_length - 1 192 | merge = FastqRead( 193 | header=fwd.header + "|" + rev.header, 194 | sequence="A", 195 | header2=fwd.header2 + "|" + rev.header2, 196 | quality_string="#", 197 | quality_encoding_value=fwd.quality_encoding_value, 198 | ) 199 | merge.sequence = fwd.sequence[:fwd_end] + rev.sequence[rev_extra_start:] 200 | merge.quality = fwd.quality[:fwd_end] + rev.quality[rev_extra_start:] 201 | 202 | mismatches = 0 203 | first = True 204 | for i in range(self.overlap_length): 205 | a = self.fwd_start - 1 + i 206 | b = len(rev) - self.rev_start - self.overlap_length + i + 1 207 | try: 208 | if fwd.sequence[a] == rev.sequence[b]: 209 | # take the highest quality value 210 | if rev.quality[b] > fwd.quality[a]: 211 | merge.quality[a] = rev.quality[b] 212 | else: 213 | if fwd.quality[a] == rev.quality[b]: 214 | merge.sequence[a] = "X" # unresolvable 215 | self.merge_mismatches.iloc[i]["unresolved"] += 1 216 | elif rev.quality[b] > fwd.quality[a]: 217 | merge.sequence[a] = rev.sequence[b] 218 | merge.quality[a] = rev.quality[b] 219 | self.merge_mismatches.iloc[i]["resolved"] += 1 220 | else: 221 | # overlap region already same as fwd 222 | self.merge_mismatches.iloc[i]["resolved"] += 1 223 | mismatches += 1 224 | if first: 225 | self.merge_mismatches.iloc[i]["first"] += 1 226 | first = False 227 | except IndexError: 228 | raise IndexError( 229 | "Failed to calculate overlap (a={a}, len(a)={lena}, b={b}, len(b)={lenb}) [{name}]".format( 230 | a=a, 231 | b=b, 232 | lena=len(fwd.sequence), 233 | lenb=len(rev.sequence), 234 | name=self.name, 235 | ) 236 | ) 237 | 238 | if mismatches > self.max_overlap_mismatches: 239 | return None # merge failed 240 | 241 | if self.trim: 242 | merge.trim_length(self.overlap_length, self.fwd_start) 243 | return merge 244 | 245 | def counts_from_reads(self): 246 | df_dict = dict() 247 | 248 | self.merge_mismatches = pd.DataFrame( 249 | data=0, 250 | index=[ 251 | x + self.fwd_start + self.wt.dna_offset 252 | for x in range(0, self.overlap_length) 253 | ], 254 | columns=["resolved", "unresolved", "first"], 255 | ) 256 | 257 | self.logger.info("Counting variants") 258 | max_mut_variants = 0 259 | with open_compressed(self.forward) as handle_fwd, open_compressed(self.reverse) as handle_rev: 260 | for fwd, rev in parse_fastq_pe_reads(handle_fwd, handle_rev): 261 | # filter on chastity before merge 262 | chaste = True 263 | if self.filters["chastity"]: 264 | if not fastq_read_is_chaste(fwd): 265 | chaste = False 266 | if self.report_filtered: 267 | self.report_filtered_read(fwd, "chastity") 268 | if not fastq_read_is_chaste(rev): 269 | chaste = False 270 | if self.report_filtered: 271 | self.report_filtered_read(rev, "chastity") 272 | if not chaste: 273 | self.filter_stats["chastity"] += 1 274 | self.filter_stats["total"] += 1 275 | continue 276 | 277 | merge = self.merge_reads(fwd, rev) 278 | if merge is None: # merge failed 279 | self.filter_stats["merge failure"] += 1 280 | self.filter_stats["total"] += 1 281 | if self.report_filtered: 282 | self.report_filtered_read(fwd, {"merge failure": True}) 283 | self.report_filtered_read(rev, {"merge failure": True}) 284 | else: 285 | if self.read_quality_filter(merge): 286 | mutations = self.count_variant(merge.sequence) 287 | if mutations is None: # merge 
read has too many mutations 288 | max_mut_variants += 1 289 | if self.report_filtered: 290 | self.report_filtered_variant(merge.sequence, 1) 291 | else: 292 | try: 293 | df_dict[mutations] += 1 294 | except KeyError: 295 | df_dict[mutations] = 1 296 | 297 | self.store.put( 298 | "/raw/overlap_mismatches", 299 | self.merge_mismatches, 300 | format="table", 301 | data_columns=self.merge_mismatches.columns, 302 | ) 303 | self.merge_mismatches = None 304 | self.save_counts("variants", df_dict, raw=True) 305 | del df_dict 306 | 307 | if self.aligner is not None: 308 | self.logger.info("Aligned {} variants".format(self.aligner.calls)) 309 | self.aligner_cache = None 310 | self.logger.info( 311 | "Removed {} total variants with excess mutations" 312 | "".format(max_mut_variants) 313 | ) 314 | self.save_filter_stats() 315 | 316 | def calculate(self): 317 | """ 318 | Reads the forward and reverse reads, merges them, performs 319 | quality-based filtering, and counts the variants. 320 | """ 321 | if not self.check_store("/main/variants/counts"): 322 | if not self.check_store("/raw/variants/counts"): 323 | if self.counts_file is not None: 324 | self.counts_from_file(self.counts_file) 325 | else: # count everything 326 | self.counts_from_reads() 327 | self.save_filtered_counts("variants", "count >= self.variant_min_count") 328 | 329 | self.count_synonymous() 330 | 331 | def make_plots(self): 332 | """ 333 | Make plots for :py:class:`~seqlib.seqlib.OverlapSeqLib` objects. 334 | 335 | Creates plots of the location of merged read mismatches. 336 | """ 337 | if self.plots_requested: 338 | SeqLib.make_plots(self) 339 | pdf = PdfPages(os.path.join(self.plot_dir, "overlap_mismatches.pdf")) 340 | overlap_merge_plot(self, pdf) 341 | pdf.close() 342 | -------------------------------------------------------------------------------- /enrich2/random_effects.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def rml_estimator(y, sigma2i, iterations=50): 5 | """Implementation of the robust maximum likelihood estimator. 6 | 7 | :: 8 | 9 | @book{demidenko2013mixed, 10 | title={Mixed models: theory and applications with R}, 11 | author={Demidenko, Eugene}, 12 | year={2013}, 13 | publisher={John Wiley & Sons} 14 | } 15 | 16 | """ 17 | w = 1 / sigma2i 18 | sw = np.sum(w, axis=0) 19 | beta0 = np.sum(y * w, axis=0) / sw 20 | sigma2ML = np.sum((y - np.mean(y, axis=0)) ** 2 / (len(beta0) - 1), axis=0) 21 | eps = np.zeros(beta0.shape) 22 | betaML = None 23 | for _ in range(iterations): 24 | w = 1 / (sigma2i + sigma2ML) 25 | sw = np.sum(w, axis=0) 26 | sw2 = np.sum(w ** 2, axis=0) 27 | betaML = np.sum(y * w, axis=0) / sw 28 | sigma2ML_new = ( 29 | sigma2ML 30 | * np.sum(((y - betaML) ** 2) * (w ** 2), axis=0) 31 | / (sw - (sw2 / sw)) 32 | ) 33 | eps = np.abs(sigma2ML - sigma2ML_new) 34 | sigma2ML = sigma2ML_new 35 | var_betaML = 1 / np.sum(1 / (sigma2i + sigma2ML), axis=0) 36 | return betaML, var_betaML, eps 37 | -------------------------------------------------------------------------------- /enrich2/wildtype.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from .constants import CODON_TABLE 4 | 5 | 6 | class WildTypeSequence(object): 7 | """ 8 | Container class for wild type sequence information. 
Used by :py:class:`~seqlib.seqlib.VariantSeqLib` objects and 9 | :py:class:`~enrich2.selection.Selection` or :py:class:`~enrich2.experiment.Experiment` objects that contain 10 | variant information. 11 | 12 | Requires a *parent_name* that associates this object with a StoreManager object for the 13 | purposes of error reporting and logging. 14 | """ 15 | 16 | def __init__(self, parent_name): 17 | self.parent_name = parent_name 18 | self.dna_seq = None 19 | self.protein_seq = None 20 | self.dna_offset = None 21 | self.protein_offset = None 22 | self.logger = logging.getLogger("{}.{}".format(__name__, self.__class__)) 23 | 24 | def __eq__(self, other): 25 | # note we don't need to check protein_offset, since it depends on dna_offset and protein_seq 26 | return ( 27 | self.dna_seq == other.dna_seq 28 | and self.protein_seq == other.protein_seq 29 | and self.dna_offset == other.dna_offset 30 | ) 31 | 32 | def __ne__(self, other): 33 | return not self == other 34 | 35 | def configure(self, cfg): 36 | try: 37 | # remove whitespace from WT DNA sequence and capitalize 38 | self.dna_seq = "".join(cfg["sequence"].split()).upper() 39 | 40 | # check that only valid characters are included (ACGT) 41 | if not re.match("^[ACGT]+$", self.dna_seq): 42 | raise ValueError( 43 | "WT DNA sequence contains unexpected " 44 | "characters [{}]".format(self.parent_name) 45 | ) 46 | 47 | # set the reference offset 48 | if "reference offset" in cfg: 49 | try: 50 | self.dna_offset = int(cfg["reference offset"]) 51 | except ValueError: 52 | raise ValueError( 53 | "Invalid reference offset value [{}]".format(self.parent_name) 54 | ) 55 | else: 56 | self.dna_offset = 0 57 | 58 | # handle coding sequences 59 | if cfg["coding"]: 60 | # require coding sequences are in-frame 61 | if len(self.dna_seq) % 3 != 0: 62 | raise ValueError( 63 | "WT DNA sequence contains incomplete codons [{}]".format( 64 | self.parent_name 65 | ) 66 | ) 67 | 68 | # perform translation 69 | self.protein_seq = "" 70 | for i in range(0, len(self.dna_seq), 3): 71 | self.protein_seq += CODON_TABLE[self.dna_seq[i: i + 3]] 72 | 73 | # set the reference offset if it's a multiple of three 74 | if self.dna_offset % 3 == 0: 75 | self.protein_offset = int(self.dna_offset / 3) 76 | else: 77 | self.logger.warning( 78 | "Ignoring reference offset for protein changes (not a multiple of three)" 79 | ) 80 | self.protein_offset = 0 81 | else: 82 | self.protein_seq = None 83 | self.protein_offset = None 84 | 85 | except KeyError as key: 86 | raise KeyError( 87 | "Missing required config value {key} [{name}]".format( 88 | key=key, name=self.parent_name 89 | ) 90 | ) 91 | 92 | def serialize(self): 93 | """ 94 | Format this object as a config object suitable for dumping to a config file. 95 | """ 96 | cfg = { 97 | "sequence": self.dna_seq, 98 | "coding": self.is_coding(), 99 | "reference offset": self.dna_offset, 100 | } 101 | return cfg 102 | 103 | def is_coding(self): 104 | return self.protein_seq is not None 105 | 106 | def duplicate(self, new_parent_name): 107 | """ 108 | Create a copy of this object with the *new_parent_name*. 109 | 110 | Uses the configure and serialize methods to perform the copy. 
111 | """ 112 | new = WildTypeSequence(new_parent_name) 113 | new.configure(self.serialize()) 114 | 115 | if new != self: 116 | raise ValueError( 117 | "Failed to duplicate wild type sequence [{}]".format(self.parent_name) 118 | ) 119 | else: 120 | return new 121 | 122 | def position_tuples(self, protein=False): 123 | """ 124 | Return a list of tuples containing the position number (after offset adjustment) and 125 | single-letter symbol (nucleotide or amino acid) for each position the wild type sequence. 126 | """ 127 | if protein: 128 | if not self.is_coding(): 129 | raise AttributeError( 130 | "Cannot return wild type protein position tuples for non-coding wild type [{}]".format( 131 | self.parent_name 132 | ) 133 | ) 134 | else: 135 | seq = self.protein_seq 136 | offset = self.protein_offset 137 | else: 138 | seq = self.dna_seq 139 | offset = self.dna_offset 140 | 141 | return [(i + offset + 1, seq[i]) for i in range(len(seq))] 142 | -------------------------------------------------------------------------------- /enrich2/ztest.py: -------------------------------------------------------------------------------- 1 | """z-score helper functions. 2 | 3 | This module contains functions for calculating z-scores and corresponding 4 | p-values for pairs of scores. 5 | """ 6 | 7 | import numpy as np 8 | import scipy.stats as stats 9 | 10 | 11 | def ztest_pair(df1, df2): 12 | """z-test for elements in two data frames. 13 | 14 | Takes two data frames with ``'score'`` and ``'SE'`` columns and returns 15 | a new data frame containing the scores and standard errors and result for 16 | pairwise comparison of elements in both input data frames. 17 | 18 | Parameters 19 | ---------- 20 | df1 : pandas.DataFrame 21 | The first data frame. Must contain ``'score'`` and ``'SE'`` columns. 22 | 23 | df2 : pandas.DataFrame 24 | The second data frame. Must contain ``'score'`` and ``'SE'`` columns. 25 | 26 | Returns 27 | ------- 28 | pandas.DataFrame 29 | Result data frame containing scores (``'score1'`` and ``'score2'``), 30 | standard errors (``'SE1'``, ``'SE2'``), z-scores (``'z'``), and 31 | p-values (``'pvalue_raw'``) for each element found in both data frames. 32 | 33 | Raises 34 | ------ 35 | To be added. 36 | """ 37 | shared = df1.loc[:, ("score", "SE")].merge( 38 | df2.loc[:, ("score", "SE")], 39 | how="inner", 40 | left_index=True, 41 | right_index=True, 42 | suffixes=("1", "2"), 43 | ) 44 | shared["z"] = np.abs(shared["score1"] - shared["score2"]) / np.sqrt( 45 | shared["SE1"] ** 2 + shared["SE2"] ** 2 46 | ) 47 | shared["pvalue_raw"] = 2 * stats.norm.sf(shared["z"]) 48 | return shared 49 | 50 | 51 | def ztest_single(df, score, se): 52 | """z-test for comparing elements in a data frame to a single score and SE. 53 | 54 | Takes a data frames with ``'score'`` and ``'SE'`` columns and a score and 55 | standard error to compare them to and returns a new data frame containing 56 | the scores, standard errors, and result for the pairwise comparisons. 57 | 58 | Parameters 59 | ---------- 60 | df : pandas.DataFrame 61 | The data frame. Must contain ``'score'`` and ``'SE'`` columns. 62 | 63 | score : float 64 | Score used for comparison to elements in the data frame. 65 | 66 | se : float 67 | Standard error used for comparison to elements in the data frame. 68 | 69 | Returns 70 | ------- 71 | pandas.DataFrame 72 | Result data frame containing score (``'score'``), standard error 73 | (``'SE'``), z-score (``'z'``), and p-value (``'pvalue_raw'``) for each 74 | element in the input data frame. 
75 | 76 | Raises 77 | ------ 78 | To be added. 79 | """ 80 | result = df.loc[:, ("score", "SE")] 81 | result["z"] = np.abs(result["score"] - score) / np.sqrt(result["SE"] ** 2 + se ** 2) 82 | result["pvalue_raw"] = 2 * stats.norm.sf(result["z"]) 83 | return result 84 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | # Variable names that will always be accepted 2 | good-names=df,se,id,_,i,j,k,a,x,y,z 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "Enrich2" 7 | dynamic = ["version"] 8 | description = "Analysis program for calculating variant scores from deep mutational scanning data." 9 | readme = "README.md" 10 | license = "BSD-3-Clause" 11 | authors = [ 12 | { name = "Alan F Rubin", email = "alan.rubin@wehi.edu.au" }, 13 | ] 14 | dependencies = [ 15 | "fqfa", 16 | "matplotlib", 17 | "numpy", 18 | "pandas", 19 | "scipy", 20 | "statsmodels", 21 | "tables", 22 | ] 23 | 24 | [project.scripts] 25 | enrich_cmd = "enrich2.main:main_cmd" 26 | 27 | [project.gui-scripts] 28 | enrich_gui = "enrich2.main:main_gui" 29 | 30 | [project.urls] 31 | Homepage = "https://github.com/FowlerLab/Enrich2/" 32 | 33 | [tool.hatch.version] 34 | path = "enrich2/__init__.py" 35 | 36 | [tool.hatch.build.targets.sdist] 37 | include = [ 38 | "/enrich2", 39 | "/docs", 40 | ] 41 | --------------------------------------------------------------------------------
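A few illustrative usage sketches for the modules above follow; they are not files in the repository, and any file names, directory names, method names, or numeric values they introduce are hypothetical. First, the command-line entry point in /enrich2/main.py (installed as the enrich_cmd script per pyproject.toml) expects a JSON config followed by a scoring method and a log-ratio method drawn from SCORING_METHODS and LOGR_METHODS in enrich2/storemanager.py; those dictionaries are not shown in this snapshot, so the method names below are left as placeholders. Running enrich_cmd --help lists the actual choices.

enrich_cmd my_experiment.json <scoring_method> <logr_method> --no-plots --output-dir results --log results/enrich2.log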
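OverlapSeqLib.configure() and configure_fastq() in /enrich2/overlap.py read the "fastq" and "overlap" sections of the config dictionary. Below is a sketch of just those two sections, written as the Python dict that json.load() would produce; the read file names and coordinates are made up, and the remaining keys required by VariantSeqLib.configure() (name, wild-type sequence, and so on) are defined outside this snapshot and therefore omitted.

overlap_cfg_sections = {
    "fastq": {
        "forward reads": "sample1_R1.fastq.gz",  # hypothetical file name
        "reverse reads": "sample1_R2.fastq.gz",  # hypothetical file name
        "filters": {"chastity": True},  # "merge failure" is set internally, not by the user
    },
    "overlap": {
        "forward start": 1,  # illustrative values; all five keys are required by configure()
        "reverse start": 1,
        "length": 9,
        "max mismatches": 1,
        "trim": True,  # keep only the overlapping region after merging
    },
}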
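WildTypeSequence.configure() in /enrich2/wildtype.py accepts the same keys that its serialize() emits. A small worked example of the offset handling, assuming CODON_TABLE in enrich2/constants.py implements the standard genetic code (ATG to M, GTT to V):

from enrich2.wildtype import WildTypeSequence

wt = WildTypeSequence("example parent")  # parent name is only used in log and error messages
wt.configure({"sequence": "atg gtt", "coding": True, "reference offset": 3})

print(wt.dna_seq)         # "ATGGTT": whitespace is stripped and the sequence upper-cased before validation
print(wt.protein_seq)     # "MV"
print(wt.protein_offset)  # 1, because the DNA offset of 3 is a multiple of three
print(wt.position_tuples())              # [(4, 'A'), (5, 'T'), (6, 'G'), (7, 'G'), (8, 'T'), (9, 'T')]
print(wt.position_tuples(protein=True))  # [(2, 'M'), (3, 'V')]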
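rml_estimator() in /enrich2/random_effects.py reduces every quantity over axis 0, so the intended input shape appears to be (number of replicates, number of elements); that shape is inferred from the code rather than stated anywhere in this snapshot. A minimal sketch with made-up replicate scores and variances:

import numpy as np
from enrich2.random_effects import rml_estimator

# three replicate scores for four variants, with matching squared standard errors
y = np.array([
    [0.10, -0.50, 1.20, 0.00],
    [0.05, -0.40, 1.10, 0.10],
    [0.20, -0.60, 1.30, -0.05],
])
sigma2i = np.full_like(y, 0.04)

betaML, var_betaML, eps = rml_estimator(y, sigma2i)
print(betaML)       # one combined score per variant, shape (4,)
print(var_betaML)   # variance of each combined score, shape (4,)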
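Finally, the z-test helpers in /enrich2/ztest.py: ztest_pair() computes z = |score1 - score2| / sqrt(SE1^2 + SE2^2) and a two-sided normal p-value for every index shared by both data frames, and ztest_single() does the same against a fixed score and standard error. A short sketch using invented variant labels:

import pandas as pd
from enrich2.ztest import ztest_pair, ztest_single

index = ["p.Val5Ala", "p.Gly6Asp", "_wt"]
rep1 = pd.DataFrame({"score": [0.8, -1.2, 0.1], "SE": [0.2, 0.3, 0.1]}, index=index)
rep2 = pd.DataFrame({"score": [0.7, -0.4, 0.0], "SE": [0.2, 0.3, 0.1]}, index=index)

paired = ztest_pair(rep1, rep2)        # columns: score1, SE1, score2, SE2, z, pvalue_raw
single = ztest_single(rep1, 0.0, 0.1)  # compare each element to score 0.0 with SE 0.1
print(paired.loc["p.Gly6Asp", "pvalue_raw"])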